Implement microcode auxiliaries for AMD x86-64 compiled code.
author Taylor R Campbell <campbell@mumble.net>
Fri, 30 Oct 2009 22:14:32 +0000 (18:14 -0400)
committer Taylor R Campbell <campbell@mumble.net>
Fri, 30 Oct 2009 22:14:32 +0000 (18:14 -0400)
src/microcode/cmpauxmd/x86-64.m4 [new file with mode: 0644]
src/microcode/cmpintmd/x86-64-config.h [new file with mode: 0644]
src/microcode/cmpintmd/x86-64.c [new file with mode: 0644]
src/microcode/cmpintmd/x86-64.h [new file with mode: 0644]
src/microcode/confshared.h
src/microcode/utabmd.c

diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
new file mode 100644 (file)
index 0000000..fbf7417
--- /dev/null
@@ -0,0 +1,1114 @@
+### -*-Midas-*-
+###
+### Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
+###     1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+###     2004, 2005, 2006, 2007, 2008, 2009 Massachusetts Institute of
+###     Technology
+###
+### This file is part of MIT/GNU Scheme.
+###
+### MIT/GNU Scheme is free software; you can redistribute it and/or
+### modify it under the terms of the GNU General Public License as
+### published by the Free Software Foundation; either version 2 of the
+### License, or (at your option) any later version.
+###
+### MIT/GNU Scheme is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with MIT/GNU Scheme; if not, write to the Free Software
+### Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+### 02110-1301, USA.
+\f
+### AMD x86-64 assembly language part of the compiled code interface.
+### See cmpint.txt, cmpint.c, cmpint-mc68k.h, and cmpgc.h for more
+### documentation.
+###
+### This m4 source expands into either Unix (gas) source or PC
+### (masm/wasm) source.
+###
+### NOTE:
+###    Assumptions:
+###
+###    0) Segment registers and paging are set up for 64-bit "flat"
+###    operation.
+###
+###    1) All registers and stack locations hold a C long object.
+###
+###    2) The C compiler divides registers into three groups:
+###    - Linkage registers, used for procedure calls and global
+###    references.  On AMD64 Unix ABI: %rbp, %rsp.
+###    - super temporaries, not preserved across procedure calls and
+###    always usable. On AMD64 Unix ABI: everything but what is
+###    listed below.
+###    - preserved registers saved by the callee if they are written.
+###    On AMD64 Unix ABI: %rbp, %rbx, %r12-%r15, MXCSR, x87 control
+###    word.
+###
+###    3) Arguments, if passed on a stack, are popped by the caller
+###    or by the procedure return instruction (as on the VAX).  Thus
+###    most "leaf" procedures need not worry about them.  On x86-64,
+###    arguments beyond the sixth are passed on the stack; the first
+###    through sixth are passed in %rdi, %rsi, %rdx, %rcx, %r8, and
+###    %r9, respectively.  (Non-integer arguments are passed in other
+###    ways.)
+###
+###    4) There is a hardware or software maintained stack for
+###    control.  The procedure calling sequence may leave return
+###    addresses in registers, but they must be saved somewhere for
+###    nested calls and recursive procedures.  On x86-64: saved on
+###    the stack by the CALL instruction.
+###
+###    5) C procedures return long values in a super temporary
+###    register (%rax in the AMD64 Unix ABI).  Two-word structures
+###    are returned through a caller-supplied buffer: the caller
+###    passes the buffer's address in %rdi and gets it back in %rax.
+###
+###    6) Floating point registers are not preserved by this
+###    interface.  The interface is only called from the Scheme
+###    interpreter, which does not use floating point data.  Thus
+###    although the calling convention would require us to preserve
+###    them, they contain garbage.
+###
+### Compiled Scheme code uses the following register convention:
+###    - %rsp contains the Scheme stack pointer, not the C stack
+###    pointer.
+###    - %rsi contains a pointer to the Scheme interpreter's "register"
+###    block.  This block contains the compiler's copy of MemTop,
+###    the interpreter's registers (val, env, exp, etc.),
+###    temporary locations for compiled code, and the addresses
+###    of various hooks defined in this file.
+###    - %rdi contains the Scheme free pointer.
+###    - %rbp contains the Scheme datum mask.
+###    The dynamic link (when needed) is in Registers[REGBLOCK_COMPILER_TEMP].
+###    Values are returned in Registers[REGBLOCK_VAL].
+###    [TRC 20091025: Later, we ought to use machine registers for
+###    these.]
+###
+###    All other registers are available to the compiler.  A
+###    caller-saves convention is used, so the registers need not be
+###    preserved by subprocedures.
+\f
+### The following m4 macros can be defined to change how this file is
+### expanded.
+###
+### DASM
+###    If defined, expand to Intel assembly-language syntax, used by
+###    Microsoft assembler (MASM) and Watcom assembler (WASM).
+###    Otherwise, expand to AT&T syntax, used by GAS.  [TRC 20091025:
+###    The Intel syntax probably won't work here.]
+###
+### WIN32
+###    If defined, expand to run under Win32; implies DASM.
+###
+### SUPPRESS_LEADING_UNDERSCORE
+###    If defined, external symbol names are generated as written;
+###    otherwise, they have an underscore prepended to them.
+### WCC386
+###    Should be defined when using Watcom assembler.
+### WCC386R
+###    Should be defined when using Watcom assembler and generating
+###    code to use the Watcom register-based argument conventions.
+### TYPE_CODE_LENGTH
+###    Normally defined to be 6.  Don't change this unless you know
+###    what you're doing.
+### DISABLE_387
+###    If defined, do not generate 387 floating-point instructions.
+### VALGRIND_MODE
+###    If defined, modify code to make it work with valgrind.
+\f
+####   Utility macros and definitions
+
+ifdef(`WIN32',
+      `define(IF_WIN32,`$1')',
+      `define(IF_WIN32,`')')
+
+ifdef(`DISABLE_387',
+      `define(IF387,`')',
+      `define(IF387,`$1')')
+
+ifdef(`DISABLE_387',
+      `define(IFN387,`$1')',
+      `define(IFN387,`')')
+
+IF_WIN32(`define(DASM,1)')
+ifdef(`WCC386R',`define(WCC386,1)')
+
+ifdef(`DASM',
+      `define(IFDASM,`$1')',
+      `define(IFDASM,`')')
+
+ifdef(`DASM',
+      `define(IFNDASM,`')',
+      `define(IFNDASM,`$1')')
+
+ifdef(`DASM',
+      `define(use_external_data,`      extrn $1':dword)',
+      `define(use_external_data,`')')
+
+ifdef(`DASM',
+       `define(use_external_code,`     extrn $1':near)',
+       `define(use_external_code,`')')
+
+ifdef(`DASM',
+      `define(export_label,`   public $1')',
+      `define(export_label,`   .globl $1')')
+
+IFNDASM(`      .file   "cmpaux-x86-64.s"')
+
+# GAS doesn't implement these, for no obvious reason.
+IFNDASM(`define(pushad,`pusha')')
+IFNDASM(`define(popad,`popa')')
+IFNDASM(`define(pushfd,`pushf')')
+IFNDASM(`define(popfd,`popf')')
+
+ifdef(`SUPPRESS_LEADING_UNDERSCORE',
+       `define(EVR,`$1')',
+       `define(EVR,`_$1')')
+
+# When using the Watcom C compiler with register-based calling
+# conventions, source-code function names normally expand to `FOO_',
+# but functions that are compiled with prefix keywords such as
+# `__cdecl' or `__syscall' expand differently.  References to the
+# former type of name are marked with `EFR', while references to the
+# latter are marked with `EPFR'.
+
+ifdef(`SUPPRESS_LEADING_UNDERSCORE',
+      `define(EPFR,`$1')',
+      `define(EPFR,`_$1')')
+
+ifdef(`WCC386R',
+      `define(EFR,`$1_')',
+      `define(EFR,`EPFR($1)')')
+
+define(hook_reference,`EFR(asm_$1)')
+
+define(define_data,`export_label(EVR($1))')
+
+define(define_code_label,`
+export_label($1)
+$1:')
+
+define(define_c_label,`define_code_label(EPFR($1))')
+define(define_debugging_label,`define_code_label($1)')
+define(define_hook_label,`define_code_label(hook_reference($1))')
+
+ifdef(`DASM',
+      `define(DECLARE_DATA_SEGMENT,`   .data')',
+      `define(DECLARE_DATA_SEGMENT,`   .data')')
+
+ifdef(`DASM',
+      `define(DECLARE_CODE_SEGMENT,`   .code')',
+      `define(DECLARE_CODE_SEGMENT,`   .text')')
+
+ifdef(`DASM',
+      `define(declare_alignment,`      align $1')',
+      `define(declare_alignment,`      .align $1')')
+
+ifdef(`DASM',
+      `define(allocate_word,`EVR($1) dw 0')',
+      `define(allocate_word,`  .comm EVR($1),2')')
+
+ifdef(`DASM',
+      `define(allocate_longword,`EVR($1) dd 0')',
+      `define(allocate_longword,`      .comm EVR($1),4')')
+
+ifdef(`DASM',
+      `define(allocate_quadword,`EVR($1) dq 0')',
+      `define(allocate_quadword,`      .comm EVR($1),8')')
+
+ifdef(`DASM',
+      `define(allocate_space,`EVR($1) db $2 dup (0)')',
+      `define(allocate_space,`EVR($1):
+       .space $2')')
+\f
+ifdef(`DASM',
+      `define(HEX, `0$1H')',
+      `define(HEX, `0x$1')')
+
+ifdef(`DASM',
+      `define(OP,`$1$3')',
+      `define(OP,`$1$2')')
+
+ifdef(`DASM',
+      `define(TW,`$2,$1')',
+      `define(TW,`$1,$2')')
+
+ifdef(`DASM',
+      `define(ABS, `dword ptr $1')',
+      `define(ABS, `$1(%rip)')')
+
+ifdef(`DASM',
+      `define(IMM, `$1')',
+      `define(IMM, `$$1')')
+
+ifdef(`DASM',
+      `define(REG,`$1')',
+      `define(REG,`%$1')')
+
+ifdef(`DASM',
+      `define(ST,`st($1)')',
+      `define(ST,`%st ($1)')')
+
+ifdef(`DASM',
+      `define(IND,`dword ptr [$1]')',
+      `define(IND,`($1)')')
+
+ifdef(`DASM',
+      `define(BOF,`byte ptr $1[$2]')',
+      `define(BOF,`$1($2)')')
+
+ifdef(`DASM',
+      `define(WOF,`word ptr $1[$2]')',
+      `define(WOF,`$1($2)')')
+
+ifdef(`DASM',
+      `define(LOF,`dword ptr $1[$2]')',
+      `define(LOF,`$1($2)')')
+
+ifdef(`DASM',
+      `define(DOF,`qword ptr $1[$2]')',
+      `define(DOF,`$1($2)')')
+
+ifdef(`DASM',
+      `define(IDX,`dword ptr [$1] [$2]')',
+      `define(IDX,`($1,$2)')')
+
+ifdef(`DASM',
+      `define(SDX,`dword ptr $1[$2*$3]')',
+      `define(SDX,`$1(,$2,$3)')')
+
+ifdef(`DASM',
+      `define(IJMP,`$1')',
+      `define(IJMP,`*$1')')
+\f
+define(TC_LENGTH, ifdef(`TYPE_CODE_LENGTH', TYPE_CODE_LENGTH, 6))
+define(DATUM_LENGTH, eval(64 - TC_LENGTH))
+define(DATUM_SHIFT, eval(1 << DATUM_LENGTH))
+# This doesn't work because m4's `eval' overflows on 64-bit values.
+#define(ADDRESS_MASK, eval(DATUM_SHIFT - 1))
+define(ADDRESS_MASK, HEX(3ffffffffffffff))
+# TAG doesn't work for the same reason.
+#define(TAG, ($2 + ($1 * DATUM_SHIFT)))
+
+define(TC_FALSE,0)
+define(TC_FLONUM,6)
+define(TC_TRUE,8)
+define(TC_FIXNUM,26)
+define(TC_MANIFEST_NM_VECTOR,39)
+define(TC_COMPILED_ENTRY,40)
+
+# TAG doesn't work due to the same m4 limitation, so define these
+# magic constants here.  They are computed in terms of the parameters
+# above.
+
+define(IMM_MANIFEST_NM_VECTOR_1, `IMM(HEX(9c00000000000001))')
+define(IMM_TRUE, `IMM(HEX(2000000000000000))')
+define(IMM_FALSE, `IMM(HEX(0000000000000000))')
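+
+# As a sanity check, these constants can be rederived by hand from
+# TC_LENGTH = 6 and DATUM_LENGTH = 58:
+#   [TC_MANIFEST_NM_VECTOR | 1] = (39 << 58) | 1 = 0x9c00000000000001
+#   [TC_TRUE  | 0]              =  (8 << 58)     = 0x2000000000000000
+#   [TC_FALSE | 0]              =  (0 << 58)     = 0x0000000000000000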
+
+define(REGBLOCK_VAL,16)
+define(REGBLOCK_COMPILER_TEMP,32)
+define(REGBLOCK_LEXPR_ACTUALS,56)
+define(REGBLOCK_PRIMITIVE,64)
+define(REGBLOCK_CLOSURE_FREE,72)
+
+define(REGBLOCK_DLINK,REGBLOCK_COMPILER_TEMP)
+define(REGBLOCK_UTILITY_ARG4,REGBLOCK_CLOSURE_FREE)
+
+define(COMPILER_REGBLOCK_N_FIXED,16)
+define(COMPILER_REGBLOCK_N_HOOKS,80)
+define(COMPILER_REGBLOCK_N_TEMPS,256)
+define(COMPILER_FIXED_SIZE,1)
+define(COMPILER_HOOK_SIZE,1)
+define(COMPILER_TEMP_SIZE,2)
+define(REGBLOCK_SIZE_IN_OBJECTS,
+       eval((COMPILER_REGBLOCK_N_FIXED*COMPILER_FIXED_SIZE)
+           +(COMPILER_REGBLOCK_N_HOOKS*COMPILER_HOOK_SIZE)
+           +(COMPILER_REGBLOCK_N_TEMPS*COMPILER_TEMP_SIZE)))
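+
+# With the default parameters this evaluates to (16*1) + (80*1) +
+# (256*2) = 608 objects, i.e. 4864 bytes at eight bytes per object.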
+
+# Define the floating-point processor control word.  Always set
+# round-to-even and double precision.  Under Win32, mask all
+# exceptions.  Under unix and OS/2, mask only the inexact result
+# exception.
+ifdef(`WIN32',
+      `define(FP_CONTROL_WORD,HEX(023f))',
+      `define(FP_CONTROL_WORD,HEX(0220))')
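+
+# (For reference: bits 0-5 of the x87 control word are the exception
+# masks, so 0x3f masks all six exceptions while 0x20 masks only the
+# precision/inexact exception; bits 8-9 = 10b select double precision;
+# bits 10-11 = 00b select round-to-nearest/even.)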
+
+define(regs,REG(rsi))
+define(rfree,REG(rdi))
+define(rmask,REG(rbp))
+
+IFDASM(`.586p
+.model flat')
+
+DECLARE_DATA_SEGMENT()
+declare_alignment(2)
+
+use_external_data(EVR(Free))
+use_external_data(EVR(stack_pointer))
+use_external_data(EVR(utility_table))
+
+ifdef(`WIN32',`
+use_external_data(EVR(RegistersPtr))
+',`
+define_data(Regstart)
+allocate_space(Regstart,256)
+
+define_data(Registers)
+allocate_space(Registers,eval(REGBLOCK_SIZE_IN_OBJECTS*8))
+')
+
+define_data(i387_presence)
+allocate_quadword(i387_presence)
+
+define_data(C_Stack_Pointer)
+allocate_quadword(C_Stack_Pointer)
+
+define_data(C_Frame_Pointer)
+allocate_quadword(C_Frame_Pointer)
+
+# [TRC 20091025: CPUID is always supported.]
+# define_data(x86_64_cpuid_supported)
+# allocate_quadword(x86_64_cpuid_supported)
+
+# [TRC 20091025: The cache synchronization bug does not occur in any
+# x86-64 machines of which I am aware.]
+# define_data(x86_64_cpuid_needed)
+# allocate_quadword(x86_64_cpuid_needed)
+\f
+DECLARE_CODE_SEGMENT()
+declare_alignment(2)
+
+# [TRC 20091025: We need to check for MMX/SSEn instructions too.]
+
+define_c_label(x86_64_interface_initialize)
+       OP(push,q)      REG(rbp)
+       OP(mov,q)       TW(REG(rsp),REG(rbp))
+       OP(xor,q)       TW(REG(rax),REG(rax))           # No 387 available
+
+# [TRC 20091025: The AMD64 reference manual suggests using the CPUID
+# instruction to detect instruction subsets instead.]
+
+# Unfortunately, the `movl cr0,ecx' instruction is privileged.
+# Use the deprecated `smsw cx' instruction instead.
+
+IF387(`
+#      OP(mov,q)       TW(REG(cr0),REG(rcx))           # Test for 387 presence
+ifdef(`VALGRIND_MODE',`',`
+       smsw            REG(cx)
+       OP(mov,q)       TW(IMM(HEX(12)),REG(rdx))
+       OP(and,q)       TW(REG(rdx),REG(rcx))
+       OP(cmp,q)       TW(REG(rdx),REG(rcx))
+       jne     x86_64_initialize_no_fp
+')
+       OP(inc,q)       REG(rax)                        # 387 available
+       OP(sub,q)       TW(IMM(8),REG(rsp))
+       fclex
+       fnstcw          WOF(-2,REG(rbp))
+       OP(and,w)       TW(IMM(HEX(f0e0)),WOF(-2,REG(rbp)))
+       OP(or,w)        TW(IMM(FP_CONTROL_WORD),WOF(-2,REG(rbp)))
+       fldcw           WOF(-2,REG(rbp))
+x86_64_initialize_no_fp:
+')
+       OP(mov,q)       TW(REG(rax),ABS(EVR(i387_presence)))
+
+# [TRC 20091025: CPUID is always supported.]
+
+# Do a bunch of hair to determine if we need to do cache synchronization.
+# See if the CPUID instruction is supported.
+
+#      OP(xor,q)       TW(REG(rax),REG(rax))
+#      OP(mov,q)       TW(REG(rax),ABS(EVR(x86_64_cpuid_supported)))
+#      OP(mov,q)       TW(REG(rax),ABS(EVR(x86_64_cpuid_needed)))
+
+# First test: can we toggle the AC bit?
+
+#      pushfd
+#      OP(pop,l)       REG(eax)
+#      OP(mov,l)       TW(REG(eax),REG(ecx))
+#      OP(xor,l)       TW(IMM(HEX(00040000)),REG(eax))
+#      OP(push,l)      REG(eax)
+#      popfd
+#      pushfd
+#      OP(pop,l)       REG(eax)
+
+# if AC bit can't be toggled, this is a 386 (and doesn't support CPUID).
+
+#      OP(xor,l)       TW(REG(ecx),REG(eax))
+#      jz              no_cpuid_instr
+#      OP(push,l)      REG(ecx)                        # restore EFLAGS
+#      popfd
+
+# Now test to see if the ID bit can be toggled.
+
+#      OP(mov,l)       TW(REG(ecx),REG(eax))
+#      OP(xor,l)       TW(IMM(HEX(00200000)),REG(eax))
+#      OP(push,l)      REG(eax)
+#      popfd
+#      pushfd
+#      OP(pop,l)       REG(eax)
+
+# if ID bit can't be toggled, this is a 486 that doesn't support CPUID.
+
+#      OP(xor,l)       TW(REG(ecx),REG(eax))
+#      jz              no_cpuid_instr
+#      OP(push,l)      REG(ecx)                        # restore EFLAGS
+#      popfd
+
+# Now we know that cpuid is supported.
+
+#      OP(mov,q)       TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_supported)))
+
+# Next, use the CPUID instruction to determine the processor type.
+
+#      OP(push,l)      REG(ebx)
+#      OP(xor,l)       TW(REG(eax),REG(eax))
+#      cpuid
+
+# Check that CPUID accepts argument 1.
+
+#      OP(cmp,l)       TW(IMM(HEX(00000001)),REG(eax))
+#      jl              done_setting_up_cpuid
+
+# Detect "GenuineIntel".
+
+#      OP(cmp,l)       TW(IMM(HEX(756e6547)),REG(ebx))
+#      jne             not_intel_cpu
+#      OP(cmp,l)       TW(IMM(HEX(49656e69)),REG(edx))
+#      jne             not_intel_cpu
+#      OP(cmp,l)       TW(IMM(HEX(6c65746e)),REG(ecx))
+#      jne             not_intel_cpu
+
+# For CPU families 4 (486), 5 (Pentium), or 6 (Pentium Pro, Pentium
+# II, Pentium III), don't use CPUID synchronization.
+
+#      OP(mov,l)       TW(IMM(HEX(01)),REG(eax))
+#      cpuid
+#      OP(shr,l)       TW(IMM(HEX(08)),REG(eax))
+#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
+#      OP(cmp,l)       TW(IMM(HEX(4)),REG(eax))
+#      jl              done_setting_up_cpuid
+#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))
+#      jg              done_setting_up_cpuid
+#
+#      jmp             cpuid_not_needed
+#
+#not_intel_cpu:
+
+# Detect "AuthenticAMD".
+
+#      OP(cmp,l)       TW(IMM(HEX(68747541)),REG(ebx))
+#      jne             not_amd_cpu
+#      OP(cmp,l)       TW(IMM(HEX(69746e65)),REG(edx))
+#      jne             not_amd_cpu
+#      OP(cmp,l)       TW(IMM(HEX(444d4163)),REG(ecx))
+#      jne             not_amd_cpu
+
+# Problem appears to exist only on Athlon models 1, 3, and 4.
+
+#      OP(mov,l)       TW(IMM(HEX(01)),REG(eax))
+#      cpuid
+
+#      OP(mov,l)       TW(REG(eax),REG(ecx))
+#      OP(shr,l)       TW(IMM(HEX(08)),REG(eax))
+#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
+#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))        # family 6 = Athlon
+#      jne             done_setting_up_cpuid
+
+#      OP(mov,l)       TW(REG(ecx),REG(eax))
+#      OP(shr,l)       TW(IMM(HEX(04)),REG(eax))
+#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
+#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))        # model 6 and up OK
+#      jge             done_setting_up_cpuid
+#      OP(cmp,l)       TW(IMM(HEX(2)),REG(eax))        # model 2 OK
+#      je              done_setting_up_cpuid
+
+#      OP(mov,l)       TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_needed)))
+
+#not_amd_cpu:
+#done_setting_up_cpuid:
+#      OP(pop,l)       REG(ebx)
+#no_cpuid_instr:
+       leave
+       ret
+
+define_c_label(C_to_interface)
+       OP(push,q)      REG(rbp)                        # Link according
+       OP(mov,q)       TW(REG(rsp),REG(rbp))           #  to C's conventions
+       OP(push,q)      REG(rbx)                        # Save callee-saves
+       OP(push,q)      REG(r12)                        #  registers
+       OP(push,q)      REG(r13)
+       OP(push,q)      REG(r14)
+       OP(push,q)      REG(r15)
+       OP(mov,q)       TW(REG(rdi),REG(rdx))           # Entry point
+                                                       # Preserve frame ptr
+       OP(mov,q)       TW(REG(rbp),ABS(EVR(C_Frame_Pointer)))
+                                                       # Preserve stack ptr
+       OP(mov,q)       TW(REG(rsp),ABS(EVR(C_Stack_Pointer)))
+       jmp     EPFR(interface_to_scheme)
+
+define_hook_label(trampoline_to_interface)
+define_debugging_label(trampoline_to_interface)
+       OP(pop,q)       REG(rcx)                        # trampoline storage
+       jmp     scheme_to_interface
+
+define_hook_label(scheme_to_interface_call)
+define_debugging_label(scheme_to_interface_call)
+       OP(pop,q)       REG(rcx)                        # arg1 = ret. addr
+       OP(add,q)       TW(IMM(4),REG(rcx))             # Skip format info
+#      jmp     scheme_to_interface
+\f
+define_hook_label(scheme_to_interface)
+define_debugging_label(scheme_to_interface)
+
+# These two moves must happen _before_ the ffree instructions below.
+# Otherwise recovery from SIGFPE there will fail.
+       OP(mov,q)       TW(REG(rsp),ABS(EVR(stack_pointer)))
+       OP(mov,q)       TW(rfree,ABS(EVR(Free)))
+
+# [TRC 20091025: I think this should be excised.]
+
+IF387(`
+       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
+       je      scheme_to_interface_proceed
+       ffree   ST(0)                                   # Free floating "regs"
+       ffree   ST(1)
+       ffree   ST(2)
+       ffree   ST(3)
+       ffree   ST(4)
+       ffree   ST(5)
+       ffree   ST(6)
+       ffree   ST(7)
+scheme_to_interface_proceed:
+')
+
+       OP(mov,q)       TW(ABS(EVR(C_Stack_Pointer)),REG(rsp))
+       OP(mov,q)       TW(ABS(EVR(C_Frame_Pointer)),REG(rbp))
+
+       OP(sub,q)       TW(IMM(16),REG(rsp))    # alloc struct return
+
+       # Shuffle Scheme -> AMD64 calling conventions:
+       #   struct pointer -> rdi
+       #   rcx -> rsi
+       #   rdx -> rdx
+       #   rbx -> rcx
+       #   arg4 -> r8
+       # Parallel assignment problems:
+       #   arg4 depends on rsi: do arg4->r8 first
+       #   target depends on rcx (why?): use r11 as a temporary
+       # [TRC 20091025: Perhaps we can rearrange LIAR to generate
+       # arguments in the registers we want, to avoid this
+       # shuffling.]
+
+       OP(mov,q)       TW(REG(rcx),REG(r11))
+
+       OP(xor,q)       TW(REG(rcx),REG(rcx))
+       OP(mov,b)       TW(REG(al),REG(cl))
+       OP(mov,q)       TW(SDX(EVR(utility_table),REG(rcx),8),REG(rax))
+
+       OP(mov,q)       TW(REG(rsp),REG(rdi))
+       OP(mov,q)       TW(DOF(REGBLOCK_UTILITY_ARG4(),regs),REG(r8))
+       OP(mov,q)       TW(REG(r11),REG(rsi))
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+
+       call            IJMP(REG(rax))
+
+define_debugging_label(scheme_to_interface_return)
+       OP(pop,q)       REG(rax)                # pop struct return
+       OP(pop,q)       REG(rdx)
+       jmp             IJMP(REG(rax))          # Invoke handler
+
+define_c_label(interface_to_scheme)
+IF387(`
+       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
+       je      interface_to_scheme_proceed
+       ffree   ST(0)                                   # Free floating "regs"
+       ffree   ST(1)
+       ffree   ST(2)
+       ffree   ST(3)
+       ffree   ST(4)
+       ffree   ST(5)
+       ffree   ST(6)
+       ffree   ST(7)
+interface_to_scheme_proceed:
+')
+                                                       # Register block = %rsi
+                                                       # Scheme offset in NT
+ifdef(`WIN32',
+`      OP(mov,q)       TW(ABS(EVR(RegistersPtr)),regs)',
+`      OP(lea,q)       TW(ABS(EVR(Registers)),regs)')
+
+       OP(mov,q)       TW(ABS(EVR(Free)),rfree)        # Free pointer = %rdi
+       OP(mov,q)       TW(DOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link
+       OP(mov,q)       TW(IMM(ADDRESS_MASK),rmask)     # = %rbp
+
+       OP(mov,q)       TW(ABS(EVR(stack_pointer)),REG(rsp))
+       OP(mov,q)       TW(REG(rax),REG(rcx))           # Preserve if used
+       OP(and,q)       TW(rmask,REG(rcx))              # Restore potential dynamic link
+       OP(mov,q)       TW(REG(rcx),DOF(REGBLOCK_DLINK(),regs))
+       jmp             IJMP(REG(rdx))
+
+IF_WIN32(`
+use_external_code(EFR(WinntExceptionTransferHook))
+define_code_label(EFR(callWinntExceptionTransferHook))
+       call    EFR(WinntExceptionTransferHook)
+       mov     rdx,rax
+')
+
+define_c_label(interface_to_C)
+IF387(`
+       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
+       je      interface_to_C_proceed
+       ffree   ST(0)                                   # Free floating "regs"
+       ffree   ST(1)
+       ffree   ST(2)
+       ffree   ST(3)
+       ffree   ST(4)
+       ffree   ST(5)
+       ffree   ST(6)
+       ffree   ST(7)
+interface_to_C_proceed:')
+
+       OP(mov,q)       TW(REG(rdx),REG(rax))           # Set up result
+       OP(pop,q)       REG(r15)                        # Restore callee-saves
+       OP(pop,q)       REG(r14)                        #  registers
+       OP(pop,q)       REG(r13)
+       OP(pop,q)       REG(r12)
+       OP(pop,q)       REG(rbx)
+       leave
+       ret
+\f
+# [TRC 20091025: The cache synchronization bug does not occur in any
+# x86-64 machines of which I am aware.]
+
+#define_code_label(EFR(x86_64_cache_synchronize))
+#      OP(push,q)      REG(rbp)
+#      OP(mov,q)       TW(REG(rsp),REG(rbp))
+#      OP(push,q)      REG(rbx)
+#      OP(xor,q)       TW(REG(rax),REG(rax))
+#      cpuid
+#      OP(pop,q)       REG(rbx)
+#      leave
+#      ret
+
+### Run the CPUID instruction for serialization.
+
+#define_hook_label(serialize_cache)
+#      pushad
+#      OP(xor,q)       TW(REG(rax),REG(rax))
+#      cpuid
+#      popad
+#      ret
+
+### Stub to be used in place of above on machines that don't need it.
+
+#define_hook_label(dont_serialize_cache)
+#      ret
+\f
+###    Assembly language hooks used to reduce code size.
+###    There is no time advantage to using these over using
+###    scheme_to_interface (or scheme_to_interface_call), but the
+###    code generated by the compiler can be somewhat smaller.
+
+define(define_jump_indirection,
+`define_hook_label($1)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface')
+
+define(define_call_indirection,
+`define_hook_label($1)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface_call')
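+
+# For illustration, a rough expansion (assuming GAS syntax and
+# SUPPRESS_LEADING_UNDERSCORE) of `define_jump_indirection(error,15)'
+# is:
+#
+#      .globl asm_error
+# asm_error:
+#      movb    $0x15,%al
+#      jmp     scheme_to_interface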
+
+define_call_indirection(interrupt_procedure,1a)
+define_call_indirection(interrupt_continuation,1b)
+define_jump_indirection(interrupt_closure,18)
+define_jump_indirection(interrupt_continuation_2,3b)
+
+define_hook_label(interrupt_dlink)
+       OP(mov,q)       TW(DOF(REGBLOCK_DLINK(),regs),REG(rdx))
+       OP(mov,b)       TW(IMM(HEX(19)),REG(al))
+       jmp     scheme_to_interface_call
+
+###
+###    This saves even more instructions than primitive_apply
+###    when the PC is not available.  Instead of jumping here,
+###    a call instruction is used, and the longword offset to
+###    the primitive object follows the call instruction.
+###    This code loads the primitive object and merges with
+###    apply_primitive.
+###
+###     [TRC 20091025: But on the x86-64, we have RIP-relative
+###     addressing, so we don't need this.]
+###
+
+#declare_alignment(2)
+#define_hook_label(short_primitive_apply)
+#      OP(pop,l)       REG(edx)                        # offset pointer
+#      OP(mov,l)       TW(IND(REG(edx)),REG(ecx))      # offset
+#                                                      # Primitive object
+#      OP(mov,l)       TW(IDX(REG(edx),REG(ecx)),REG(ecx))
+#                                                      # Merge
+#      jmp     hook_reference(primitive_apply)
+
+declare_alignment(2)
+define_jump_indirection(primitive_apply,12)
+
+define_jump_indirection(primitive_lexpr_apply,13)
+define_jump_indirection(error,15)
+define_call_indirection(link,17)
+define_call_indirection(assignment_trap,1d)
+define_call_indirection(reference_trap,1f)
+define_call_indirection(safe_reference_trap,20)
+define_call_indirection(primitive_error,36)
+\f
+###    Assembly language hooks used to increase speed.
+
+# define_jump_indirection(sc_apply,14)
+# 
+# define(define_apply_fixed_size,
+# `define_hook_label(sc_apply_size_$1)
+#      OP(mov,q)       TW(IMM($1),REG(rdx))
+#      OP(mov,b)       TW(IMM(HEX(14)),REG(al))
+#      jmp     scheme_to_interface')
+
+declare_alignment(2)
+define_hook_label(sc_apply)
+       OP(mov,q)       TW(REG(rcx),REG(rax))           # Copy for type code
+       OP(mov,q)       TW(REG(rcx),REG(rbx))           # Copy for address
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))  # Select type code
+       OP(and,q)       TW(rmask,REG(rbx))              # Select datum
+       OP(cmp,b)       TW(IMM(TC_COMPILED_ENTRY),REG(al))
+       jne     asm_sc_apply_generic
+       # [TRC 20091025: How big are the frame sizes?]
+       OP(movs,bq,x)   TW(BOF(-4,REG(rbx)),REG(rax))   # Extract frame size
+       OP(cmp,q)       TW(REG(rax),REG(rdx))           # Compare to nargs+1
+       jne     asm_sc_apply_generic
+       jmp     IJMP(REG(rbx))                          # Invoke
+
+define_debugging_label(asm_sc_apply_generic)
+       OP(mov,q)       TW(IMM(HEX(14)),REG(rax))
+       jmp     scheme_to_interface     
+
+define(define_apply_fixed_size,
+`declare_alignment(2)
+define_hook_label(sc_apply_size_$1)
+       OP(mov,q)       TW(REG(rcx),REG(rax))           # Copy for type code
+       OP(mov,q)       TW(REG(rcx),REG(rbx))           # Copy for address
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))  # Select type code
+       OP(and,q)       TW(rmask,REG(rbx))              # Select datum
+       OP(cmp,b)       TW(IMM(TC_COMPILED_ENTRY),REG(al))
+       jne     asm_sc_apply_generic_$1
+       # [TRC 20091025: How big are the frame sizes?]
+       OP(cmp,b)       TW(IMM($1),BOF(-4,REG(rbx)))    # Compare frame size
+       jne     asm_sc_apply_generic_$1 # to nargs+1
+       jmp     IJMP(REG(rbx))
+
+asm_sc_apply_generic_$1:
+       OP(mov,q)       TW(IMM($1),REG(rdx))
+       OP(mov,b)       TW(IMM(HEX(14)),REG(al))
+       jmp     scheme_to_interface')
+
+define_apply_fixed_size(1)
+define_apply_fixed_size(2)
+define_apply_fixed_size(3)
+define_apply_fixed_size(4)
+define_apply_fixed_size(5)
+define_apply_fixed_size(6)
+define_apply_fixed_size(7)
+define_apply_fixed_size(8)
+\f
+###    The following code is used by generic arithmetic
+###    whether the fixnum case is open-coded in line or not.
+###    This takes care of fixnums and flonums so that the common
+###    numeric types are much faster than the rare ones
+###    (bignums, ratnums, recnums).
+
+IF387(`declare_alignment(2)
+asm_generic_flonum_result:
+       # The MOV instruction can take a 64-bit immediate operand only
+       # if the target is a register, so we store the manifest in rax
+       # before moving it to memory.
+       OP(mov,q)       TW(IMM_MANIFEST_NM_VECTOR_1,REG(rax))
+       OP(mov,q)       TW(REG(rax), IND(rfree))
+       # The OR instruction cannot take a 64-bit immediate either, so
+       # we need to store the tag in rax first, shift it up, and then
+       # OR the datum into it.
+       OP(mov,q)       TW(IMM(TC_FLONUM),REG(rax))
+       OP(shl,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(or,q)        TW(rfree,REG(rax))
+       OP(fstp,l)      DOF(8,rfree)                    # fstpd
+       OP(and,q)       TW(rmask,IND(REG(rsp)))
+       OP(add,q)       TW(IMM(16),rfree)
+       OP(mov,q)       TW(REG(rax),DOF(REGBLOCK_VAL(),regs))
+       ret
+
+declare_alignment(2)
+asm_generic_fixnum_result:
+       OP(and,q)       TW(rmask,IND(REG(rsp)))
+       OP(or,b)        TW(IMM(TC_FIXNUM),REG(al))
+       OP(ror,q)       TW(IMM(TC_LENGTH),REG(rax))
+       OP(mov,q)       TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+       ret
+
+declare_alignment(2)
+asm_generic_return_sharp_t:
+       OP(and,q)       TW(rmask,IND(REG(rsp)))
+       OP(mov,q)       TW(IMM_TRUE,REG(rax))
+       OP(mov,q)       TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+       ret
+
+declare_alignment(2)
+asm_generic_return_sharp_f:
+       OP(and,q)       TW(rmask,IND(REG(rsp)))
+       OP(mov,q)       TW(IMM_FALSE,REG(rax))
+       OP(mov,q)       TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+       ret')
+\f
+define(define_unary_operation,
+`declare_alignment(2)
+define_hook_label(generic_$1)
+       OP(pop,q)       REG(rdx)
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
+       je      asm_generic_$1_fix
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
+       jne     asm_generic_$1_fail
+       OP(and,q)       TW(rmask,REG(rdx))
+       fld1
+       OP($4,l)        DOF(8,REG(rdx))
+       jmp     asm_generic_flonum_result
+
+asm_generic_$1_fix:
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rax))
+       OP($3,q)        TW(IMM(eval(1 << TC_LENGTH)),REG(rax))
+       jno     asm_generic_fixnum_result
+
+asm_generic_$1_fail:
+       OP(push,q)      REG(rdx)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface')
+
+define(define_unary_predicate,
+`declare_alignment(2)
+define_hook_label(generic_$1)
+       OP(pop,q)       REG(rdx)
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
+       je      asm_generic_$1_fix
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
+       jne     asm_generic_$1_fail
+       OP(and,q)       TW(rmask,REG(rdx))
+       OP(fld,l)       DOF(8,REG(rdx))
+       ftst
+       fstsw   REG(ax)
+       fstp    ST(0)
+       sahf
+       $4      asm_generic_return_sharp_t
+       jmp     asm_generic_return_sharp_f
+
+asm_generic_$1_fix:
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rax))
+       OP(cmp,q)       TW(IMM(0),REG(rax))
+       $3      asm_generic_return_sharp_t
+       jmp     asm_generic_return_sharp_f
+
+asm_generic_$1_fail:
+       OP(push,q)      REG(rdx)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface')
+\f
+define(define_binary_operation,
+`declare_alignment(2)
+define_hook_label(generic_$1)
+       OP(pop,q)       REG(rdx)
+       OP(pop,q)       REG(rbx)
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rcx))
+       OP(cmp,b)       TW(REG(al),REG(cl))
+       jne     asm_generic_$1_fail
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
+       je      asm_generic_$1_fix
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
+       je      asm_generic_$1_flo
+
+asm_generic_$1_fail:
+       OP(push,q)      REG(rbx)
+       OP(push,q)      REG(rdx)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface
+
+asm_generic_$1_fix:
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rax))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
+       $5
+       OP($3,q)        TW(REG(rcx),REG(rax))           # subq
+       jo      asm_generic_$1_fail
+       jmp     asm_generic_fixnum_result
+
+asm_generic_$1_flo:
+       OP(and,q)       TW(rmask,REG(rdx))
+       OP(and,q)       TW(rmask,REG(rbx))
+       OP(fld,l)       DOF(8,REG(rdx))                 # fldd
+       OP($4,l)        DOF(8,REG(rbx))                 # fsubl
+       jmp     asm_generic_flonum_result')
+\f
+IF387(`declare_alignment(2)
+define_hook_label(generic_divide)
+       OP(pop,q)       REG(rdx)
+       OP(pop,q)       REG(rbx)
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rcx))
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
+       je      asm_generic_divide_fix
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
+       jne     asm_generic_divide_fail
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(cl))
+       je      asm_generic_divide_flo_flo
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(cl))
+       jne     asm_generic_divide_fail
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
+       je      asm_generic_divide_fail
+       OP(and,q)       TW(rmask,REG(rdx))
+       OP(sar,q)       TW(IMM(TC_LENGTH),REG(rcx))
+       OP(fld,l)       DOF(8,REG(rdx))                 # fldd
+       OP(mov,q)       TW(REG(rcx),IND(rfree))
+       OP(fidiv,l)     IND(rfree)
+       jmp     asm_generic_flonum_result
+
+asm_generic_divide_fix:
+       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(cl))
+       jne     asm_generic_divide_fail
+       OP(mov,q)       TW(REG(rdx),REG(rcx))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
+       je      asm_generic_divide_fail
+       OP(and,q)       TW(rmask,REG(rbx))
+       OP(sar,q)       TW(IMM(TC_LENGTH),REG(rcx))
+       OP(fld,l)       DOF(8,REG(rbx))                 # fldd
+       OP(mov,q)       TW(REG(rcx),IND(rfree))
+       OP(fidivr,l)    IND(rfree)
+       jmp     asm_generic_flonum_result
+
+asm_generic_divide_flo_flo:
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(and,q)       TW(rmask,REG(rcx))
+       OP(fld,l)       DOF(8,REG(rcx))                 # fldd
+       ftst
+       fstsw   REG(ax)
+       sahf
+       je      asm_generic_divide_by_zero
+       OP(and,q)       TW(rmask,REG(rdx))
+       OP(fdivr,l)     DOF(8,REG(rdx))
+       jmp     asm_generic_flonum_result       
+
+asm_generic_divide_by_zero:
+       fstp    ST(0)                                   # Pop second arg
+
+asm_generic_divide_fail:
+       OP(push,q)      REG(rbx)
+       OP(push,q)      REG(rdx)
+       OP(mov,b)       TW(IMM(HEX(23)),REG(al))
+       jmp     scheme_to_interface')
+\f
+define(define_binary_predicate,
+`declare_alignment(2)
+define_hook_label(generic_$1)
+       OP(pop,q)       REG(rdx)
+       OP(pop,q)       REG(rbx)
+       OP(mov,q)       TW(REG(rdx),REG(rax))
+       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
+       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rcx))
+       OP(cmp,b)       TW(REG(al),REG(cl))
+       jne     asm_generic_$1_fail
+       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
+       jne     asm_generic_$1_fail
+
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rdx))
+       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rbx))
+       OP(cmp,q)       TW(REG(rbx),REG(rdx))
+       $3      asm_generic_return_sharp_t      
+       jmp     asm_generic_return_sharp_f
+
+asm_generic_$1_fail:
+       OP(push,q)      REG(rbx)
+       OP(push,q)      REG(rdx)
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
+       jmp     scheme_to_interface')
+\f
+IF387(`define_unary_operation(decrement,22,sub,fsubr)
+define_unary_operation(increment,26,add,fadd)
+
+define_unary_predicate(negative,2a,jl,jb)
+define_unary_predicate(positive,2c,jg,ja)
+define_unary_predicate(zero,2d,je,je)
+
+# define_binary_operation(name,index,fix*fix,flo*flo, fixup)
+# define_binary_operation(  $1,   $2,     $3,     $4, $5)
+# The fixup is optional; only multiplication needs it to shift the
+# result back down by six bits.
+define_binary_operation(add,2b,add,fadd)
+define_binary_operation(subtract,28,sub,fsub)
+define_binary_operation(multiply,29,imul,fmul,
+       `OP(shr,q)      TW(IMM(6),REG(rax))')
+# Divide needs to check for 0, so we can't really use the following
+# define_binary_operation(divide,23,NONE,fdiv)
+
+# define_binary_predicate(name,index,fix*fix,flo*flo)
+define_binary_predicate(equal,24,je,je)
+define_binary_predicate(greater,25,jg,ja)
+define_binary_predicate(less,27,jl,jb)')
+
+IFN387(`define_jump_indirection(generic_decrement,22)
+define_jump_indirection(generic_divide,23)
+define_jump_indirection(generic_equal,24)
+define_jump_indirection(generic_greater,25)
+define_jump_indirection(generic_increment,26)
+define_jump_indirection(generic_less,27)
+define_jump_indirection(generic_subtract,28)
+define_jump_indirection(generic_multiply,29)
+define_jump_indirection(generic_negative,2a)
+define_jump_indirection(generic_add,2b)
+define_jump_indirection(generic_positive,2c)
+define_jump_indirection(generic_zero,2d)')
+
+# These don't currently differ according to whether there
+# is a 387 or not.
+
+define_jump_indirection(generic_quotient,37)
+define_jump_indirection(generic_remainder,38)
+define_jump_indirection(generic_modulo,39)
+
+define_jump_indirection(nofp_decrement,22)
+define_jump_indirection(nofp_divide,23)
+define_jump_indirection(nofp_equal,24)
+define_jump_indirection(nofp_greater,25)
+define_jump_indirection(nofp_increment,26)
+define_jump_indirection(nofp_less,27)
+define_jump_indirection(nofp_subtract,28)
+define_jump_indirection(nofp_multiply,29)
+define_jump_indirection(nofp_negative,2a)
+define_jump_indirection(nofp_add,2b)
+define_jump_indirection(nofp_positive,2c)
+define_jump_indirection(nofp_zero,2d)
+define_jump_indirection(nofp_quotient,37)
+define_jump_indirection(nofp_remainder,38)
+define_jump_indirection(nofp_modulo,39)
+\f
+IFDASM(`end')
+
+### Edwin Variables:
+### comment-column: 56
+### comment-start: "#"
+### End:
diff --git a/src/microcode/cmpintmd/x86-64-config.h b/src/microcode/cmpintmd/x86-64-config.h
new file mode 100644 (file)
index 0000000..7678d46
--- /dev/null
@@ -0,0 +1,32 @@
+/* -*-C-*-
+
+Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
+    1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
+    2006, 2007, 2008, 2009 Massachusetts Institute of Technology
+
+This file is part of MIT/GNU Scheme.
+
+MIT/GNU Scheme is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+MIT/GNU Scheme is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with MIT/GNU Scheme; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+USA.
+
+*/
+
+#ifndef SCM_CMPINTMD_CONFIG_H_INCLUDED
+#define SCM_CMPINTMD_CONFIG_H_INCLUDED 1
+
+#define COMPILER_PROCESSOR_TYPE COMPILER_X86_64_TYPE
+#define CC_IS_NATIVE 1
+
+#endif /* !SCM_CMPINTMD_CONFIG_H_INCLUDED */
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c
new file mode 100644 (file)
index 0000000..a7cc7f0
--- /dev/null
@@ -0,0 +1,372 @@
+/* -*-C-*-
+
+Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
+    1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
+    2006, 2007, 2008, 2009 Massachusetts Institute of Technology
+
+This file is part of MIT/GNU Scheme.
+
+MIT/GNU Scheme is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+MIT/GNU Scheme is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with MIT/GNU Scheme; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+USA.
+
+*/
+
+/* Compiled code interface for AMD x86-64.  */
+
+#include "cmpint.h"
+#include "extern.h"
+#include "outf.h"
+#include "errors.h"
+
+extern void * tospace_to_newspace (void *);
+\f
+bool
+read_cc_entry_type (cc_entry_type_t * cet, insn_t * address)
+{
+  return (decode_old_style_format_word (cet, (((uint16_t *) address) [-2])));
+}
+
+bool
+write_cc_entry_type (cc_entry_type_t * cet, insn_t * address)
+{
+  return (encode_old_style_format_word (cet, ((uint16_t *) address) - 2));
+}
+
+bool
+read_cc_entry_offset (cc_entry_offset_t * ceo, insn_t * address)
+{
+  uint16_t n = (((uint16_t *) address) [-1]);
+  (ceo->offset) = (n >> 1);
+  (ceo->continued_p) = ((n & 1) != 0);
+  return (false);
+}
+
+bool
+write_cc_entry_offset (cc_entry_offset_t * ceo, insn_t * address)
+{
+  if (! ((ceo->offset) < 0x4000))
+    return (true);
+  (((uint16_t *) address) [-1])
+    = (((ceo->offset) << 1) | ((ceo->continued_p) ? 1 : 0));
+  return (false);
+}
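+
+/* For illustration: an entry with offset 5 and continued_p clear is
+   stored as the 16-bit value ((5 << 1) | 0) = 0x000A, and read back
+   as offset 5, continued_p false.  */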
+\f
+/* Compiled closures */
+
+/* MOV RAX,imm64 has two bytes of opcode cruft before the imm64.  */
+
+insn_t *
+read_compiled_closure_target (insn_t * start)
+{
+  return (* ((insn_t **) (start + CC_ENTRY_HEADER_SIZE + 2)));
+}
+
+void
+write_compiled_closure_target (insn_t * target, insn_t * start)
+{
+  (* ((insn_t **) (start + CC_ENTRY_HEADER_SIZE + 2))) = target;
+}
+
+unsigned long
+compiled_closure_count (SCHEME_OBJECT * block)
+{
+  /* `block' is a pointer to the first object after the manifest;
+     the 32-bit entry count is stored at the start of that object.  */
+  return ((unsigned long) (* ((uint32_t *) block)));
+}
+
+insn_t *
+compiled_closure_start (SCHEME_OBJECT * block)
+{
+  /* Skip the 32-bit entry count.  */
+  return (((insn_t *) block) + 4);
+}
+
+insn_t *
+compiled_closure_entry (insn_t * start)
+{
+  return (start + CC_ENTRY_HEADER_SIZE);
+}
+
+insn_t *
+compiled_closure_next (insn_t * start)
+{
+  return (start + CC_ENTRY_HEADER_SIZE + 12);
+}
+
+SCHEME_OBJECT *
+skip_compiled_closure_padding (insn_t * start)
+{
+  /* The padding is the same size as the entry header (format word).  */
+  return ((SCHEME_OBJECT *) (start + CC_ENTRY_HEADER_SIZE));
+}
+
+SCHEME_OBJECT
+compiled_closure_entry_to_target (insn_t * entry)
+{
+  /* `entry' points to the start of the MOV RAX,imm64 instruction,
+     which has two bytes of opcode cruft before the imm64.  */
+  return (MAKE_CC_ENTRY (* ((long *) (entry + 2))));
+}
+\f
+/* Execution caches (UUO links)
+
+   An execution cache is a region of memory that lives in the
+   constants section of a compiled-code block.  It is an indirection
+   for calling external procedures that allows the linker to control
+   the calling process without having to find and change all the
+   places in the compiled code that refer to it.
+
+   Prior to linking, the execution cache has two pieces of
+   information: (1) the name of the procedure being called (a symbol),
+   and (2) the number of arguments that will be passed to the
+   procedure.  `saddr' points to the arity at the beginning of the
+   execution cache.  */
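+
+/* Schematically: before linking, (saddr[0]) holds the fixnum-tagged
+   frame size and (saddr[1]) holds the symbol; after linking, the
+   words starting at (saddr + 1) are overwritten with a MOV RAX,imm64
+   / JMP RAX sequence (see write_uuo_target below, and the layout in
+   cmpintmd/x86-64.h).  */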
+
+SCHEME_OBJECT
+read_uuo_symbol (SCHEME_OBJECT * saddr)
+{
+  return (saddr[1]);
+}
+
+unsigned int
+read_uuo_frame_size (SCHEME_OBJECT * saddr)
+{
+  return (* ((uint16_t *) saddr));
+}
+
+insn_t *
+read_uuo_target (SCHEME_OBJECT * saddr)
+{
+  insn_t * mov_addr = ((insn_t *) (saddr + 1));
+  return (* ((insn_t **) (mov_addr + 2)));
+}
+
+insn_t *
+read_uuo_target_no_reloc (SCHEME_OBJECT * saddr)
+{
+  return (read_uuo_target (saddr));
+}
+
+void
+write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr)
+{
+  /* Skip the arity. */
+  insn_t * addr = ((insn_t *) (saddr + 1));
+  (*addr++) = 0x48;            /* REX.W (64-bit operand size prefix) */
+  (*addr++) = 0xB8;            /* MOV RAX,imm64 */
+  (* ((insn_t **) addr)) = target;
+  addr += 8;
+  (*addr++) = 0xFF;            /* JMP reg/mem64 */
+  (*addr++) = 0xE0;            /* ModR/M for RAX */
+}
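+
+/* After write_uuo_target, the bytes at (saddr + 1) are
+
+     48 B8 <target, 8 bytes little-endian> FF E0
+
+   which disassembles as MOV RAX,imm64; JMP RAX, matching the
+   post-linking execute-cache layout in cmpintmd/x86-64.h.  */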
+\f
+#define BYTES_PER_TRAMPOLINE_ENTRY_PADDING 4
+#define OBJECTS_PER_TRAMPOLINE_ENTRY 2
+
+#define RSI_TRAMPOLINE_TO_INTERFACE_OFFSET                             \
+  ((COMPILER_REGBLOCK_N_FIXED + (2 * COMPILER_HOOK_SIZE))              \
+   * SIZEOF_SCHEME_OBJECT)
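+
+/* With COMPILER_REGBLOCK_N_FIXED = 16 and COMPILER_HOOK_SIZE = 1 (see
+   cmpauxmd/x86-64.m4), this is (16 + 2) * 8 = 144: each hook occupies
+   one object, and trampoline_to_interface is hook number 2.  */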
+
+unsigned long
+trampoline_entry_size (unsigned long n_entries)
+{
+  return (n_entries * OBJECTS_PER_TRAMPOLINE_ENTRY);
+}
+
+insn_t *
+trampoline_entry_addr (SCHEME_OBJECT * block, unsigned long index)
+{
+  return (((insn_t *) (block + 2 + (index * OBJECTS_PER_TRAMPOLINE_ENTRY)))
+         + BYTES_PER_TRAMPOLINE_ENTRY_PADDING + CC_ENTRY_HEADER_SIZE);
+}
+
+bool
+store_trampoline_insns (insn_t * entry, byte_t code)
+{
+  (*entry++) = 0xB0;           /* MOV AL,code */
+  (*entry++) = code;
+  (*entry++) = 0xFF;           /* CALL /2 disp32(RSI) */
+  (*entry++) = 0x96;
+  (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
+  X86_64_CACHE_SYNCHRONIZE ();
+  return (false);
+}
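+
+/* The stored sequence disassembles (in GAS syntax) roughly as
+
+     movb   $code,%al
+     callq  *RSI_TRAMPOLINE_TO_INTERFACE_OFFSET(%rsi)
+
+   i.e. it loads the trampoline's utility index into AL and calls the
+   trampoline_to_interface hook through the register block.  */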
+\f
+#ifdef _MACH_UNIX
+#  include <mach.h>
+#  define VM_PROT_SCHEME (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE)
+#endif
+
+#define SETUP_REGISTER(hook) do                                                \
+{                                                                      \
+  (* ((unsigned long *) (rsi_value + offset)))                         \
+    = ((unsigned long) (hook));                                                \
+  offset += (COMPILER_HOOK_SIZE * (sizeof (SCHEME_OBJECT)));           \
+  declare_builtin (((unsigned long) hook), #hook);                     \
+} while (0)
+
+void
+x86_64_reset_hook (void)
+{
+  int offset = (COMPILER_REGBLOCK_N_FIXED * (sizeof (SCHEME_OBJECT)));
+  unsigned char * rsi_value = ((unsigned char *) Registers);
+  bool fp_support_present = (x86_64_interface_initialize ());
+
+  /* These must match machines/x86-64/lapgen.scm */
+
+  SETUP_REGISTER (asm_scheme_to_interface);            /* 0 */
+  SETUP_REGISTER (asm_scheme_to_interface_call);       /* 1 */
+
+  if (offset != RSI_TRAMPOLINE_TO_INTERFACE_OFFSET)
+    {
+      outf_fatal ("\nx86_64_reset_hook: RSI_TRAMPOLINE_TO_INTERFACE_OFFSET\n");
+      Microcode_Termination (TERM_EXIT);
+    }
+  SETUP_REGISTER (asm_trampoline_to_interface);                /* 2 */
+
+  SETUP_REGISTER (asm_interrupt_procedure);            /* 3 */
+  SETUP_REGISTER (asm_interrupt_continuation);         /* 4 */
+  SETUP_REGISTER (asm_interrupt_closure);              /* 5 */
+  SETUP_REGISTER (asm_interrupt_dlink);                        /* 6 */
+  SETUP_REGISTER (asm_primitive_apply);                        /* 7 */
+  SETUP_REGISTER (asm_primitive_lexpr_apply);          /* 8 */
+  SETUP_REGISTER (asm_assignment_trap);                        /* 9 */
+  SETUP_REGISTER (asm_reference_trap);                 /* 10 */
+  SETUP_REGISTER (asm_safe_reference_trap);            /* 11 */
+  SETUP_REGISTER (asm_link);                           /* 12 */
+  SETUP_REGISTER (asm_error);                          /* 13 */
+  SETUP_REGISTER (asm_primitive_error);                        /* 14 */
+  /* [TRC 20091025: This was an i386 hack for when the PC is not
+     available, which on x86-64 it always is.] */
+  /* SETUP_REGISTER (asm_short_primitive_apply); */            /* 15 */
+
+  /* No more room for positive offsets without going to 32-bit
+     offsets!  */
+
+  /* This is a hack to make all the hooks be addressable with byte
+     offsets (instead of longword offsets).  The register block
+     extends to negative offsets as well, so all the following hooks
+     are accessed with negative offsets, and all fit in a byte.  */
+
+  /* [TRC 20091029: This hack doesn't work any longer; this code
+     should be cleaned up, since we must use longword offsets anyway.]
+     */
+
+  offset = -256;
+  if (fp_support_present)
+    {
+      SETUP_REGISTER (asm_generic_add);                        /* -32 */
+      SETUP_REGISTER (asm_generic_subtract);           /* -31 */
+      SETUP_REGISTER (asm_generic_multiply);           /* -30 */
+      SETUP_REGISTER (asm_generic_divide);             /* -29 */
+      SETUP_REGISTER (asm_generic_equal);              /* -28 */
+      SETUP_REGISTER (asm_generic_less);               /* -27 */
+      SETUP_REGISTER (asm_generic_greater);            /* -26 */
+      SETUP_REGISTER (asm_generic_increment);          /* -25 */
+      SETUP_REGISTER (asm_generic_decrement);          /* -24 */
+      SETUP_REGISTER (asm_generic_zero);               /* -23 */
+      SETUP_REGISTER (asm_generic_positive);           /* -22 */
+      SETUP_REGISTER (asm_generic_negative);           /* -21 */
+      SETUP_REGISTER (asm_generic_quotient);           /* -20 */
+      SETUP_REGISTER (asm_generic_remainder);          /* -19 */
+      SETUP_REGISTER (asm_generic_modulo);             /* -18 */
+    }
+  else
+    {
+      SETUP_REGISTER (asm_nofp_add);                   /* -32 */
+      SETUP_REGISTER (asm_nofp_subtract);              /* -31 */
+      SETUP_REGISTER (asm_nofp_multiply);              /* -30 */
+      SETUP_REGISTER (asm_nofp_divide);                        /* -29 */
+      SETUP_REGISTER (asm_nofp_equal);                 /* -28 */
+      SETUP_REGISTER (asm_nofp_less);                  /* -27 */
+      SETUP_REGISTER (asm_nofp_greater);               /* -26 */
+      SETUP_REGISTER (asm_nofp_increment);             /* -25 */
+      SETUP_REGISTER (asm_nofp_decrement);             /* -24 */
+      SETUP_REGISTER (asm_nofp_zero);                  /* -23 */
+      SETUP_REGISTER (asm_nofp_positive);              /* -22 */
+      SETUP_REGISTER (asm_nofp_negative);              /* -21 */
+      SETUP_REGISTER (asm_nofp_quotient);              /* -20 */
+      SETUP_REGISTER (asm_nofp_remainder);             /* -19 */
+      SETUP_REGISTER (asm_nofp_modulo);                        /* -18 */
+    }
+
+  SETUP_REGISTER (asm_sc_apply);                       /* -17 */
+  SETUP_REGISTER (asm_sc_apply_size_1);                        /* -16 */
+  SETUP_REGISTER (asm_sc_apply_size_2);                        /* -15 */
+  SETUP_REGISTER (asm_sc_apply_size_3);                        /* -14 */
+  SETUP_REGISTER (asm_sc_apply_size_4);                        /* -13 */
+  SETUP_REGISTER (asm_sc_apply_size_5);                        /* -12 */
+  SETUP_REGISTER (asm_sc_apply_size_6);                        /* -11 */
+  SETUP_REGISTER (asm_sc_apply_size_7);                        /* -10 */
+  SETUP_REGISTER (asm_sc_apply_size_8);                        /* -9 */
+  SETUP_REGISTER (asm_interrupt_continuation_2);       /* -8 */
+  /* [TRC 20091025: The cache synchronization bug does not occur in any
+      x86-64 machines of which I am aware.]
+
+  if (x86_64_cpuid_needed)
+    SETUP_REGISTER (asm_serialize_cache);              /\* -7 *\/
+  else
+    SETUP_REGISTER (asm_dont_serialize_cache);         /\* -7 *\/
+  */
+
+#ifdef _MACH_UNIX
+  {
+    vm_address_t addr;
+    vm_size_t size;
+    vm_prot_t prot;
+    vm_prot_t max_prot;
+    vm_inherit_t inheritance;
+    boolean_t shared;
+    port_t object;
+    vm_offset_t offset;
+
+    addr = ((vm_address_t) Heap);
+    if ((vm_region ((task_self ()), &addr, &size, &prot, &max_prot,
+                   &inheritance, &shared, &object, &offset))
+       != KERN_SUCCESS)
+      {
+       outf_fatal ( "compiler_reset: vm_region() failed.\n");
+       Microcode_Termination (TERM_EXIT);
+       /*NOTREACHED*/
+      }
+    if ((prot & VM_PROT_SCHEME) != VM_PROT_SCHEME)
+      {
+       if ((max_prot & VM_PROT_SCHEME) != VM_PROT_SCHEME)
+         {
+           outf_fatal (
+                       "compiler_reset: inadequate protection for Heap.\n");
+           outf_fatal ( "maximum = 0x%lx; desired = 0x%lx\n",
+                       ((unsigned long) (max_prot & VM_PROT_SCHEME)),
+                       ((unsigned long) VM_PROT_SCHEME));
+           Microcode_Termination (TERM_EXIT);
+           /*NOTREACHED*/
+         }
+       if ((vm_protect ((task_self ()), ((vm_address_t) Heap),
+                        (((char *) constant_end) - ((char *) Heap)),
+                        0, VM_PROT_SCHEME))
+           != KERN_SUCCESS)
+         {
+           outf_fatal ("Unable to change protection for Heap.\n");
+           outf_fatal ("actual = 0x%lx; desired = 0x%lx\n",
+                       ((unsigned long) (prot & VM_PROT_SCHEME)),
+                       ((unsigned long) VM_PROT_SCHEME));
+           Microcode_Termination (TERM_EXIT);
+           /*NOTREACHED*/
+         }
+      }
+  }
+#endif /* _MACH_UNIX */
+}
diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h
new file mode 100644 (file)
index 0000000..49d0f81
--- /dev/null
@@ -0,0 +1,351 @@
+/* -*-C-*-
+
+Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
+    1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
+    2006, 2007, 2008, 2009 Massachusetts Institute of Technology
+
+This file is part of MIT/GNU Scheme.
+
+MIT/GNU Scheme is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at
+your option) any later version.
+
+MIT/GNU Scheme is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with MIT/GNU Scheme; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+USA.
+
+*/
+
+/* Compiled code interface macros for AMD x86-64.  */
+
+#ifndef SCM_CMPINTMD_H_INCLUDED
+#define SCM_CMPINTMD_H_INCLUDED 1
+\f
+/*
+
+Problems with the AMD x86-64 instruction set architecture
+=========================================================
+
+1. Jumps are PC-relative.  There are absolute jumps, but they must
+   either go indirect through a data location or take an immediate
+   destination that includes a segment descriptor (16 bits).  The
+   short forms have a PC-relative
+   offset defined with respect to the immediately following
+   instruction.
+
+Problem: Closures and execute caches need their address in old space
+   in order to be relocated correctly.
+
+Fix:
+
+For execute caches we can define a new linker field, called
+load-relocation-address, which on every GC/relocation stores the new
+address and the old contents into global variables, and stores the
+new address in the field.  Alternatively, the difference between the
+new address and the old contents can be stored in a single global
+variable, which can be used, together with the new address of each
+cache, to find the old code.
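+
+As an illustrative sketch (hypothetical names, not the linker's
+actual variables), the single-global-variable scheme amounts to:
+
+	delta = (new_field_address - old_field_contents);
+	old_code = (new_cache_address - delta);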
+
+For closures the code that reads the header (manifest closure) can do
+the same.
+
+
+2. The CALL and JMP instructions do not accept 64-bit displacements.
+
+Problem: We want heaps bigger than 4 GB.
+
+Fix:
+
+Assemble more than one instruction for closure entry points, expanding
+them even more.  Yech.
+
+
+3. The stack pointer register (RSP) cannot be encoded as the base in
+   (base + displacement) addressing mode without a SIB byte.
+
+Problem: Indexing off the stack pointer is a common operation in the
+   compiler, which assumes direct access to the stack.
+
+Fix: Use base + indexed mode, which allows specification of RSP as
+   base and nullification of the index (by using RSP again).  This is
+   one byte longer than otherwise, but...
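+
+   For example (an illustrative encoding, easily checked against an
+   assembler): MOV RAX,(RSP) assembles as 0x48 0x8B 0x04 0x24.  The
+   SIB byte 0x24 selects RSP as the base, and RSP in the index field
+   means "no index"; that SIB byte is the extra byte mentioned above.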
+
+
+Register assignments
+====================
+
+RAX (0)                Unassigned
+RCX (1)                Unassigned
+RDX (2)                Unassigned
+RBX (3)                Unassigned
+
+RSP (4)                Stack Pointer
+RBP (5)                Register Mask
+RSI (6)                Pointer to register block, etc.
+RDI (7)                Free Pointer
+
+R8-R15         Unassigned
+
+The dynamic link and value "registers" are not processor registers.
+Slots in the register array must be reserved for them.
+[TRC 20091025: Later, we ought to use machine registers for these.]
+
+The Free Pointer is RDI because RDI is the implicit destination
+register for the memory-to-memory move instructions and the string
+store instruction.  Perhaps we can make use of this.
+
+The pointer to the register block is not held in RBP (which would
+typically serve as the processor's "frame" register) because its most
+common use, (RBP), the address syllable for memtop in memory, takes
+more bytes to encode than (RSI).
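+
+For example (encodings shown for illustration): CMP RDI,(RSI) encodes
+as 0x48 0x39 0x3E, three bytes, while CMP RDI,(RBP) must carry an
+explicit zero displacement byte, 0x48 0x39 0x7D 0x00, four bytes,
+because a ModR/M base of RBP with no displacement is reserved for
+RIP-relative addressing in 64-bit mode.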
+\f
+Encodings and layout of various control features
+================================================
+
+Assumptions:
+
+The processor will be in 64-bit mode.  Addresses are 64 bits wide;
+64-bit operands are selected with a REX.W prefix (the default operand
+size remains 32 bits); and displacements for addressing modes and
+relative jumps are at most 32 bits, sign-extended to 64.
+
+       Offset          Contents                Encoding
+
+
+- Execute cache entry encoding:
+
+               Before linking
+
+       0               16-bit arity    \
+       2               zero              [TC_FIXNUM | arity]
+       7               0x1A            /
+entry  8               symbol
+       16              <eight bytes of padding>
+       24              <next cache>
+
+               After linking
+
+       0               16-bit arity
+       2               zero
+       7               0x1A
+entry  8               MOV RAX,imm64           0x48 0xB8
+       10              <address>
+       18              JMP (RAX)               0xFF 0xE0
+	20-23		<four bytes of padding>
+       24              <next cache>
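+
+A sketch (a hypothetical helper, not the microcode's actual
+definition of read_uuo_target) of recovering the linked target from
+this layout: the imm64 operand begins two bytes past the entry
+point, and memcpy (from <string.h>) avoids alignment assumptions.
+
+	insn_t *
+	read_uuo_target_sketch (SCHEME_OBJECT * cache)
+	{
+	  insn_t * entry = (((insn_t *) cache) + 8);
+	  insn_t * target;
+	  memcpy ((&target), (entry + 2), (sizeof (target)));
+	  return (target);
+	}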
+
+
+- Closures:
+
+The only reason for a 32-bit entry count is to align everything
+nicely.
+
+       0               <closure manifest>
+       8               <entry count>
+       12              <type/arity info>       \__ format word
+       14              <gc offset>             /
+entry0 16              MOV RAX,imm64           0x48 0xB8
+       18              <address>
+       26              CALL (RAX)              0xFF 0xD0
+       28              <four bytes of padding or next format word>
+       ...
+       16*(n+1)        <variables>
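+
+For example, with two entries (n = 2): entry0's code occupies bytes
+16-27, entry1's format word replaces entry0's padding at bytes 28-31,
+entry1's code occupies bytes 32-43 plus padding to 47, and the
+variables begin at byte 48 = 16*(2+1).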
+
+
+- Trampoline encoding:
+
+       -8              <padding>
+       -4              <type/arity info>
+       -2              <gc offset>
+entry  0               MOV     AL,code         0xB0, code-byte
+       2               CALL    n(RSI)          0xFF 0x96 n-longword
+       8               <trampoline dependent storage>
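+
+(As a check on the byte accounting: MOV AL,code occupies bytes 0-1,
+and CALL n(RSI) is 0xFF 0x96 followed by a 32-bit displacement,
+bytes 2-7, so the trampoline-dependent storage indeed begins at byte
+8.)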
+
+
+[TRC 20091027: The next two are wrong; need to update.]
+
+- GC & interrupt check at procedure/continuation entry:
+
+gc_lab -7              CALL    n(RSI)          0xFF 0x56 n-byte
+       -4              <type/arity info>
+       -2              <gc offset>
+entry  0               CMP     RDI,(RSI)       0x48 0x39 0x3e
+       3               JAE     gc_lab          0x73 -12
+       5               <real code>
+
+
+- GC & interrupt check at closure entry:
+
+gc_lab -11             ADD     (RSP),&offset   0x83 0x04 0x24 offset-byte
+       -7              JMP     n(RSI)          0xFF 0x66 n-byte
+       -4              <type/arity info>
+       -2              <gc offset>
+entry  0               ADD     (RSP),&magic    0x81 0x04 0x24 magic-longword
+       7               CMP     RDI,(RSI)       0x39 0x3e
+       9               JAE     gc_lab          0x73 0xea (= -22)
+       11              <real code>
+
+The magic value depends on the closure because of canonicalization.
+
+The ADD instruction at offset -11 is not present for the 0th closure
+entry, since it is the canonical entry point.  Its format depends on
+the value of offset, since the sign-extending forms often suffice.
+
+offset = entry_number * entry_size
+magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction))
+
+*/
+\f
+#define ASM_RESET_HOOK x86_64_reset_hook
+#define FPE_RESET_TRAPS x86_64_interface_initialize
+
+#define CMPINT_USE_STRUCS 1
+
+/* These next definitions must agree with "cmpauxmd/x86-64.m4", which is
+   where the register block is allocated.  */
+#define COMPILER_REGBLOCK_N_FIXED 16
+/* Big enough to hold an 80-bit floating-point value: */
+#define COMPILER_TEMP_SIZE 2
+#define COMPILER_REGBLOCK_N_TEMPS 256
+#define COMPILER_REGBLOCK_N_HOOKS 80
+#define COMPILER_HOOK_SIZE 1
+
+#define COMPILER_REGBLOCK_EXTRA_SIZE                                   \
+  (COMPILER_REGBLOCK_N_HOOKS * COMPILER_HOOK_SIZE)
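+
+/* For illustration (assuming the block is laid out as fixed
+   registers, then temporaries, then hooks, per cmpauxmd/x86-64.m4):
+   these definitions imply a register block of
+   16 + (256 * 2) + (80 * 1) = 608 words.  */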
+
+#define REGBLOCK_ALLOCATED_BY_INTERFACE true
+
+typedef byte_t insn_t;
+
+/* Number of insn_t units preceding the entry address in which the
+   header (type and offset info) is stored.  */
+#define CC_ENTRY_HEADER_SIZE (CC_ENTRY_TYPE_SIZE + CC_ENTRY_OFFSET_SIZE)
+#define CC_ENTRY_TYPE_SIZE 2
+#define CC_ENTRY_OFFSET_SIZE 2
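+
+/* Illustrative only (these are not macros used by the microcode):
+   the four header bytes immediately precede the entry address, two
+   bytes of type then two bytes of offset, as in the layouts above,
+   so on this little-endian machine they could be read as
+
+	type   = (((uint16_t *) entry) [-2]);
+	offset = (((uint16_t *) entry) [-1]);
+*/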
+
+/* Number of insn_t units preceding the entry header in which the GC
+   trap instructions are stored.  */
+#define CC_ENTRY_GC_TRAP_SIZE 3
+\f
+#define EMBEDDED_CLOSURE_ADDRS_P 1
+
+#define DECLARE_RELOCATION_REFERENCE(name)
+
+#define START_CLOSURE_RELOCATION(scan, ref)    do {} while (0)
+#define START_OPERATOR_RELOCATION(scan, ref)   do {} while (0)
+
+#define OPERATOR_RELOCATION_OFFSET 0
+
+#define READ_COMPILED_CLOSURE_TARGET(a, r)                             \
+  read_compiled_closure_target (a)
+
+/* Size of an execute cache in SCHEME_OBJECTs.  */
+#define UUO_LINK_SIZE 3
+
+#define UUO_WORDS_TO_COUNT(nw) ((nw) / UUO_LINK_SIZE)
+#define UUO_COUNT_TO_WORDS(nc) ((nc) * UUO_LINK_SIZE)
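+
+/* For reference: 3 SCHEME_OBJECTs at 8 bytes each is 24 bytes, which
+   matches the <next cache> offset in the execute-cache layout
+   above.  */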
+
+#define READ_UUO_TARGET(a, r) read_uuo_target (a)
+
+#define FLUSH_I_CACHE() X86_64_CACHE_SYNCHRONIZE ()
+#define FLUSH_I_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
+#define PUSH_D_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
+
+/* [TRC 20091025: The cache synchronization bug does not occur on any
+    x86-64 machine of which I am aware.] */
+
+#define X86_64_CACHE_SYNCHRONIZE() do {} while (0)
+
+/*
+#define X86_64_CACHE_SYNCHRONIZE() do                                  \
+{                                                                      \
+  if (x86_64_cpuid_needed)                                             \
+    x86_64_cache_synchronize ();                                       \
+} while (false)
+*/
+\f
+#if defined(__OS2__) && (defined(__IBMC__) || defined(__WATCOMC__))
+#  define ASM_ENTRY_POINT(name) (_System name)
+#elif defined(__WIN32__) && defined(__WATCOMC__)
+#  define ASM_ENTRY_POINT(name) (__cdecl name)
+#else
+#  define ASM_ENTRY_POINT(name) name
+#endif
+
+extern int ASM_ENTRY_POINT (x86_64_interface_initialize) (void);
+
+extern void asm_assignment_trap (void);
+extern void asm_dont_serialize_cache (void);
+extern void asm_error (void);
+extern void asm_generic_add (void);
+extern void asm_generic_decrement (void);
+extern void asm_generic_divide (void);
+extern void asm_generic_equal (void);
+extern void asm_generic_greater (void);
+extern void asm_generic_increment (void);
+extern void asm_generic_less (void);
+extern void asm_generic_modulo (void);
+extern void asm_generic_multiply (void);
+extern void asm_generic_negative (void);
+extern void asm_generic_positive (void);
+extern void asm_generic_quotient (void);
+extern void asm_generic_remainder (void);
+extern void asm_generic_subtract (void);
+extern void asm_generic_zero (void);
+extern void asm_interrupt_closure (void);
+extern void asm_interrupt_continuation (void);
+extern void asm_interrupt_continuation_2 (void);
+extern void asm_interrupt_dlink (void);
+extern void asm_interrupt_procedure (void);
+extern void asm_link (void);
+extern void asm_nofp_add (void);
+extern void asm_nofp_decrement (void);
+extern void asm_nofp_divide (void);
+extern void asm_nofp_equal (void);
+extern void asm_nofp_greater (void);
+extern void asm_nofp_increment (void);
+extern void asm_nofp_less (void);
+extern void asm_nofp_modulo (void);
+extern void asm_nofp_multiply (void);
+extern void asm_nofp_negative (void);
+extern void asm_nofp_positive (void);
+extern void asm_nofp_quotient (void);
+extern void asm_nofp_remainder (void);
+extern void asm_nofp_subtract (void);
+extern void asm_nofp_zero (void);
+extern void asm_primitive_apply (void);
+extern void asm_primitive_error (void);
+extern void asm_primitive_lexpr_apply (void);
+extern void asm_reference_trap (void);
+extern void asm_safe_reference_trap (void);
+extern void asm_sc_apply (void);
+extern void asm_sc_apply_size_1 (void);
+extern void asm_sc_apply_size_2 (void);
+extern void asm_sc_apply_size_3 (void);
+extern void asm_sc_apply_size_4 (void);
+extern void asm_sc_apply_size_5 (void);
+extern void asm_sc_apply_size_6 (void);
+extern void asm_sc_apply_size_7 (void);
+extern void asm_sc_apply_size_8 (void);
+extern void asm_scheme_to_interface (void);
+extern void asm_scheme_to_interface_call (void);
+extern void asm_serialize_cache (void);
+/* [TRC 20091025: This was an i386 hack for when the PC is not
+   available, which on x86-64 it always is.] */
+/* extern void asm_short_primitive_apply (void); */
+extern void asm_trampoline_to_interface (void);
+
+/* extern void x86_64_cache_synchronize (void); */
+/* extern void start_closure_relocation (SCHEME_OBJECT *, reloc_ref_t *); */
+extern insn_t * read_compiled_closure_target (insn_t *);
+/* extern void start_operator_relocation (SCHEME_OBJECT *, reloc_ref_t *); */
+extern insn_t * read_uuo_target (SCHEME_OBJECT *);
+extern void x86_64_reset_hook (void);
+
+extern int x86_64_cpuid_needed;
+
+#endif /* !SCM_CMPINTMD_H_INCLUDED */
index eaf38be517ccd96e13f2f83534409c2870577c37..be6837643b301fd72f86976c0db7b9b62290d540 100644 (file)
@@ -225,7 +225,8 @@ typedef enum
   COMPILER_ALPHA_TYPE,
   COMPILER_MIPS_TYPE,
   COMPILER_C_TYPE,
-  COMPILER_SVM_TYPE
+  COMPILER_SVM_TYPE,
+  COMPILER_X86_64_TYPE
 } cc_arch_t;
 
 #include "cmpintmd-config.h"
@@ -609,6 +610,8 @@ extern void win32_stack_reset (void);
 #ifdef __x86_64__
 #  define MACHINE_TYPE         "x86-64"
 #  define CURRENT_FASL_ARCH    FASL_X86_64
+#  define PC_ZERO_BITS         0
+#  define HEAP_IN_LOW_MEMORY   1
 #endif
 
 #ifdef __ia64__
index 91fcb0d74e0923b4305f4024fbbc8c466363cbc4..bc89addf0a8818ac23259b40978d76d41136bd3c 100644 (file)
@@ -117,6 +117,7 @@ cc_arch_name (void)
     case COMPILER_MIPS_TYPE: return ("mips");
     case COMPILER_C_TYPE: return ("c");
     case COMPILER_SVM_TYPE: return ("svm1");
+    case COMPILER_X86_64_TYPE: return ("x86-64");
     default: return (0);
     }
 }