From abb9331ce3cd6dcfd424e64b4999a46742d1ba53 Mon Sep 17 00:00:00 2001 From: Stephen Adams Date: Wed, 22 Oct 1997 06:47:47 +0000 Subject: [PATCH] Implemented a pattern based peephole optimizer. Main improvements: 1. every CONS-[MULTI]CLOSURE saves one instruction 2. most funcalls save one stack access 3. fixnum arithmetic (plus and minus constants) 4. fixnum case of generic add/subtract constant 5. fixnum bitwise operations with constants (and/or/not) 1 and 2 give about 5% on closure/funcall heavy code. 3-5 are improvements made possible by the screwy fixnum tag. `Natural' fixnum tags would make them obsolete (but some of the rewrites are as good). 5 improves hash table operations up to 10% due to how `flags' are implemented. --- v7/src/compiler/machines/i386/lapopt.scm | 343 ++++++++++++++++++++++- 1 file changed, 339 insertions(+), 4 deletions(-) diff --git a/v7/src/compiler/machines/i386/lapopt.scm b/v7/src/compiler/machines/i386/lapopt.scm index bf21f9ac6..490abbdbf 100644 --- a/v7/src/compiler/machines/i386/lapopt.scm +++ b/v7/src/compiler/machines/i386/lapopt.scm @@ -1,8 +1,8 @@ #| -*-Scheme-*- -$Header: /Users/cph/tmp/foo/mit-scheme/mit-scheme/v7/src/compiler/machines/i386/lapopt.scm,v 1.2 1992/02/28 20:22:42 jinx Exp $ +$Id: lapopt.scm,v 1.3 1997/10/22 06:47:47 adams Exp $ -Copyright (c) 1992 Massachusetts Institute of Technology +Copyright (c) 1992-1997 Massachusetts Institute of Technology This material was developed by the Scheme project at the Massachusetts Institute of Technology, Department of Electrical Engineering and @@ -33,8 +33,343 @@ promotional, or sales literature without prior written consent from MIT in each case. |# ;;;; LAP Optimizer for Intel i386. +;;; package: (compiler lap-optimizer) (declare (usual-integrations)) - + (define (optimize-linear-lap instructions) - instructions) \ No newline at end of file + (rewrite-lap instructions)) + +;; i386 LAPOPT uses its own pattern matcher because we want to match +;; patterns while ignoring comments. + +(define (comment? thing) + (and (pair? thing) (eq? (car thing) 'COMMENT))) + +(define (match pat thing dict) ; -> #F or dictionary (alist) + (if (pair? pat) + (if (eq? (car pat) '?) + (cond ((assq (cadr pat) dict) + => (lambda (pair) + (and (equal? (cdr pair) thing) + dict))) + (else (cons (cons (cadr pat) thing) dict))) + (and (pair? thing) + (let ((dict* (match (car pat) (car thing) dict))) + (and dict* + (match (cdr pat) (cdr thing) dict*))))) + (and (eqv? pat thing) + dict))) + +(define (match-sequence pats things dict comments success fail) + ;; SUCCESS = (lambda (dict* comments* things-tail) ...) + ;; FAIL = (lambda () ...) + + (define (eat-comment) + (match-sequence pats (cdr things) dict (cons (car things) comments) + success fail)) + + (cond ((not (pair? pats)) ; i.e. null + (if (and (pair? things) + (comment? (car things))) + (eat-comment) + (success dict comments things))) + ((not (pair? things)) + (fail)) + ((comment? (car things)) + (eat-comment)) + ((match (car pats) (car things) dict) + => (lambda (dict*) + (match-sequence (cdr pats) (cdr things) dict* comments + success fail))) + (else (fail)))) + +(define-structure + (rule) + name ; used only for information + pattern ; INSNs (in reverse order) + predicate ; (lambda (dict) ...) -> bool + constructor) ; (lambda (dict) ...) -> lap + + +(define *rules* '()) + +(define (define-lapopt name pattern predicate constructor) + (set! *rules* + (cons (make-rule name + (reverse! pattern) + (if ((access procedure? system-global-environment) + predicate) + predicate + (lambda (dict) dict #T)) + constructor) + *rules*)) + name) + +;; Rules are tried in the reverse order in which they are defined. +;; +;; Rules are matched against the LAP from the bottom up. +;; +;; Once a rule has been applied, the rewritten LAP is matched again, +;; so a rule must rewrite to something different to avoid a loop. +;; (One way to ensure this is to always rewrite to fewer instructions.) + +(define (rewrite-lap lap) + (let loop ((unseen (reverse lap)) (finished '())) + (if (null? unseen) + finished + (if (comment? (car unseen)) + (loop (cdr unseen) (cons (car unseen) finished)) + (let try-rules ((rules *rules*)) + (if (null? rules) + (loop (cdr unseen) (cons (car unseen) finished)) + (let ((rule (car rules))) + (match-sequence + (rule-pattern rule) + unseen + '(("empty")) ; initial dict, distinct from #F and () + '() ; initial comments + (lambda (dict comments unseen*) + (let ((dict (alist->dict dict))) + (if ((rule-predicate rule) dict) + (let ((rewritten + (cons + `(COMMENT (LAP-OPT ,(rule-name rule))) + (append comments + ((rule-constructor rule) dict))))) + (loop (append (reverse rewritten) unseen*) + finished)) + (try-rules (cdr rules))))) + (lambda () + (try-rules (cdr rules))))))))))) + +;; The DICT passed to the rule predicate and action procedures is a +;; procedure mapping pattern names to their matched values. + +(define (alist->dict dict) + (lambda (symbol) + (cond ((assq symbol dict) => cdr) + (else (error "Undefined lapopt pattern symbol" symbol dict))))) + +(define-lapopt 'PUSH-POP->MOVE + `((PUSH (? reg1)) + (POP (? reg2))) + #F + (lambda (dict) + `((MOV W ,(dict 'reg2) ,(dict 'reg1))))) + +(define-lapopt 'PUSH-POP->NOP + `((PUSH (? reg)) + (POP (? reg))) + #F + (lambda (dict) + dict + `())) + +;; The following rules must have the JMP else we don't know if the +;; register that we are avoiding loading is dead. + +(define-lapopt 'LOAD-PUSH-POP-JUMP->REGARGETTED-LOAD-JUMP + ;; Note that reg1 must match a register because of the PUSH insn. + `((MOV W (? reg1) (? ea/value)) + (PUSH (? reg1)) + (POP (R ,ecx)) + (JMP (@RO B 6 (? hook-offset)))) + #F + (lambda (dict) + `((MOV W (R ,ecx) ,(dict 'ea/value)) + (JMP (@RO B 6 ,(dict 'hook-offset)))))) + +(define-lapopt 'LOAD-STACKTOPWRITE-POP-JUMP->RETARGETTED-LOAD-JUMP + `((MOV W (? reg) (? ea/value)) + (MOV W (@r ,esp) (? reg)) + (POP (R ,ecx)) + (JMP (@RO B 6 (? hook-offset)))) + #F + (lambda (dict) + `((MOV W (R ,ecx) ,(dict 'ea/value)) + (ADD W (R ,esp) (& 4)) + (JMP (@RO B 6 ,(dict 'hook-offset)))))) + + +(define-lapopt 'STACKWRITE-POP-JUMP->RETARGETTED-LOAD-JUMP + `((MOV W (@RO B ,esp (? stack-offset)) (? ea/value)) + (ADD W (R ,esp) (& (? stack-offset))) + (POP (R ,ecx)) + (JMP (@RO B 6 (? hook-offset)))) + #F + (lambda (dict) + `((MOV W (R ,ecx) ,(dict 'ea/value)) + (ADD W (R ,esp) (& ,(+ 4 (dict 'stack-offset)))) + (JMP (@RO B 6 ,(dict 'hook-offset)))))) + + + +;; The following rules recognize arithmetic followed by tag injection, +;; and fold the tag-injection into the arithmetic. We can do this +;; because we know the bottom six bits of the fixnum are all 0. This +;; is particularly crafty in the generic arithmetic case, as it does +;; not mess up the overflow detection. +;; +;; These patterns match the code generated by subtractions too. + +(define fixnum-tag (object-type 1)) + +(define-lapopt 'FIXNUM-ADD-CONST-TAG + `((ADD W (R (? reg)) (& (? const))) + (OR W (R (? reg)) (& ,fixnum-tag)) + (ROR W (R (? reg)) (& 6))) + #F + (lambda (dict) + `((ADD W (R ,(dict 'reg)) (& ,(+ (dict 'const) fixnum-tag))) + (ROR W (R ,(dict 'reg)) (& 6))))) + +(define-lapopt 'FIXNUM-ADD-REG-TAG + `((ADD W (R (? reg)) (R (? reg-2))) + (OR W (R (? reg)) (& ,fixnum-tag)) + (ROR W (R (? reg)) (& 6))) + #F + (lambda (dict) + `((LEA (R ,(dict 'reg)) (@ROI B ,(dict 'reg) ,fixnum-tag ,(dict 'reg-2) 1)) + (ROR W (R ,(dict 'reg)) (& 6))))) + +(define-lapopt 'GENERIC-ADD-TAG + `((ADD W (R (? reg)) (& (? const))) + (JO (@PCR (? label))) + (OR W (R (? reg)) (& ,fixnum-tag)) + (ROR W (R (? reg)) (& 6))) + #F + (lambda (dict) + `((ADD W (R ,(dict 'reg)) (& ,(+ (dict 'const) fixnum-tag))) + (JO (@PCR ,(dict 'label))) + (ROR W (R ,(dict 'reg)) (& 6))))) + +;; If the fixnum tag is even, the zero LSB works as a place to hold +;; the overflow from addition which can be discarded by masking it +;; out. We must arrange that the constant is positive, so we don't +;; borrow from the tag bits. + +(if (even? fixnum-tag) + (define-lapopt 'FIXNUM-ADD-CONST-IN-PLACE + `((SAL W (? reg) (& ,scheme-type-width)) + (ADD W (? reg) (& (? const))) + (OR W (? reg) (& ,fixnum-tag)) + (ROR W (? reg) (& ,scheme-type-width))) + #F + (lambda (dict) + (let ((const (sar-32 (dict 'const) scheme-type-width)) + (mask (make-non-pointer-literal + fixnum-tag + (-1+ (expt 2 scheme-datum-width))))) + (let ((const + (if (negative? const) + (+ const (expt 2 scheme-datum-width)) + const))) + `(,(if (= const 1) + `(INC W ,(dict 'reg)) ; shorter instruction + `(ADD W ,(dict 'reg) (& ,const))) + (AND W ,(dict 'reg) (& ,mask)))))))) + +;; Similar tag-injection combining rule for fix:or is a little more +;; general. + +(define (or-32-signed x y) + (bit-string->signed-integer + (bit-string-or (signed-integer->bit-string 32 x) + (signed-integer->bit-string 32 y)))) + +(define (ror-32-signed w count) + (let ((bs (signed-integer->bit-string 32 w))) + (bit-string->signed-integer + (bit-string-append (bit-substring bs count 32) + (bit-substring bs 0 count))))) + +(define (sar-32 w count) + (let ((bs (signed-integer->bit-string 32 w))) + (bit-string->signed-integer (bit-substring bs count 32)))) + +(define-lapopt 'OR-OR + `((OR W (R (? reg)) (& (? const-1))) + (OR W (R (? reg)) (& (? const-2)))) + #F + (lambda (dict) + `((OR W (R ,(dict 'reg)) + (& ,(or-32-signed (dict 'const-1) (dict 'const-2))))))) + +;; These rules match a whole fixnum detag-AND/OR-retag operation. In +;; principle, these operations could be done in rulfix.scm, but the +;; instruction combiner wants all the intermediate steps. + +(define-lapopt 'FIXNUM-OR-CONST-IN-PLACE + `((SAL W (? reg) (& ,scheme-type-width)) + (OR W (? reg) (& (? const))) + (OR W (? reg) (& ,fixnum-tag)) + (ROR W (? reg) (& ,scheme-type-width))) + #F + (lambda (dict) + `((OR W ,(dict 'reg) + (& ,(careful-object-datum + (sar-32 (dict 'const) scheme-type-width))))))) + +(define-lapopt 'FIXNUM-AND-CONST-IN-PLACE + `((SAL W (? reg) (& ,scheme-type-width)) + (AND W (? reg) (& (? const))) + (OR W (? reg) (& ,fixnum-tag)) + (ROR W (? reg) (& ,scheme-type-width))) + #F + (lambda (dict) + `((AND W ,(dict 'reg) + (& ,(make-non-pointer-literal + fixnum-tag + (careful-object-datum + (sar-32 (dict 'const) scheme-type-width)))))))) + +;; FIXNUM-NOT. The first (partial) pattern uses the XOR operation to +;; put the tag bits in the low part of the result. This pattern +;; occurs in the hash table hash functions, where the OBJECT->FIXNUM +;; has been shared by CSE. + +(define-lapopt 'FIXNUM-NOT-TAG + `((NOT W (? reg)) + (AND W (? reg) (& #x-40)) + (OR W (? reg) (& ,fixnum-tag)) + (ROR W (? reg) (& ,scheme-type-width))) + #F + (lambda (dict) + (let ((magic-bits (+ (* -1 (expt 2 scheme-type-width)) fixnum-tag))) + `((XOR W ,(dict 'reg) (& ,magic-bits)) + (ROR W ,(dict 'reg) (& ,scheme-type-width)))))) + +(define-lapopt 'FIXNUM-NOT-IN-PLACE + `((SAL W (? reg) (& ,scheme-type-width)) + (NOT W (? reg)) + (AND W (? reg) (& #x-40)) + (OR W (? reg) (& ,fixnum-tag)) + (ROR W (? reg) (& ,scheme-type-width))) + #F + (lambda (dict) + `((XOR W ,(dict 'reg) (& ,(-1+ (expt 2 scheme-datum-width))))))) + + +;; CLOSURES +;; +;; This rule recognizes code duplicated at the end of the CONS-CLOSURE +;; and CONS-MULTICLOSURE and the following CONS-POINTER. (This happens +;; because of the hack of storing the entry point as a tagged object +;; in the closure to allow GC to work correctly with relative jumps in +;; the closure code. A better fix would be to alter the GC to make +;; absolute the addresses during closure transport.) +;; +;; The rule relies on the fact the REG-TEMP is a temporary for the +;; expansions of CONS-CLOSURE and CONS-MULTICLOSURE, so it is dead +;; afterwards, and is specific in matching because it is the only code +;; that stores an entry at a negative offset from the free pointer. + +(define-lapopt 'CONS-CLOSURE-FIXUP + `((LEA (? reg-temp) (@RO UW (? regno-closure) #xA0000000)) + (MOV W (@RO B ,regnum:free-pointer -4) (? regno-temp)) + (LEA (? reg-object) (@RO UW (? regno-closure) #xA0000000))) + #F + (lambda (dict) + `((LEA ,(dict 'reg-object) (@RO UW ,(dict 'regno-closure) #xA0000000)) + (MOV W (@RO B ,regnum:free-pointer -4) ,(dict 'reg-object))))) -- 2.25.1