From abb9331ce3cd6dcfd424e64b4999a46742d1ba53 Mon Sep 17 00:00:00 2001
From: Stephen Adams <edu/mit/csail/zurich/adams>
Date: Wed, 22 Oct 1997 06:47:47 +0000
Subject: [PATCH] Implemented a pattern based peephole optimizer.

Main improvements:

 1. every CONS-[MULTI]CLOSURE saves one instruction
 2. most funcalls save one stack access
 3. fixnum arithmetic (plus and minus constants)
 4. fixnum case of generic add/subtract constant
 5. fixnum bitwise operations with constants (and/or/not)

1 and 2 give about 5% on closure/funcall heavy code.

3-5 are improvements made possible by the screwy fixnum tag.
`Natural' fixnum tags would make them obsolete (but some of the
rewrites are as good). 5 improves hash table operations up to 10% due
to how `flags' are implemented.
---
 v7/src/compiler/machines/i386/lapopt.scm | 343 ++++++++++++++++++++++-
 1 file changed, 339 insertions(+), 4 deletions(-)

diff --git a/v7/src/compiler/machines/i386/lapopt.scm b/v7/src/compiler/machines/i386/lapopt.scm
index bf21f9ac6..490abbdbf 100644
--- a/v7/src/compiler/machines/i386/lapopt.scm
+++ b/v7/src/compiler/machines/i386/lapopt.scm
@@ -1,8 +1,8 @@
 #| -*-Scheme-*-
 
-$Header: /Users/cph/tmp/foo/mit-scheme/mit-scheme/v7/src/compiler/machines/i386/lapopt.scm,v 1.2 1992/02/28 20:22:42 jinx Exp $
+$Id: lapopt.scm,v 1.3 1997/10/22 06:47:47 adams Exp $
 
-Copyright (c) 1992 Massachusetts Institute of Technology
+Copyright (c) 1992-1997 Massachusetts Institute of Technology
 
 This material was developed by the Scheme project at the Massachusetts
 Institute of Technology, Department of Electrical Engineering and
@@ -33,8 +33,343 @@ promotional, or sales literature without prior written consent from
 MIT in each case. |#
 
 ;;;; LAP Optimizer for Intel i386.
+;;; package: (compiler lap-optimizer)
 
 (declare (usual-integrations))
-
+
 (define (optimize-linear-lap instructions)
-  instructions)
\ No newline at end of file
+  (rewrite-lap instructions))
+
+;; i386 LAPOPT uses its own pattern matcher because we want to match
+;; patterns while ignoring comments.
+
+(define (comment? thing)
+  (and (pair? thing) (eq? (car thing) 'COMMENT)))
+
+(define (match pat thing dict)		; -> #F or dictionary (alist)
+  (if (pair? pat)
+      (if (eq? (car pat) '?)
+	  (cond ((assq (cadr pat) dict)
+		 => (lambda (pair)
+		      (and (equal? (cdr pair) thing)
+			   dict)))
+		(else (cons (cons (cadr pat) thing) dict)))
+	  (and (pair? thing)
+	       (let ((dict* (match (car pat) (car thing) dict)))
+		 (and dict*
+		      (match (cdr pat) (cdr thing) dict*)))))
+      (and (eqv? pat thing)
+	   dict)))
+
+(define (match-sequence pats things dict comments success fail)
+  ;; SUCCESS = (lambda (dict* comments* things-tail) ...)
+  ;; FAIL =  (lambda () ...)
+
+  (define (eat-comment)
+    (match-sequence pats (cdr things) dict (cons (car things) comments)
+		    success fail))
+
+  (cond ((not (pair? pats)) ; i.e. null
+	 (if (and (pair? things)
+		  (comment? (car things)))
+	     (eat-comment)
+	     (success dict comments things)))
+	((not (pair? things))
+	 (fail))
+	((comment? (car things))
+	 (eat-comment))
+	((match (car pats) (car things) dict)
+	 => (lambda (dict*)
+	      (match-sequence (cdr pats) (cdr things) dict* comments
+			      success fail)))
+	(else (fail))))
+
+(define-structure
+    (rule)
+  name					; used only for information
+  pattern				; INSNs (in reverse order)
+  predicate				; (lambda (dict) ...) -> bool
+  constructor)				; (lambda (dict) ...) -> lap
+
+
+(define *rules* '())
+
+(define (define-lapopt name pattern predicate constructor)
+  (set! *rules*
+	(cons (make-rule name
+			 (reverse! pattern)
+			 (if ((access procedure? system-global-environment)
+			      predicate)
+			     predicate
+			     (lambda (dict) dict #T))
+			 constructor)
+	      *rules*))
+  name)
+
+;; Rules are tried in the reverse order in which they are defined.
+;;
+;; Rules are matched against the LAP from the bottom up.
+;;
+;; Once a rule has been applied, the rewritten LAP is matched again,
+;; so a rule must rewrite to something different to avoid a loop.
+;; (One way to ensure this is to always rewrite to fewer instructions.)
+
+(define (rewrite-lap lap)
+  (let loop ((unseen (reverse lap)) (finished '()))
+    (if (null? unseen)
+	finished
+	(if (comment? (car unseen))
+	    (loop (cdr unseen) (cons (car unseen) finished))
+	    (let try-rules ((rules *rules*))
+	      (if (null? rules)
+		  (loop (cdr unseen) (cons (car unseen) finished))
+		  (let ((rule (car rules)))
+		    (match-sequence
+		     (rule-pattern rule)
+		     unseen
+		     '(("empty"))	; initial dict, distinct from #F and ()
+		     '()		; initial comments
+		     (lambda (dict comments unseen*)
+		       (let ((dict (alist->dict dict)))
+			 (if ((rule-predicate rule) dict)
+			     (let ((rewritten
+				    (cons
+				     `(COMMENT (LAP-OPT ,(rule-name rule)))
+				     (append comments
+					     ((rule-constructor rule) dict)))))
+			       (loop (append (reverse rewritten) unseen*)
+				     finished))
+			     (try-rules (cdr rules)))))
+		     (lambda ()
+		       (try-rules (cdr rules)))))))))))
+
+;; The DICT passed to the rule predicate and action procedures is a
+;; procedure mapping pattern names to their matched values.
+
+(define (alist->dict dict)
+  (lambda (symbol)
+    (cond ((assq symbol dict) => cdr)
+	  (else (error "Undefined lapopt pattern symbol" symbol dict)))))
+
+(define-lapopt 'PUSH-POP->MOVE
+  `((PUSH (? reg1))
+    (POP  (? reg2)))
+  #F
+  (lambda (dict)
+    `((MOV W ,(dict 'reg2) ,(dict 'reg1)))))
+
+(define-lapopt 'PUSH-POP->NOP
+  `((PUSH (? reg))
+    (POP  (? reg)))
+  #F
+  (lambda (dict)
+    dict
+    `()))
+
+;; The following rules must have the JMP else we don't know if the
+;; register that we are avoiding loading is dead.
+
+(define-lapopt 'LOAD-PUSH-POP-JUMP->REGARGETTED-LOAD-JUMP
+  ;; Note that reg1 must match a register because of the PUSH insn.
+  `((MOV W (? reg1) (? ea/value))
+    (PUSH (? reg1))
+    (POP  (R ,ecx))
+    (JMP (@RO B 6 (? hook-offset))))
+  #F
+  (lambda (dict)
+    `((MOV W (R ,ecx) ,(dict 'ea/value))
+      (JMP (@RO B 6 ,(dict 'hook-offset))))))
+
+(define-lapopt 'LOAD-STACKTOPWRITE-POP-JUMP->RETARGETTED-LOAD-JUMP
+  `((MOV W (? reg) (? ea/value))
+    (MOV W (@r ,esp) (? reg))
+    (POP (R ,ecx))
+    (JMP (@RO B 6 (? hook-offset))))
+  #F
+  (lambda (dict)
+    `((MOV W (R ,ecx) ,(dict 'ea/value))
+      (ADD W (R ,esp) (& 4))
+      (JMP (@RO B 6 ,(dict 'hook-offset))))))
+
+
+(define-lapopt 'STACKWRITE-POP-JUMP->RETARGETTED-LOAD-JUMP
+  `((MOV W (@RO B ,esp (? stack-offset)) (? ea/value))
+    (ADD W (R ,esp) (& (? stack-offset)))
+    (POP (R ,ecx))
+    (JMP (@RO B 6 (? hook-offset))))
+  #F
+  (lambda (dict)
+    `((MOV W (R ,ecx) ,(dict 'ea/value))
+      (ADD W (R ,esp) (& ,(+ 4 (dict 'stack-offset))))
+      (JMP (@RO B 6 ,(dict 'hook-offset))))))
+
+
+
+;; The following rules recognize arithmetic followed by tag injection,
+;; and fold the tag-injection into the arithmetic.  We can do this
+;; because we know the bottom six bits of the fixnum are all 0.  This
+;; is particularly crafty in the generic arithmetic case, as it does
+;; not mess up the overflow detection.
+;;
+;; These patterns match the code generated by subtractions too.
+
+(define fixnum-tag (object-type 1))
+
+(define-lapopt 'FIXNUM-ADD-CONST-TAG
+  `((ADD W (R (? reg)) (& (? const)))
+    (OR W (R (? reg)) (& ,fixnum-tag))
+    (ROR W (R (? reg)) (& 6)))
+  #F
+  (lambda (dict)
+    `((ADD W (R ,(dict 'reg)) (& ,(+ (dict 'const) fixnum-tag)))
+      (ROR W (R ,(dict 'reg)) (& 6)))))
+
+(define-lapopt 'FIXNUM-ADD-REG-TAG
+  `((ADD W (R (? reg)) (R (? reg-2)))
+    (OR W (R (? reg)) (& ,fixnum-tag))
+    (ROR W (R (? reg)) (& 6)))
+  #F
+  (lambda (dict)
+    `((LEA (R ,(dict 'reg)) (@ROI B ,(dict 'reg) ,fixnum-tag ,(dict 'reg-2) 1))
+      (ROR W (R ,(dict 'reg)) (& 6)))))
+
+(define-lapopt 'GENERIC-ADD-TAG
+  `((ADD W (R (? reg)) (& (? const)))
+    (JO (@PCR (? label)))
+    (OR W (R (? reg)) (& ,fixnum-tag))
+    (ROR W (R (? reg)) (& 6)))
+  #F
+  (lambda (dict)
+    `((ADD W (R ,(dict 'reg)) (& ,(+ (dict 'const) fixnum-tag)))
+      (JO (@PCR ,(dict 'label)))
+      (ROR W (R ,(dict 'reg)) (& 6)))))
+
+;; If the fixnum tag is even, the zero LSB works as a place to hold
+;; the overflow from addition which can be discarded by masking it
+;; out.  We must arrange that the constant is positive, so we don't
+;; borrow from the tag bits.
+
+(if (even? fixnum-tag)
+    (define-lapopt 'FIXNUM-ADD-CONST-IN-PLACE
+      `((SAL W (? reg) (& ,scheme-type-width))
+	(ADD W (? reg) (& (? const)))
+	(OR W (? reg)  (& ,fixnum-tag))
+	(ROR W (? reg) (& ,scheme-type-width)))
+      #F
+      (lambda (dict)
+	(let ((const (sar-32 (dict 'const) scheme-type-width))
+	      (mask  (make-non-pointer-literal
+		      fixnum-tag
+		      (-1+ (expt 2 scheme-datum-width)))))
+	  (let ((const
+		 (if (negative? const)
+		     (+ const (expt 2 scheme-datum-width))
+		     const)))
+	    `(,(if (= const 1)
+		   `(INC W ,(dict 'reg)) ; shorter instruction
+		   `(ADD W ,(dict 'reg) (& ,const)))
+	      (AND W ,(dict 'reg) (& ,mask))))))))
+
+;; Similar tag-injection combining rule for fix:or is a little more
+;; general.
+
+(define (or-32-signed x y)
+  (bit-string->signed-integer
+   (bit-string-or (signed-integer->bit-string 32 x)
+		  (signed-integer->bit-string 32 y))))
+
+(define (ror-32-signed w count)
+  (let ((bs (signed-integer->bit-string 32 w)))
+    (bit-string->signed-integer
+     (bit-string-append (bit-substring bs count 32)
+			(bit-substring bs 0 count)))))
+
+(define (sar-32 w count)
+  (let ((bs (signed-integer->bit-string 32 w)))
+    (bit-string->signed-integer (bit-substring bs count 32))))
+
+(define-lapopt 'OR-OR
+  `((OR W (R (? reg)) (& (? const-1)))
+    (OR W (R (? reg)) (& (? const-2))))
+  #F
+  (lambda (dict)
+    `((OR W (R ,(dict 'reg))
+	  (& ,(or-32-signed (dict 'const-1) (dict 'const-2)))))))
+
+;; These rules match a whole fixnum detag-AND/OR-retag operation.  In
+;; principle, these operations could be done in rulfix.scm, but the
+;; instruction combiner wants all the intermediate steps.
+
+(define-lapopt 'FIXNUM-OR-CONST-IN-PLACE
+  `((SAL W (? reg) (& ,scheme-type-width))
+    (OR W (? reg) (& (? const)))
+    (OR W (? reg) (& ,fixnum-tag))
+    (ROR W (? reg) (& ,scheme-type-width)))
+  #F
+  (lambda (dict)
+    `((OR W ,(dict 'reg)
+	  (& ,(careful-object-datum
+	       (sar-32 (dict 'const) scheme-type-width)))))))
+
+(define-lapopt 'FIXNUM-AND-CONST-IN-PLACE
+  `((SAL W (? reg) (& ,scheme-type-width))
+    (AND W (? reg) (& (? const)))
+    (OR W (? reg) (& ,fixnum-tag))
+    (ROR W (? reg) (& ,scheme-type-width)))
+  #F
+  (lambda (dict)
+    `((AND W ,(dict 'reg)
+	   (& ,(make-non-pointer-literal
+		fixnum-tag
+		(careful-object-datum
+		 (sar-32 (dict 'const) scheme-type-width))))))))
+
+;; FIXNUM-NOT.  The first (partial) pattern uses the XOR operation to
+;; put the tag bits in the low part of the result.  This pattern
+;; occurs in the hash table hash functions, where the OBJECT->FIXNUM
+;; has been shared by CSE.
+
+(define-lapopt 'FIXNUM-NOT-TAG
+  `((NOT W (? reg))
+    (AND W (? reg) (& #x-40))
+    (OR W (? reg) (& ,fixnum-tag))
+    (ROR W (? reg) (& ,scheme-type-width)))
+  #F
+  (lambda (dict)
+    (let ((magic-bits (+ (* -1 (expt 2 scheme-type-width)) fixnum-tag)))
+      `((XOR W ,(dict 'reg) (& ,magic-bits))
+	(ROR W ,(dict 'reg) (& ,scheme-type-width))))))
+
+(define-lapopt 'FIXNUM-NOT-IN-PLACE
+  `((SAL W (? reg) (& ,scheme-type-width))
+    (NOT W (? reg))
+    (AND W (? reg) (& #x-40))
+    (OR W (? reg) (& ,fixnum-tag))
+    (ROR W (? reg) (& ,scheme-type-width)))
+  #F
+  (lambda (dict)
+    `((XOR W ,(dict 'reg) (& ,(-1+ (expt 2 scheme-datum-width)))))))
+
+
+;; CLOSURES
+;;
+;; This rule recognizes code duplicated at the end of the CONS-CLOSURE
+;; and CONS-MULTICLOSURE and the following CONS-POINTER. (This happens
+;; because of the hack of storing the entry point as a tagged object
+;; in the closure to allow GC to work correctly with relative jumps in
+;; the closure code.  A better fix would be to alter the GC to make
+;; absolute the addresses during closure transport.)
+;;
+;; The rule relies on the fact the REG-TEMP is a temporary for the
+;; expansions of CONS-CLOSURE and CONS-MULTICLOSURE, so it is dead
+;; afterwards, and is specific in matching because it is the only code
+;; that stores an entry at a negative offset from the free pointer.
+
+(define-lapopt 'CONS-CLOSURE-FIXUP
+  `((LEA (? reg-temp) (@RO UW (? regno-closure) #xA0000000))
+    (MOV W (@RO B ,regnum:free-pointer -4) (? regno-temp))
+    (LEA (? reg-object) (@RO UW (? regno-closure) #xA0000000)))
+  #F
+  (lambda (dict)
+    `((LEA ,(dict 'reg-object) (@RO UW ,(dict 'regno-closure) #xA0000000))
+      (MOV W (@RO B ,regnum:free-pointer -4) ,(dict 'reg-object)))))
-- 
2.25.1