Initial revision

author Chris Hanson <org/chris-hanson/cph>

Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)

committer Chris Hanson <org/chris-hanson/cph>

Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)
author Chris Hanson <org/chris-hanson/cph>
Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)
committer Chris Hanson <org/chris-hanson/cph>
Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)
diff --git a/v7/src/microcode/regex.c b/v7/src/microcode/regex.c

new file mode 100644 (file)

index 0000000..adba8fc
--- /dev/null
+++ b/v7/src/microcode/regex.c
@@ -0,0 +1,1137 @@
+/* -*-C-*-
+
+$Header: /Users/cph/tmp/foo/mit-scheme/mit-scheme/v7/src/microcode/regex.c,v 1.1 1987/07/14 03:00:59 cph Exp $
+
+Copyright (c) 1987 Massachusetts Institute of Technology
+
+This material was developed by the Scheme project at the Massachusetts
+Institute of Technology, Department of Electrical Engineering and
+Computer Science.  Permission to copy this software, to redistribute
+it, and to use it for any purpose is granted, subject to the following
+restrictions and understandings.
+
+1. Any copy made of this software must include this copyright notice
+in full.
+
+2. Users of this software agree to make their best efforts (a) to
+return to the MIT Scheme project any improvements or extensions that
+they make, so that these may be included in future releases; and (b)
+to inform MIT of noteworthy uses of this software.
+
+3. All materials developed as a consequence of the use of this
+software shall duly acknowledge such use, in accordance with the usual
+standards of acknowledging credit in academic research.
+
+4. MIT has made no warrantee or representation that the operation of
+this software will be error-free, and MIT is under no obligation to
+provide any services, by way of maintenance, update, or otherwise.
+
+5. In conjunction with products arising from the use of this material,
+there shall be no use of the name of the Massachusetts Institute of
+Technology nor of any adaptation thereof in any advertising,
+promotional, or sales literature without prior written consent from
+MIT in each case. */
+
+/* Regular expression matching and search.
+   Translated from GNU Emacs. */
+
+/* This code is not yet tested. -- CPH */
+
+#include "scheme.h"
+#include "character.h"
+#include "syntax.h"
+#include "regex.h"
+\f
+#ifndef SIGN_EXTEND_CHAR
+#define SIGN_EXTEND_CHAR(x) (x)
+#endif /* not SIGN_EXTEND_CHAR */
+
+#ifndef SWITCH_ENUM
+#define SWITCH_ENUM(enum_type, expression)                             \
+  switch ((enum enum_type) (expression))
+#endif /* not SWITCH_ENUM */
+
+#define RE_NFAILURES 80
+
+#define FOR_INDEX_RANGE(index, start, end)                             \
+  for (index = (start); (index < (end)); index += 1)
+
+#define FOR_INDEX_BELOW(index, limit)                                  \
+  FOR_INDEX_RANGE (index, 0, (limit))
+
+#define FOR_ALL_ASCII(index)                                           \
+  FOR_INDEX_BELOW (index, MAX_ASCII)
+
+#define FOR_ALL_ASCII_SUCH_THAT(index, expression)                     \
+  FOR_ALL_ASCII (index)                                                        \
+    if (expression)
+
+#define TRANSLATE_CHAR(ascii)                                          \
+  ((translation == NULL) ? (ascii) : (translation [(ascii)]))
+
+#define WORD_CONSTITUENT_P(ascii)                                      \
+  (SYNTAX_CONSTITUENT_P (syntaxcode_word, (ascii)))
+
+#define SYNTAX_CONSTITUENT_P(code, ascii)                              \
+  ((SYNTAX_ENTRY_CODE (SYNTAX_TABLE_REF (syntax_table, (ascii)))) == (code))
+
+#define CHAR_SET_MEMBER_P(length, char_set, ascii)                     \
+  (((ascii) < ((length) * ASCII_LENGTH)) &&                            \
+   (CHAR_SET_MEMBER_P_INTERNAL (char_set, ascii)))
+
+#define CHAR_SET_MEMBER_P_INTERNAL(char_set, ascii)                    \
+  ((((char_set) [((ascii) / ASCII_LENGTH)]) &                          \
+    (1 << ((ascii) % ASCII_LENGTH)))                                   \
+   != 0)
+\f
+#define READ_PATTERN_CHAR(target) do                                   \
+{                                                                      \
+  if (pattern_pc >= pattern_end)                                       \
+    BAD_PATTERN ();                                                    \
+  (target) = (*pattern_pc++);                                          \
+} while (0)
+
+#define READ_PATTERN_OFFSET(target) do                                 \
+{                                                                      \
+  if ((pattern_pc + 1) >= pattern_end)                                 \
+    BAD_PATTERN ();                                                    \
+  (target) = (*pattern_pc++);                                          \
+  (target) +=                                                          \
+    ((SIGN_EXTEND_CHAR (* ((char *) (pattern_pc++)))) << ASCII_LENGTH);        \
+  if (((pattern_pc + (target)) < pattern_start) ||                     \
+      ((pattern_pc + (target)) > pattern_end))                         \
+    BAD_PATTERN ();                                                    \
+} while (0)
+
+#define READ_PATTERN_LENGTH(target) do                                 \
+{                                                                      \
+  if ((pattern_pc >= pattern_end) ||                                   \
+      ((pattern_pc + ((target) = (*pattern_pc++))) > pattern_end))     \
+    BAD_PATTERN ();                                                    \
+} while (0)
+
+#define READ_PATTERN_REGISTER(target) do                               \
+{                                                                      \
+  if ((pattern_pc >= pattern_end) ||                                   \
+      (((target) = (*pattern_pc++)) >= RE_NREGS))                      \
+    BAD_PATTERN ();                                                    \
+} while (0)
+
+#define READ_PATTERN_SYNTAXCODE(target) do                             \
+{                                                                      \
+  if ((pattern_pc >= pattern_end) ||                                   \
+      (((int) ((target) = ((enum syntaxcode) (*pattern_pc++))))                \
+       >= ((int) syntaxcode_max)))                                     \
+    BAD_PATTERN ();                                                    \
+} while (0)
+
+#define BAD_PATTERN() RE_RETURN (-2)
+\f
+#define PUSH_FAILURE_POINT(pattern_pc, match_pc) do                    \
+{                                                                      \
+  if (stack_pointer == stack_end)                                      \
+    {                                                                  \
+      long stack_length;                                               \
+      unsigned char **stack_temporary;                                 \
+                                                                       \
+      stack_length = ((stack_end - stack_start) * 2);                  \
+      if (stack_length > (re_max_failures * 2))                                \
+       RE_RETURN (-4);                                                 \
+      stack_temporary =                                                        \
+       ((unsigned char **)                                             \
+        (realloc                                                       \
+         (stack_start, (stack_length * (sizeof (unsigned char *)))))); \
+      if (stack_temporary == NULL)                                     \
+       RE_RETURN (-3);                                                 \
+      stack_end = (& (stack_temporary [stack_length]));                        \
+      stack_pointer =                                                  \
+       (& (stack_temporary [(stack_pointer - stack_start)]));          \
+      stack_start = stack_temporary;                                   \
+    }                                                                  \
+  (*stack_pointer++) = (pattern_pc);                                   \
+  (*stack_pointer++) = (match_pc);                                     \
+} while (0)
+
+#define RE_RETURN(value)                                               \
+{                                                                      \
+  return_value = (value);                                              \
+  goto return_point;                                                   \
+}
+\f
+void
+re_buffer_initialize (buffer, translation, syntax_table, text,
+                     text_start_index, text_end_index,
+                     gap_start_index, gap_end_index)
+     struct re_buffer *buffer;
+     unsigned char *translation;
+     SYNTAX_TABLE_TYPE syntax_table;
+     unsigned char *text;
+     unsigned long text_start_index, text_end_index,
+       gap_start_index, gap_end_index;
+{
+  unsigned char *text_start, *text_end, *gap_start, *gap_end;
+
+  /* Assumes that
+     ((text_start_index <= gap_start_index) &&
+      (gap_start_index <= gap_end_index) &&
+      (gap_end_index <= text_end_index)) */
+
+  text_start = (text + text_start_index);
+  text_end = (text + text_end_index);
+  gap_start = (text + gap_start_index);
+  gap_end = (text + gap_end_index);
+
+  (buffer -> translation) = translation;
+  (buffer -> syntax_table) = syntax_table;
+  (buffer -> text) = text;
+  (buffer -> text_start) = ((text_start == gap_start) ? gap_end : text_start);
+  (buffer -> text_end) = ((text_end == gap_end) ? gap_start : text_end);
+  (buffer -> gap_start) = gap_start;
+  (buffer -> gap_end) = gap_end;
+  return;
+}
+\f
+/* Given a compiled pattern between `pattern_start' and `pattern_end',
+   generate a character set which is true of all characters which can
+   be the first character of a match.
+
+   See the documentation of `struct re_buffer' for a description of
+   `translation' and `syntax_table'.
+
+   `fastmap' is the resulting character set.  It is a character array
+   whose elements are either `FASTMAP_FALSE' or `FASTMAP_TRUE'.
+
+   Return values:
+   0 => pattern cannot match the null string.
+   1 => pattern can match the null string.
+   2 => pattern can match the null string, but only at end of match
+     text or to left of a character in `fastmap'.
+   -2 => the pattern is improperly formed.
+   else => undefined. */
+
+#define FASTMAP_FALSE '\0'
+#define FASTMAP_TRUE '\1'
+
+int
+re_compile_fastmap (pattern_start, pattern_end, translation, syntax_table,
+                   fastmap)
+     unsigned char *pattern_start;
+     fast unsigned char *pattern_end;
+     unsigned char *translation;
+     SYNTAX_TABLE_TYPE syntax_table;
+     fast unsigned char *fastmap;
+{
+  fast unsigned char *pattern_pc;
+  unsigned char *stack_start[RE_NFAILURES];
+  unsigned char **stack_pointer;
+  int return_value;
+
+  pattern_pc = pattern_start;
+  return_value = 0;
+  stack_pointer = stack_start;
+
+  {
+    fast int i;
+
+    FOR_ALL_ASCII (i)
+      (fastmap [i]) = FASTMAP_FALSE;
+  }
+\f
+ loop:
+  if (pattern_pc >= pattern_end)
+    RE_RETURN (1);
+
+  SWITCH_ENUM (regexpcode, (*pattern_pc++))
+    {
+    case regexpcode_unused:
+    case regexpcode_line_start:
+    case regexpcode_buffer_start:
+    case regexpcode_buffer_end:
+    case regexpcode_word_start:
+    case regexpcode_word_end:
+    case regexpcode_word_bound:
+    case regexpcode_not_word_bound:
+      goto loop;
+
+    case regexpcode_line_end:
+      {
+       (fastmap [(TRANSLATE_CHAR ('\n'))]) = FASTMAP_TRUE;
+       if (return_value == 0)
+         return_value = 2;
+       goto next;
+      }
+
+    case regexpcode_exact_1:
+      {
+       fast int ascii;
+
+       READ_PATTERN_CHAR (ascii);
+       (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_exact_n:
+      {
+       fast int length;
+
+       READ_PATTERN_LENGTH (length);
+       if (length == 0)
+         goto loop;
+       (fastmap [(TRANSLATE_CHAR (pattern_pc [1]))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_any_char:
+      {
+       fast int ascii;
+
+       FOR_ALL_ASCII_SUCH_THAT (ascii, (ascii != '\n'))
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       if (return_value != 0)
+         goto return_point;
+      }
+\f
+    case regexpcode_char_set:
+      {
+       fast int length;
+       fast int ascii;
+
+       READ_PATTERN_LENGTH (length);
+       length = (length * ASCII_LENGTH);
+       FOR_INDEX_BELOW (ascii, length)
+         if (CHAR_SET_MEMBER_P_INTERNAL (pattern_pc, ascii))
+           (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_not_char_set:
+      {
+       fast int length;
+       fast int ascii;
+
+       READ_PATTERN_LENGTH (length);
+       length = (length * ASCII_LENGTH);
+       FOR_INDEX_BELOW (ascii, length)
+         if (! (CHAR_SET_MEMBER_P_INTERNAL (pattern_pc, ascii)))
+           (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       FOR_INDEX_RANGE (ascii, length, MAX_ASCII)
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+\f
+    case regexpcode_word_char:
+      {
+       fast int ascii;
+
+       FOR_ALL_ASCII_SUCH_THAT (ascii, (WORD_CONSTITUENT_P (ascii)))
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_not_word_char:
+      {
+       fast int ascii;
+
+       FOR_ALL_ASCII_SUCH_THAT (ascii, (! (WORD_CONSTITUENT_P (ascii))))
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_syntax_spec:
+      {
+       fast enum syntaxcode code;
+       fast int ascii;
+
+       READ_PATTERN_SYNTAXCODE (code);
+       FOR_ALL_ASCII_SUCH_THAT (ascii, (SYNTAX_CONSTITUENT_P (code, ascii)))
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+
+    case regexpcode_not_syntax_spec:
+      {
+       fast enum syntaxcode code;
+       fast int ascii;
+
+       READ_PATTERN_SYNTAXCODE (code);
+       FOR_ALL_ASCII_SUCH_THAT (ascii,
+                                (! (SYNTAX_CONSTITUENT_P (code, ascii))))
+         (fastmap [(TRANSLATE_CHAR (ascii))]) = FASTMAP_TRUE;
+       goto next;
+      }
+\f
+    case regexpcode_start_memory:
+    case regexpcode_stop_memory:
+      {
+       fast int register_number;
+
+       READ_PATTERN_REGISTER (register_number);
+       goto loop;
+      }
+
+    case regexpcode_duplicate:
+      {
+       fast int register_number;
+       fast int ascii;
+
+       READ_PATTERN_REGISTER (register_number);
+       FOR_ALL_ASCII (ascii)
+         (fastmap [ascii]) = FASTMAP_TRUE;
+       RE_RETURN (1);
+      }
+\f
+    case regexpcode_jump:
+    case regexpcode_finalize_jump:
+    case regexpcode_maybe_finalize_jump:
+    case regexpcode_dummy_failure_jump:
+      {
+       fast int offset;
+
+       return_value = 1;
+       READ_PATTERN_OFFSET (offset);
+       pattern_pc += offset;
+       if (offset > 0)
+         goto loop;
+
+       /* Jump backward reached implies we just went through the
+          body of a loop and matched nothing.  Opcode jumped to
+          should be an on_failure_jump.  Just treat it like an
+          ordinary jump.  For a * loop, it has pushed its failure
+          point already; if so, discard that as redundant. */
+       if (pattern_pc >= pattern_end)
+         BAD_PATTERN ();
+       if (((enum regexpcode) (pattern_pc [0])) !=
+           regexpcode_on_failure_jump)
+         goto loop;
+       READ_PATTERN_OFFSET (offset);
+       pattern_pc += offset;
+       if ((stack_pointer != stack_start) &&
+           ((stack_pointer [-1]) == pattern_pc))
+         stack_pointer -= 1;
+       goto loop;
+      }
+
+    case regexpcode_on_failure_jump:
+      {
+       fast int offset;
+
+       READ_PATTERN_OFFSET (offset);
+       (*stack_pointer++) = (pattern_pc + offset);
+       goto loop;
+      }
+
+    default:
+      BAD_PATTERN ();
+    }
+
+ next:
+  if (stack_pointer != stack_start)
+    {
+      pattern_pc = (*--stack_pointer);
+      goto loop;
+    }
+
+ return_point:
+  return (return_value);
+}
+\f
+/* Match the compiled pattern described by `pattern_start' and
+   `pattern_end' against the characters in `buffer' between
+   `match_start' and `match_end'.
+
+   `registers', if not NULL, will be filled with the start and end
+   indices of the match registers if the match succeeds.
+
+   It is assumed that the following is true:
+
+   (! ((gap_start < gap_end) &&
+       (match_start < match_end) &&
+       ((match_start == gap_start) || (match_end == gap_end))))
+
+   Return values:
+
+   non-negative => the end index (exclusive) of the match.
+   -1 => no match.
+   -2 => the pattern is badly formed.
+   -3 => memory allocation error.
+   -4 => match stack overflow.
+   other => undefined. */
+
+#define RE_MATCH_FAILED (-1)
+
+#define ADDRESS_TO_INDEX(address)                                      \
+  ((((address) > gap_start) ? ((address) - gap_length) : (address))    \
+   - (buffer -> text))
+
+#define READ_MATCH_CHAR(target) do                                     \
+{                                                                      \
+  if (match_pc >= match_end)                                           \
+    goto re_match_fail;                                                        \
+  (target) = (TRANSLATE_CHAR (*match_pc++));                           \
+  if (match_pc == gap_start)                                           \
+    match_pc = gap_end;                                                        \
+} while (0)
+
+static Boolean
+beq_translate (scan1, scan2, length, translation)
+     fast unsigned char *scan1, *scan2;
+     fast long length;
+     fast unsigned char *translation;
+{
+  while ((length--) > 0)
+    if ((TRANSLATE_CHAR (*scan1++)) != (TRANSLATE_CHAR (*scan2++)))
+      return (false);
+  return (true);
+}
+
+int re_max_failures = 1000;
+\f
+int
+re_match (pattern_start, pattern_end, buffer, registers,
+         match_start, match_end)
+     unsigned char *pattern_start, *pattern_end;
+     struct re_buffer *buffer;
+     struct re_registers *registers;
+     unsigned char *match_start, *match_end;
+{
+  fast unsigned char *pattern_pc, *match_pc;
+  unsigned char *gap_start, *gap_end;
+  unsigned char *translation;
+  SYNTAX_TABLE_TYPE syntax_table;
+  long gap_length;
+  int return_value;
+
+  /* Failure point stack.  Each place that can handle a failure
+     further down the line pushes a failure point on this stack.  It
+     consists of two char *'s.  The first one pushed is where to
+     resume scanning the pattern; the second pushed is where to resume
+     scanning the match text.  If the latter is NULL, the failure
+     point is a "dummy".  If a failure happens and the innermost
+     failure point is dormant, it discards that failure point and
+     tries the next one. */
+
+  unsigned char **stack_start, **stack_end, **stack_pointer;
+
+  /* Information on the "contents" of registers.  These are pointers
+     into the match text; they record just what was matched (on this
+     attempt) by some part of the pattern.  The start_memory command
+     stores the start of a register's contents and the stop_memory
+     command stores the end.
+
+     At that point, (register_start [regnum]) points to the first
+     character in the register, and (register_end [regnum]) points to
+     the first character beyond the end of the register. */
+
+  unsigned char *register_start[RE_NREGS];
+  unsigned char *register_end[RE_NREGS];
+\f
+  pattern_pc = pattern_start;
+  match_pc = match_start;
+  gap_start = (buffer -> gap_start);
+  gap_end = (buffer -> gap_end);
+  gap_length = (gap_end - gap_start);
+  translation = (buffer -> translation);
+  syntax_table = (buffer -> syntax_table);
+
+  stack_start =
+    ((unsigned char **) (malloc ((2 * RE_NFAILURES) * (sizeof (char *)))));
+  if (stack_start == NULL)
+    RE_RETURN (-3);
+
+  stack_end = (& (stack_start [(2 * RE_NFAILURES)]));
+  stack_pointer = stack_start;
+
+  {
+    fast int i;
+
+    FOR_INDEX_BELOW (i, RE_NREGS)
+      {
+       (register_start [i]) = NULL;
+       (register_end [i]) = NULL;
+      }
+  }
+
+ re_match_loop:
+  if (pattern_pc >= pattern_end)
+    {
+      /* Reaching here indicates that match was successful. */
+      if (registers != NULL)
+       {
+         fast int i;
+
+         (register_start [0]) = match_start;
+         (register_end [0]) = match_pc;
+         FOR_INDEX_BELOW (i, RE_NREGS)
+           {
+             ((registers -> start) [i]) =
+               (((register_start [i]) == NULL)
+                ? -1
+                : (ADDRESS_TO_INDEX (register_start [i])));
+             ((registers -> end) [i]) =
+               (((register_end [i]) == NULL)
+                ? -1
+                : (ADDRESS_TO_INDEX (register_end [i])));
+           }
+       }
+      RE_RETURN (ADDRESS_TO_INDEX (match_pc));
+    }
+\f
+  SWITCH_ENUM (regexpcode, (*pattern_pc++))
+    {
+    case regexpcode_unused:
+      goto re_match_loop;
+
+    case regexpcode_exact_1:
+      {
+       fast int ascii;
+       fast int ascii_p;
+
+       READ_MATCH_CHAR (ascii);
+       READ_PATTERN_CHAR (ascii_p);
+       if (ascii == ascii_p)
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_exact_n:
+      {
+       fast int length;
+       fast int ascii;
+
+       READ_PATTERN_LENGTH (length);
+       while ((length--) > 0)
+         {
+           READ_MATCH_CHAR (ascii);
+           if (ascii != (*pattern_pc++))
+             goto re_match_fail;
+         }
+       goto re_match_loop;
+      }
+
+    case regexpcode_any_char:
+      {
+       fast int ascii;
+
+       READ_MATCH_CHAR (ascii);
+       if (ascii == '\n')
+         goto re_match_fail;
+       goto re_match_loop;
+      }
+\f
+#define RE_MATCH_CHAR_SET(winning_label, losing_label)                 \
+      {                                                                        \
+       fast int ascii;                                                 \
+       fast int length;                                                \
+                                                                       \
+       READ_MATCH_CHAR (ascii);                                        \
+       READ_PATTERN_LENGTH (length);                                   \
+       if (CHAR_SET_MEMBER_P (length, pattern_pc, ascii))              \
+         {                                                             \
+           pattern_pc += length;                                       \
+           goto winning_label;                                         \
+         }                                                             \
+       else                                                            \
+         {                                                             \
+           pattern_pc += length;                                       \
+           goto losing_label;                                          \
+         }                                                             \
+      }
+
+    case regexpcode_char_set:
+      RE_MATCH_CHAR_SET (re_match_loop, re_match_fail);
+
+    case regexpcode_not_char_set:
+      RE_MATCH_CHAR_SET (re_match_fail, re_match_loop);
+
+#undef RE_MATCH_CHAR_SET
+
+    /* \( is represented by a start_memory, \) by a stop_memory.  Both
+       of those commands contain a "register number" argument.  The
+       text matched within the \( and \) is recorded under that
+       number.  Then, \<digit> turns into a `duplicate' command which
+       is followed by the numeric value of <digit> as the register
+       number. */
+
+    case regexpcode_start_memory:
+      {
+       fast int register_number;
+
+       READ_PATTERN_REGISTER (register_number);
+       (register_start [register_number]) = match_pc;
+       goto re_match_loop;
+      }
+
+    case regexpcode_stop_memory:
+      {
+       fast int register_number;
+
+       READ_PATTERN_REGISTER (register_number);
+       (register_end [register_number]) =
+         ((match_pc == gap_end) ? gap_start : match_pc);
+       goto re_match_loop;
+      }
+\f
+    case regexpcode_duplicate:
+      {
+       fast int register_number;
+       unsigned char *start, *end, *new_end;
+       long length;
+
+       READ_PATTERN_REGISTER (register_number);
+       start = (register_start [register_number]);
+       end = (register_end [register_number]);
+       length = (end - start);
+       if (length <= 0)
+         goto re_match_loop;
+       new_end = (match_pc + length);
+       if (new_end > match_end)
+         goto re_match_fail;
+       if ((match_pc <= gap_start) && (new_end > gap_start))
+         {
+           long length1, length2;
+
+           new_end += gap_length;
+           if (new_end > match_end)
+             goto re_match_fail;
+           length1 = (gap_start - match_pc);
+           length2 = (length - length1);
+           if (!
+               ((beq_translate (match_pc, start, length1, translation)) &&
+                (beq_translate (gap_end, (start + length1), length2,
+                                translation))))
+             goto re_match_fail;
+         }
+       else if ((start <= gap_start) && (end > gap_start))
+         {
+           long length1, length2;
+
+           length1 = (gap_start - start);
+           length2 = (end - gap_end);
+           if (!
+               ((beq_translate (match_pc, start, length1, translation)) &&
+                (beq_translate ((match_pc + length1), gap_end, length2,
+                                translation))))
+             goto re_match_fail;
+         }
+       else
+         {
+           if (! (beq_translate (match_pc, start, length, translation)))
+             goto re_match_fail;
+         }
+       match_pc = ((new_end == gap_start) ? gap_end : new_end);
+       goto re_match_loop;
+      }
+\f
+    case regexpcode_buffer_start:
+      {
+       if (match_pc == (buffer -> text_start))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_buffer_end:
+      {
+       if (match_pc == (buffer -> text_end))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_line_start:
+      {
+       if (match_pc == (buffer -> text_start))
+         goto re_match_loop;
+       if ((TRANSLATE_CHAR
+            (((match_pc == gap_end) ? gap_start : match_pc) [-1]))
+           == '\n')
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_line_end:
+      {
+       if ((match_pc == (buffer -> text_end)) ||
+           ((TRANSLATE_CHAR (match_pc [0])) == '\n'))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+\f
+#define RE_MATCH_WORD_BOUND(word_bound_p)                              \
+      {                                                                        \
+       if ((match_pc == gap_end)                                       \
+           ? (word_bound_p                                             \
+              (((gap_start != (buffer -> text_start)) &&               \
+                (WORD_CONSTITUENT_P (TRANSLATE_CHAR (gap_start [-1])))), \
+               ((gap_end != (buffer -> text_end)) &&                   \
+                (WORD_CONSTITUENT_P (TRANSLATE_CHAR (gap_end [0])))))) \
+           : (word_bound_p                                             \
+              (((match_pc != (buffer -> text_start)) &&                \
+                (WORD_CONSTITUENT_P (TRANSLATE_CHAR (match_pc [-1])))), \
+               ((match_pc != (buffer -> text_end)) &&                  \
+                (WORD_CONSTITUENT_P (TRANSLATE_CHAR (match_pc [0]))))))) \
+         goto re_match_loop;                                           \
+       goto re_match_fail;                                             \
+      }
+
+    case regexpcode_word_bound:
+#define WORD_BOUND_P(left_p, right_p) ((left_p) != (right_p))
+      RE_MATCH_WORD_BOUND (WORD_BOUND_P);
+#undef WORD_BOUND_P
+
+    case regexpcode_not_word_bound:
+#define NOT_WORD_BOUND_P(left_p, right_p) ((left_p) == (right_p))
+      RE_MATCH_WORD_BOUND (NOT_WORD_BOUND_P);
+#undef NOT_WORD_BOUND_P
+
+    case regexpcode_word_start:
+#define WORD_START_P(left_p, right_p) ((! (left_p)) && (right_p))
+      RE_MATCH_WORD_BOUND (WORD_START_P);
+#undef WORD_START_P
+
+    case regexpcode_word_end:
+#define WORD_END_P(left_p, right_p) ((left_p) && (! (right_p)))
+      RE_MATCH_WORD_BOUND (WORD_END_P);
+#undef WORD_END_P
+
+#undef RE_MATCH_WORD_BOUND
+\f
+    case regexpcode_syntax_spec:
+      {
+       fast int ascii;
+       fast enum syntaxcode code;
+
+       READ_MATCH_CHAR (ascii);
+       READ_PATTERN_SYNTAXCODE (code);
+       if (SYNTAX_CONSTITUENT_P (code, ascii))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_not_syntax_spec:
+      {
+       fast int ascii;
+       fast enum syntaxcode code;
+
+       READ_MATCH_CHAR (ascii);
+       READ_PATTERN_SYNTAXCODE (code);
+       if (! (SYNTAX_CONSTITUENT_P (code, ascii)))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_word_char:
+      {
+       fast int ascii;
+
+       READ_MATCH_CHAR (ascii);
+       if (WORD_CONSTITUENT_P (ascii))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+
+    case regexpcode_not_word_char:
+      {
+       fast int ascii;
+
+       READ_MATCH_CHAR (ascii);
+       if (! (WORD_CONSTITUENT_P (ascii)))
+         goto re_match_loop;
+       goto re_match_fail;
+      }
+\f
+    /* "or" constructs ("|") are handled by starting each alternative
+       with an on_failure_jump that points to the start of the next
+       alternative.  Each alternative except the last ends with a jump
+       to the joining point.  (Actually, each jump except for the last
+       one really jumps to the following jump, because tensioning the
+       jumps is a hassle.)
+
+       The start of a stupid repeat has an on_failure_jump that points
+       past the end of the repeat text.  This makes a failure point so
+       that, on failure to match a repetition, matching restarts past
+       as many repetitions have been found with no way to fail and
+       look for another one.
+
+       A smart repeat is similar but loops back to the on_failure_jump
+       so that each repetition makes another failure point. */
+
+    case regexpcode_on_failure_jump:
+      {
+       fast long offset;
+
+       READ_PATTERN_OFFSET (offset);
+       PUSH_FAILURE_POINT ((pattern_pc + offset), match_pc);
+       goto re_match_loop;
+      }
+\f
+    /* The end of a smart repeat has a maybe_finalize_jump back.
+       Change it either to a finalize_jump or an ordinary jump. */
+
+    case regexpcode_maybe_finalize_jump:
+      {
+       fast long offset;
+       fast long ascii;
+
+       READ_PATTERN_OFFSET (offset);
+       if (pattern_pc == pattern_end)
+         goto maybe_finalize_jump_finalize;
+
+       /* Compare what follows with the beginning of the repeat.
+          If we can establish that there is nothing that they
+          would both match, we can change to `finalize_jump'. */
+
+       SWITCH_ENUM (regexpcode, (pattern_pc [0]))
+         {
+         case regexpcode_exact_1:
+           ascii = (pattern_pc [1]);
+           break;
+
+         case regexpcode_exact_n:
+           ascii = (pattern_pc [2]);
+           break;
+
+         case regexpcode_line_end:
+           ascii = ('\n');
+           break;
+
+         default:
+           goto maybe_finalize_jump_not_finalize;
+         }
+\f
+       /* (pattern_pc [(offset - 3)]) is an `on_failure_jump'.
+          Examine what follows that. */
+       SWITCH_ENUM (regexpcode, (pattern_pc [offset]))
+         {
+         case regexpcode_exact_1:
+           {
+             if (ascii != (pattern_pc [(offset + 1)]))
+               goto maybe_finalize_jump_finalize;
+             goto maybe_finalize_jump_not_finalize;
+           }
+
+         case regexpcode_exact_n:
+           {
+             if (ascii != (pattern_pc [(offset + 2)]))
+               goto maybe_finalize_jump_finalize;
+             goto maybe_finalize_jump_not_finalize;
+           }
+
+         case regexpcode_char_set:
+           {
+             if (CHAR_SET_MEMBER_P ((pattern_pc [(offset + 1)]),
+                                    (& (pattern_pc [(offset + 2)])),
+                                    ascii))
+               goto maybe_finalize_jump_not_finalize;
+             goto maybe_finalize_jump_finalize;
+           }
+
+         case regexpcode_not_char_set:
+           {
+             if (CHAR_SET_MEMBER_P ((pattern_pc [(offset + 1)]),
+                                    (& (pattern_pc [(offset + 2)])),
+                                    ascii))
+               goto maybe_finalize_jump_finalize;
+             goto maybe_finalize_jump_not_finalize;
+           }
+
+         default:
+           goto maybe_finalize_jump_not_finalize;
+         }
+
+      maybe_finalize_jump_finalize:
+       pattern_pc -= 2;
+       (pattern_pc [-1]) = ((unsigned char) regexpcode_finalize_jump);
+       goto re_match_finalize_jump;
+
+      maybe_finalize_jump_not_finalize:
+       pattern_pc -= 2;
+       (pattern_pc [-1]) = ((unsigned char) regexpcode_jump);
+       goto re_match_jump;
+      }
+\f
+    case regexpcode_finalize_jump:
+    re_match_finalize_jump:
+      {
+       stack_pointer -= 2;
+       goto re_match_jump;
+      }
+
+    case regexpcode_jump:
+    re_match_jump:
+      {
+       fast long offset;
+
+       READ_PATTERN_OFFSET (offset);
+       pattern_pc += offset;
+       goto re_match_loop;
+      }
+
+    case regexpcode_dummy_failure_jump:
+      {
+       PUSH_FAILURE_POINT (NULL, NULL);
+       goto re_match_jump;
+      }
+
+    default:
+      {
+       BAD_PATTERN ();
+      }
+    }
+
+ re_match_fail:
+  if (stack_pointer == stack_start)
+    RE_RETURN (RE_MATCH_FAILED);
+  match_pc = (*--stack_pointer);
+  pattern_pc = (*--stack_pointer);
+  if (pattern_pc != NULL)
+    goto re_match_loop;
+  goto re_match_fail;
+
+ return_point:
+  if (stack_start != NULL)
+    free (stack_start);
+  return (return_value);
+}
+\f
+#define DEFINE_RE_SEARCH(name)                                         \
+int name (pattern_start, pattern_end, buffer, registers,               \
+         match_start, match_end)                                       \
+     unsigned char *pattern_start, *pattern_end;                       \
+     struct re_buffer *buffer;                                         \
+     struct re_registers *registers;                                   \
+     unsigned char *match_start;                                       \
+     unsigned char *match_end;
+
+#define INITIALIZE_RE_SEARCH(pc, limit, gap_limit)                     \
+  int can_be_null;                                                     \
+  unsigned char *translation;                                          \
+  int match_result;                                                    \
+                                                                       \
+  fast unsigned char *match_pc;                                                \
+  fast unsigned char *match_limit;                                     \
+  fast unsigned char *gap_limit;                                       \
+  fast unsigned char fastmap[MAX_ASCII];                               \
+                                                                       \
+  translation = (buffer -> translation);                               \
+  can_be_null =                                                                \
+    (re_compile_fastmap                                                        \
+     (pattern_start, pattern_end, translation,                         \
+      (buffer -> syntax_table), fastmap));                             \
+  if (can_be_null < 0)                                                 \
+    return (can_be_null);                                              \
+                                                                       \
+  match_pc = (pc);                                                     \
+  match_limit = (limit);                                               \
+  gap_limit = (buffer -> gap_limit)
+
+#define RE_SEARCH_TEST(start)                                          \
+  (re_match                                                            \
+   (pattern_start, pattern_end, buffer, registers, (start), match_end))
+\f
+#define RE_SEARCH_FORWARD_FAST(limit) do                               \
+{                                                                      \
+  while (true)                                                         \
+    {                                                                  \
+      if (match_pc >= (limit))                                         \
+       break;                                                          \
+                                                                       \
+      if ((fastmap [(TRANSLATE_CHAR (*match_pc++))]) != FASTMAP_FALSE) \
+       continue;                                                       \
+                                                                       \
+      match_result = (RE_SEARCH_TEST (match_pc - 1));                  \
+      if (match_result == RE_MATCH_FAILED)                             \
+       continue;                                                       \
+                                                                       \
+      return (match_result);                                           \
+    }                                                                  \
+} while (0)
+
+DEFINE_RE_SEARCH (re_search_forward)
+{
+  INITIALIZE_RE_SEARCH (match_start, match_end, gap_start);
+
+  if (can_be_null != 1)
+    {
+      if ((match_pc < gap_start) && (gap_start < match_limit))
+       RE_SEARCH_FORWARD_FAST (gap_start);
+      if (match_pc == gap_start)
+       match_pc = (buffer -> gap_end);
+      RE_SEARCH_FORWARD_FAST (match_limit);
+      return
+       ((can_be_null == 0)
+        ? RE_MATCH_FAILED
+        : (RE_SEARCH_TEST (match_limit)));
+    }
+  else
+    {
+      while (true)
+       {
+         match_result = (RE_SEARCH_TEST (match_pc));
+         if (match_result != RE_MATCH_FAILED)
+           return (match_result);
+         match_pc += 1;
+         if (match_pc == gap_start)
+           match_pc = (buffer -> gap_end);
+         if (match_pc > match_limit)
+           return (RE_MATCH_FAILED);
+       }
+    }
+}
+\f
+#define RE_SEARCH_BACKWARD_FAST(limit) do                              \
+{                                                                      \
+  while (true)                                                         \
+    {                                                                  \
+      if (match_pc <= (limit))                                         \
+       break;                                                          \
+                                                                       \
+      if ((fastmap [(TRANSLATE_CHAR (*--match_pc))]) != FASTMAP_FALSE) \
+       continue;                                                       \
+                                                                       \
+      match_result = (RE_SEARCH_TEST (match_pc));                      \
+      if (match_result == RE_MATCH_FAILED)                             \
+       continue;                                                       \
+                                                                       \
+      RE_SEARCH_BACKWARD_RETURN (match_pc);                            \
+    }                                                                  \
+} while (0)
+
+#define RE_SEARCH_BACKWARD_RETURN(start)                               \
+  return                                                               \
+    ((match_result < 0)                                                        \
+     ? match_result                                                    \
+     : ((((start) > (buffer -> gap_start))                             \
+        ? ((start) - (gap_end - (buffer -> gap_start)))                \
+        : (start))                                                     \
+       - (buffer -> text)))
+
+DEFINE_RE_SEARCH (re_search_backward)
+{
+  INITIALIZE_RE_SEARCH (match_end, match_start, gap_end);
+
+  if (can_be_null != 1)
+    {
+      if ((match_pc > gap_end) && (gap_end > match_limit))
+       RE_SEARCH_BACKWARD_FAST (gap_end);
+      if (match_pc == gap_end)
+       match_pc = (buffer -> gap_start);
+      RE_SEARCH_BACKWARD_FAST (match_limit);
+      if (can_be_null == 0)
+       return (RE_MATCH_FAILED);
+      match_result = (RE_SEARCH_TEST (match_limit));
+      RE_SEARCH_BACKWARD_RETURN (match_limit);
+    }
+  else
+    {
+      while (true)
+       {
+         match_result = (RE_SEARCH_TEST (match_pc));
+         if (match_result != RE_MATCH_FAILED)
+           RE_SEARCH_BACKWARD_RETURN (match_pc);
+         if (match_pc == gap_end)
+           match_pc = (buffer -> gap_start);
+         match_pc -= 1;
+         if (match_pc < match_limit)
+           return (RE_MATCH_FAILED);
+       }
+    }
+}
diff --git a/v7/src/microcode/regex.h b/v7/src/microcode/regex.h

new file mode 100644 (file)

index 0000000..c42cd0b
--- /dev/null
+++ b/v7/src/microcode/regex.h
@@ -0,0 +1,199 @@
+/* -*-C-*-
+
+$Header: /Users/cph/tmp/foo/mit-scheme/mit-scheme/v7/src/microcode/regex.h,v 1.1 1987/07/14 03:00:23 cph Rel $
+
+Copyright (c) 1987 Massachusetts Institute of Technology
+
+This material was developed by the Scheme project at the Massachusetts
+Institute of Technology, Department of Electrical Engineering and
+Computer Science.  Permission to copy this software, to redistribute
+it, and to use it for any purpose is granted, subject to the following
+restrictions and understandings.
+
+1. Any copy made of this software must include this copyright notice
+in full.
+
+2. Users of this software agree to make their best efforts (a) to
+return to the MIT Scheme project any improvements or extensions that
+they make, so that these may be included in future releases; and (b)
+to inform MIT of noteworthy uses of this software.
+
+3. All materials developed as a consequence of the use of this
+software shall duly acknowledge such use, in accordance with the usual
+standards of acknowledging credit in academic research.
+
+4. MIT has made no warrantee or representation that the operation of
+this software will be error-free, and MIT is under no obligation to
+provide any services, by way of maintenance, update, or otherwise.
+
+5. In conjunction with products arising from the use of this material,
+there shall be no use of the name of the Massachusetts Institute of
+Technology nor of any adaptation thereof in any advertising,
+promotional, or sales literature without prior written consent from
+MIT in each case. */
+
+/* Translated from GNU Emacs. */
+\f
+/* Structure to represent a buffer of text to match against.
+   This contains the information that an editor buffer would have
+   to supply for the matching process to be executed.
+
+   `translation' is an array of MAX_ASCII characters which is used to
+   map each character before matching.  Both the pattern and the match
+   text are mapped.  This is normally used to implement case
+   insensitive searches.
+
+   `syntax_table' describes the syntax of the match text.  See the
+   syntax table primitives for more information.
+
+   `text' points to the beginning of the match text.  It is used only
+   for translating match text pointers into indices.
+
+   `text_start' and `text_end' delimit the match text.  They define
+   the buffer-start and buffer-end for those matching commands that
+   refer to them.  Also, all matching must take place within these
+   limits.
+
+   `gap_start' and `gap_end' delimit a gap in the match text.  Editor
+   buffers normally have such a gap.  For applications without a gap,
+   it is recommended that these be set to the same value as
+   `text_end'.
+
+   Both `text_start' and `gap_start' are inclusive indices, while
+   `text_end' and `gap_end' are exclusive.
+
+   The following conditions must be true:
+
+   (text <= text_start)
+   (text_start <= text_end)
+   (gap_start <= gap_end)
+   (! ((text_start < text_end) &&
+       (gap_start < gap_end) &&
+       ((text_start == gap_start) || (text_end == gap_end))))
+
+   */
+
+struct re_buffer
+  {
+    unsigned char *translation;
+    SYNTAX_TABLE_TYPE syntax_table;
+    unsigned char *text;
+    unsigned char *text_start;
+    unsigned char *text_end;
+    unsigned char *gap_start;
+    unsigned char *gap_end;
+  };
+\f
+/* Structure to store "register" contents data in.
+
+   Pass the address of such a structure as an argument to re_match,
+   etc., if you want this information back.
+
+   start[i] and end[i] record the string matched by \( ... \) grouping
+   i, for i from 1 to RE_NREGS - 1.
+
+   start[0] and end[0] record the entire string matched. */
+
+#define RE_NREGS 10
+
+struct re_registers
+  {
+    long start[RE_NREGS];
+    long end[RE_NREGS];
+  };
+\f
+/* These are the command codes that appear in compiled regular
+   expressions, one per byte.  Some command codes are followed by
+   argument bytes.  A command code can specify any interpretation
+   whatever for its arguments.  Zero-bytes may appear in the compiled
+   regular expression. */
+
+enum regexpcode
+  {
+    regexpcode_unused,
+    regexpcode_exact_1,                /* Followed by 1 literal byte */
+
+    /* Followed by one byte giving n, and then by n literal bytes. */
+    regexpcode_exact_n,
+
+    regexpcode_line_start,     /* Fails unless at beginning of line */
+    regexpcode_line_end,       /* Fails unless at end of line */
+
+    /* Followed by two bytes giving relative address to jump to. */
+    regexpcode_jump,
+
+    /* Followed by two bytes giving relative address of place to
+       resume at in case of failure. */
+    regexpcode_on_failure_jump,        
+
+    /* Throw away latest failure point and then jump to address. */
+    regexpcode_finalize_jump,
+
+    /* Like jump but finalize if safe to do so.  This is used to jump
+       back to the beginning of a repeat.  If the command that follows
+       this jump is clearly incompatible with the one at the beginning
+       of the repeat, such that we can be sure that there is no use
+       backtracking out of repetitions already completed, then we
+       finalize. */
+    regexpcode_maybe_finalize_jump,
+
+    /* jump, and push a dummy failure point.  This failure point will
+       be thrown away if an attempt is made to use it for a failure.
+       A + construct makes this before the first repeat. */
+    regexpcode_dummy_failure_jump,
+
+    regexpcode_any_char,       /* Matches any one character */
+
+    /* Matches any one char belonging to specified set.  First
+       following byte is # bitmap bytes.  Then come bytes for a
+       bit-map saying which chars are in.  Bits in each byte are
+       ordered low-bit-first.  A character is in the set if its bit is
+       1.  A character too large to have a bit in the map is
+       automatically not in the set. */
+    regexpcode_char_set,
+
+    /* Similar but match any character that is NOT one of those
+       specified. */
+    regexpcode_not_char_set,
+\f
+    /* Starts remembering the text that is matched and stores it in a
+       memory register.  Followed by one byte containing the register
+       number.  Register numbers must be in the range 0 through
+       (RE_NREGS - 1) inclusive.  */
+    regexpcode_start_memory,
+
+    /* Stops remembering the text that is matched and stores it in a
+       memory register.  Followed by one byte containing the register
+       number.  Register numbers must be in the range 0 through
+       (RE_NREGS - 1) inclusive.  */
+    regexpcode_stop_memory,
+
+    /* Match a duplicate of something remembered.  Followed by one
+       byte containing the index of the memory register. */
+    regexpcode_duplicate,
+
+    regexpcode_buffer_start,   /* Succeeds if at beginning of buffer */
+    regexpcode_buffer_end,     /* Succeeds if at end of buffer */
+    regexpcode_word_char,      /* Matches any word-constituent character */
+
+    /* Matches any char that is not a word-constituent. */
+    regexpcode_not_word_char,
+
+    regexpcode_word_start,     /* Succeeds if at word beginning */
+    regexpcode_word_end,       /* Succeeds if at word end */
+    regexpcode_word_bound,     /* Succeeds if at a word boundary */
+    regexpcode_not_word_bound, /* Succeeds if not at a word boundary */
+
+    /* Matches any character whose syntax is specified.  Followed by a
+       byte which contains a syntax code, Sword or such like. */
+    regexpcode_syntax_spec,
+
+    /* Matches any character whose syntax differs from the specified. */
+    regexpcode_not_syntax_spec
+  };
+
+extern void re_buffer_initialize ();
+extern int re_compile_fastmap ();
+extern int re_match ();
+extern int re_search_forward ();
+extern int re_search_backward ();
diff --git a/v7/src/microcode/rgxprim.c b/v7/src/microcode/rgxprim.c

new file mode 100644 (file)

index 0000000..86c05da
--- /dev/null
+++ b/v7/src/microcode/rgxprim.c
@@ -0,0 +1,247 @@
+/* -*-C-*-
+
+$Header: /Users/cph/tmp/foo/mit-scheme/mit-scheme/v7/src/microcode/rgxprim.c,v 1.1 1987/07/14 03:00:03 cph Exp $
+
+Copyright (c) 1987 Massachusetts Institute of Technology
+
+This material was developed by the Scheme project at the Massachusetts
+Institute of Technology, Department of Electrical Engineering and
+Computer Science.  Permission to copy this software, to redistribute
+it, and to use it for any purpose is granted, subject to the following
+restrictions and understandings.
+
+1. Any copy made of this software must include this copyright notice
+in full.
+
+2. Users of this software agree to make their best efforts (a) to
+return to the MIT Scheme project any improvements or extensions that
+they make, so that these may be included in future releases; and (b)
+to inform MIT of noteworthy uses of this software.
+
+3. All materials developed as a consequence of the use of this
+software shall duly acknowledge such use, in accordance with the usual
+standards of acknowledging credit in academic research.
+
+4. MIT has made no warrantee or representation that the operation of
+this software will be error-free, and MIT is under no obligation to
+provide any services, by way of maintenance, update, or otherwise.
+
+5. In conjunction with products arising from the use of this material,
+there shall be no use of the name of the Massachusetts Institute of
+Technology nor of any adaptation thereof in any advertising,
+promotional, or sales literature without prior written consent from
+MIT in each case. */
+
+/* Primitives for regular expression matching and search. */
+
+/* This code is not yet tested. -- CPH */
+
+#include "scheme.h"
+#include "primitive.h"
+#include "stringprim.h"
+#include "character.h"
+#include "edwin.h"
+#include "syntax.h"
+#include "regex.h"
+\f
+#define RE_CHAR_SET_P(object)                                          \
+  ((STRING_P (object)) &&                                              \
+   ((string_length (object)) == (MAX_ASCII / ASCII_LENGTH)))
+
+#define CHAR_SET_P(argument)                                           \
+  ((STRING_P (argument)) && ((string_length (argument)) == MAX_ASCII))
+
+#define CHAR_TRANSLATION_P(argument)                                   \
+  ((STRING_P (argument)) && ((string_length (argument)) == MAX_ASCII))
+
+#define RE_REGISTERS_P(object)                                         \
+  (((object) == NIL) ||                                                        \
+   ((VECTOR_P (object)) &&                                             \
+    ((Vector_Length (object)) == (RE_NREGS + RE_NREGS))))
+
+#define RE_MATCH_RESULTS(result, vector) do                            \
+{                                                                      \
+  if ((result) >= 0)                                                   \
+    {                                                                  \
+      if ((vector) != NIL)                                             \
+       {                                                               \
+         int i;                                                        \
+         long index;                                                   \
+                                                                       \
+         for (i = 0; (i < RE_NREGS); i += 1)                           \
+           {                                                           \
+             index = ((registers . start) [i]);                        \
+             User_Vector_Set                                           \
+               (vector,                                                \
+                i,                                                     \
+                ((index == -1)                                         \
+                 ? NIL                                                 \
+                 : (C_Integer_To_Scheme_Integer (index))));            \
+             index = ((registers . end) [i]);                          \
+             User_Vector_Set                                           \
+               (vector,                                                \
+                (i + RE_NREGS),                                        \
+                ((index == -1)                                         \
+                 ? NIL                                                 \
+                 : (C_Integer_To_Scheme_Integer (index))));            \
+           }                                                           \
+       }                                                               \
+      return (C_Integer_To_Scheme_Integer (result));                   \
+    }                                                                  \
+  else if ((result) == (-1))                                           \
+    return (NIL);                                                      \
+  else if ((result) == (-2))                                           \
+    error_bad_range_arg (1);                                           \
+  else                                                                 \
+    error_external_return ();                                          \
+} while (0)
+\f
+Built_In_Primitive (Prim_re_char_set_adjoin, 2, "RE-CHAR-SET-ADJOIN!", 0x190)
+{
+  int ascii;
+  Primitive_2_Args ();
+
+  CHECK_ARG (1, RE_CHAR_SET_P);
+  ascii = (arg_ascii_char (2));
+  (* (string_pointer (Arg1, (ascii / ASCII_LENGTH)))) |=
+    (1 << (ascii % ASCII_LENGTH));
+  return (NIL);
+}
+
+Built_In_Primitive (Prim_re_compile_fastmap, 4, "RE-COMPILE-FASTMAP", 0x191)
+{
+  int can_be_null;
+  Primitive_4_Args ();
+
+  CHECK_ARG (1, STRING_P);
+  CHECK_ARG (2, CHAR_TRANSLATION_P);
+  CHECK_ARG (3, SYNTAX_TABLE_P);
+  CHECK_ARG (4, CHAR_SET_P);
+
+  can_be_null =
+    (re_compile_fastmap ((string_pointer (Arg1, 0)),
+                        (string_pointer (Arg1, (string_length (Arg1)))),
+                        (string_pointer (Arg2, 0)),
+                        Arg3,
+                        (string_pointer (Arg4, 0))));
+
+  if (can_be_null >= 0)
+    return (C_Integer_To_Scheme_Integer (can_be_null));
+  else if (can_be_null == (-2))
+    error_bad_range_arg (1);
+  else
+    error_external_return ();
+}
+\f
+/* (re-match-substring regexp translation syntax-table registers
+                      string start end)
+
+   Attempt to match REGEXP against the substring [STRING, START, END].
+   Return the index of the end of the match (exclusive) if successful.
+   Otherwise return false.  REGISTERS, if not false, is set to contain
+   the appropriate indices for the match registers. */
+
+#define RE_SUBSTRING_PRIMITIVE(procedure)                              \
+{                                                                      \
+  long match_start, match_end, text_end;                               \
+  char *text;                                                          \
+  struct re_buffer buffer;                                             \
+  struct re_registers registers;                                       \
+  int result;                                                          \
+  Primitive_7_Args ();                                                 \
+                                                                       \
+  CHECK_ARG (1, STRING_P);                                             \
+  CHECK_ARG (2, CHAR_TRANSLATION_P);                                   \
+  CHECK_ARG (3, SYNTAX_TABLE_P);                                       \
+  CHECK_ARG (4, RE_REGISTERS_P);                                       \
+  CHECK_ARG (5, STRING_P);                                             \
+  match_start = (arg_nonnegative_integer (6));                         \
+  match_end = (arg_nonnegative_integer (7));                           \
+  text = (string_pointer (Arg5, 0));                                   \
+  text_end = (string_length (Arg5));                                   \
+                                                                       \
+  if (match_end > text_end) error_bad_range_arg (7);                   \
+  if (match_start > match_end) error_bad_range_arg (6);                        \
+                                                                       \
+  re_buffer_initialize                                                 \
+    ((& buffer), (string_pointer (Arg2, 0)), Arg3, text, 0, text_end,  \
+     text_end, text_end);                                              \
+                                                                       \
+  result =                                                             \
+    (procedure ((string_pointer (Arg1, 0)),                            \
+               (string_pointer (Arg1, (string_length (Arg1)))),        \
+               (& buffer),                                             \
+               ((Arg4 == NIL) ? NULL : (& registers)),                 \
+               (& (text [match_start])),                               \
+               (& (text [match_end]))));                               \
+  RE_MATCH_RESULTS (result, Arg4);                                     \
+}
+
+Built_In_Primitive (Prim_re_match_substring, 7, "RE-MATCH-SUBSTRING", 0x118)
+  RE_SUBSTRING_PRIMITIVE (re_match)
+
+Built_In_Primitive (Prim_re_search_substring_forward, 7,
+                   "RE-SEARCH-SUBSTRING-FORWARD", 0x119)
+  RE_SUBSTRING_PRIMITIVE (re_search_forward)
+
+Built_In_Primitive (Prim_re_search_substring_backward, 7,
+                   "RE-SEARCH-SUBSTRING-BACKWARD", 0x11A)
+  RE_SUBSTRING_PRIMITIVE (re_search_backward)
+\f
+#define RE_BUFFER_PRIMITIVE(procedure)                                 \
+{                                                                      \
+  long match_start, match_end, text_start, text_end, gap_start;                \
+  char *text;                                                          \
+  struct re_buffer buffer;                                             \
+  struct re_registers registers;                                       \
+  int result;                                                          \
+  Primitive_7_Args ();                                                 \
+                                                                       \
+  CHECK_ARG (1, STRING_P);                                             \
+  CHECK_ARG (2, CHAR_TRANSLATION_P);                                   \
+  CHECK_ARG (3, SYNTAX_TABLE_P);                                       \
+  CHECK_ARG (4, RE_REGISTERS_P);                                       \
+  CHECK_ARG (5, GROUP_P);                                              \
+  match_start = (arg_nonnegative_integer (6));                         \
+  match_end = (arg_nonnegative_integer (7));                           \
+                                                                       \
+  text = (string_pointer ((GROUP_TEXT (Arg5)), 0));                    \
+  text_start = (MARK_POSITION (GROUP_START_MARK (Arg5)));              \
+  text_end = (MARK_POSITION (GROUP_END_MARK (Arg5)));                  \
+  gap_start = (GROUP_GAP_START (Arg5));                                        \
+                                                                       \
+  if (match_end > gap_start)                                           \
+    {                                                                  \
+      match_end += (GROUP_GAP_LENGTH (Arg5));                          \
+      if (match_start >= gap_start)                                    \
+       match_start += (GROUP_GAP_LENGTH (Arg5));                       \
+    }                                                                  \
+                                                                       \
+  if (match_start > match_end) error_bad_range_arg (6);                        \
+  if (match_end > text_end) error_bad_range_arg (7);                   \
+  if (match_start < text_start) error_bad_range_arg (6);               \
+                                                                       \
+  re_buffer_initialize                                                 \
+    ((& buffer), (string_pointer (Arg2, 0)), Arg3, text, text_start,   \
+     text_end, gap_start, (GROUP_GAP_END (Arg5)));                     \
+                                                                       \
+  result =                                                             \
+    (procedure ((string_pointer (Arg1, 0)),                            \
+               (string_pointer (Arg1, (string_length (Arg1)))),        \
+               (& buffer),                                             \
+               ((Arg4 == NIL) ? NULL : (& registers)),                 \
+               (& (text [match_start])),                               \
+               (& (text [match_end]))));                               \
+  RE_MATCH_RESULTS (result, Arg4);                                     \
+}
+
+Built_In_Primitive (Prim_re_match_buffer, 7, "RE-MATCH-BUFFER", 0x192)
+  RE_BUFFER_PRIMITIVE (re_match)
+
+Built_In_Primitive (Prim_re_search_buffer_forward, 7,
+                   "RE-SEARCH-BUFFER-FORWARD", 0x193)
+  RE_BUFFER_PRIMITIVE (re_search_forward)
+
+Built_In_Primitive (Prim_re_search_buffer_backward, 7,
+                   "RE-SEARCH-BUFFER-BACKWARD", 0x194)
+  RE_BUFFER_PRIMITIVE (re_search_backward)
author	Chris Hanson <org/chris-hanson/cph>
	Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)
committer	Chris Hanson <org/chris-hanson/cph>
	Tue, 14 Jul 1987 03:00:59 +0000 (03:00 +0000)
v7/src/microcode/regex.c	[new file with mode: 0644]	patch \| blob
v7/src/microcode/regex.h	[new file with mode: 0644]	patch \| blob
v7/src/microcode/rgxprim.c	[new file with mode: 0644]	patch \| blob