source: project/chicken/trunk/pcre/pcre_compile.c @ 6175

Last change on this file since 6175 was 6175, checked in by Kon Lovett, 13 years ago

Changes for PCRE 7.4, use of compiled regexp in posix & utils units.

File size: 189.0 KB
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2007 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
/* Names that map generic placeholders onto identifiers used in this module.
They are defined ahead of the pcre_internal.h include below; presumably macros
declared in that header expand them - confirm against the header. */

#define NLBLOCK  cd              /* Block containing newline information */
#define PSSTART  start_pattern   /* Field containing processed string start */
#define PSEND    end_pattern     /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When DEBUG is defined, we need the pcre_printint() function, which is also
57used by pcretest. DEBUG is not defined when building a production library. */
58
59#ifdef DEBUG
60#include "pcre_printint.src"
61#endif
62
63
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a (byte b/8, bit b%8). Both arguments and the whole expansion
are parenthesized so that expression arguments such as "base + n" select the
intended byte and bit. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
/* Upper limit against which the integer holding the compiled pattern length
is checked to make sure it does not overflow. It is set slightly below INT_MAX
so that group terminating bytes can be added without a separate check each
time. */

#define OFLOW_MAX  (INT_MAX - 20)
74
75
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* Size of the stack workspace used during the first, pre-compile phase, which
works out how much memory the compiled pattern needs. The regex is partly
compiled into this space, but compiled parts are discarded as soon as possible,
so an overrun is not expected; the code nevertheless checks for one. The
original author reports 218 as the largest amount seen in use, so this value is
very generous.

The second, actual compile phase reuses the same workspace to remember forward
references to groups so that they can be filled in at the end. Each entry in
that list occupies LINK_SIZE bytes, so there is plenty of room even when
LINK_SIZE is 4. */

#define COMPILE_WORK_SIZE  (4096)
93
94
95/* Table for handling escaped characters in the range '0'-'z'. Positive returns
96are simple data values; negative values are for special things like \d and so
97on. Zero means further processing is needed (for things like \x), or the escape
98is invalid. */
99
100#ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101static const short int escapes[] = {
102     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105-ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109-ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111     0,      0, -ESC_z                                            /* x - z */
112};
113
114#else           /* This is the "abnormal" table for EBCDIC systems */
115static const short int escapes[] = {
116/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
118/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
119/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
120/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
121/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
139};
140#endif
141
142
143/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144searched linearly. Put all the names into a single string, in order to reduce
145the number of relocations when a shared library is dynamically linked. */
146
147typedef struct verbitem {
148  int   len;
149  int   op;
150} verbitem;
151
152static const char verbnames[] =
153  "ACCEPT\0"
154  "COMMIT\0"
155  "F\0"
156  "FAIL\0"
157  "PRUNE\0"
158  "SKIP\0"
159  "THEN";
160
161static verbitem verbs[] = {
162  { 6, OP_ACCEPT },
163  { 6, OP_COMMIT },
164  { 1, OP_FAIL },
165  { 4, OP_FAIL },
166  { 5, OP_PRUNE },
167  { 4, OP_SKIP  },
168  { 4, OP_THEN  }
169};
170
171static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174/* Tables of names of POSIX character classes and their lengths. The names are
175now all in a single string, to reduce the number of relocations when a shared
176library is dynamically loaded. The list of lengths is terminated by a zero
177length entry. The first three must be alpha, lower, upper, as this is assumed
178for handling case independence. */
179
180static const char posix_names[] =
181  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183  "word\0"   "xdigit";
184
185static const uschar posix_name_lengths[] = {
186  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188/* Table of class bit maps for each POSIX class. Each class is formed from a
189base map, with an optional addition or removal of another map. Then, for some
190classes, there is some additional tweaking: for [:blank:] the vertical space
191characters are removed, and for [:alpha:] and [:alnum:] the underscore
192character is removed. The triples in the table consist of the base map offset,
193second map offset or -1 if no second map, and a non-negative value for map
194addition or a negative value for map subtraction (if there are two maps). The
195absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196remove vertical space characters, 2 => remove underscore. */
197
198static const int posix_class_maps[] = {
199  cbit_word,  cbit_digit, -2,             /* alpha */
200  cbit_lower, -1,          0,             /* lower */
201  cbit_upper, -1,          0,             /* upper */
202  cbit_word,  -1,          2,             /* alnum - word without underscore */
203  cbit_print, cbit_cntrl,  0,             /* ascii */
204  cbit_space, -1,          1,             /* blank - a GNU extension */
205  cbit_cntrl, -1,          0,             /* cntrl */
206  cbit_digit, -1,          0,             /* digit */
207  cbit_graph, -1,          0,             /* graph */
208  cbit_print, -1,          0,             /* print */
209  cbit_punct, -1,          0,             /* punct */
210  cbit_space, -1,          0,             /* space */
211  cbit_word,  -1,          0,             /* word - a Perl extension */
212  cbit_xdigit,-1,          0              /* xdigit */
213};
214
215
/* STRING() turns its argument into a string literal exactly as written;
XSTRING() macro-expands its argument first and then stringifies the result. */

#define STRING(a)   #a
#define XSTRING(s)  STRING(s)
218
219/* The texts of compile-time error messages. These are "char *" because they
220are passed to the outside world. Do not ever re-use any error number, because
221they are documented. Always add a new error instead. Messages marked DEAD below
222are no longer used. This used to be a table of strings, but in order to reduce
223the number of relocations needed when a shared library is loaded dynamically,
224it is now one long string. We cannot use a table of offsets, because the
225lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226simply count through to the one we want - this isn't a performance issue
227because these strings are used only when there is a compilation error. */
228
229static const char error_texts[] =
230  "no error\0"
231  "\\ at end of pattern\0"
232  "\\c at end of pattern\0"
233  "unrecognized character follows \\\0"
234  "numbers out of order in {} quantifier\0"
235  /* 5 */
236  "number too big in {} quantifier\0"
237  "missing terminating ] for character class\0"
238  "invalid escape sequence in character class\0"
239  "range out of order in character class\0"
240  "nothing to repeat\0"
241  /* 10 */
242  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
243  "internal error: unexpected repeat\0"
244  "unrecognized character after (?\0"
245  "POSIX named classes are supported only within a class\0"
246  "missing )\0"
247  /* 15 */
248  "reference to non-existent subpattern\0"
249  "erroffset passed as NULL\0"
250  "unknown option bit(s) set\0"
251  "missing ) after comment\0"
252  "parentheses nested too deeply\0"  /** DEAD **/
253  /* 20 */
254  "regular expression is too large\0"
255  "failed to get memory\0"
256  "unmatched parentheses\0"
257  "internal error: code overflow\0"
258  "unrecognized character after (?<\0"
259  /* 25 */
260  "lookbehind assertion is not fixed length\0"
261  "malformed number or name after (?(\0"
262  "conditional group contains more than two branches\0"
263  "assertion expected after (?(\0"
264  "(?R or (?[+-]digits must be followed by )\0"
265  /* 30 */
266  "unknown POSIX class name\0"
267  "POSIX collating elements are not supported\0"
268  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269  "spare error\0"  /** DEAD **/
270  "character value in \\x{...} sequence is too large\0"
271  /* 35 */
272  "invalid condition (?(0)\0"
273  "\\C not allowed in lookbehind assertion\0"
274  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275  "number after (?C is > 255\0"
276  "closing ) for (?C expected\0"
277  /* 40 */
278  "recursive call could loop indefinitely\0"
279  "unrecognized character after (?P\0"
280  "syntax error in subpattern name (missing terminator)\0"
281  "two named subpatterns have the same name\0"
282  "invalid UTF-8 string\0"
283  /* 45 */
284  "support for \\P, \\p, and \\X has not been compiled\0"
285  "malformed \\P or \\p sequence\0"
286  "unknown property name after \\P or \\p\0"
287  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289  /* 50 */
290  "repeated subpattern is too long\0"    /** DEAD **/
291  "octal value is greater than \\377 (not in UTF-8 mode)\0"
292  "internal error: overran compiling workspace\0"
293  "internal error: previously-checked referenced subpattern not found\0"
294  "DEFINE group contains more than one branch\0"
295  /* 55 */
296  "repeating a DEFINE group is not allowed\0"
297  "inconsistent NEWLINE options\0"
298  "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300  "(*VERB) with an argument is not supported\0"
301  /* 60 */
302  "(*VERB) not recognized\0"
303  "number is too big";
304
305
/* Private table identifying decimal and hexadecimal digits while compiling
patterns. The tables in chartables depend on the locale and may mark arbitrary
characters as digits, whereas the compiling code expects to handle only 0-9,
a-z, and A-Z as digits; hence this separate table. It costs 256 bytes, but the
original author timed it as a lot faster than character value tests (at least
in some simple cases), and some applications want PCRE to compile efficiently
as well as match efficiently.

The bit definitions are the same as in chartables, so that ctype_digit and
ctype_xdigit can be used in the code:

  0x04   decimal digit
  0x08   hexadecimal digit

Each row below holds eight entries; the comment gives the character code of the
first entry in the row. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20: space to '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28: ( to /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0x30: 0 to 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38: 8 to ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x40: @ to G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48: H to O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50: P to W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58: X to _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x60: ` to g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68: h to o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70: p to w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78: x to 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xF0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xF8 */
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40: space to 71 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48: 72 to |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50: & to 87  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60: - to 103 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68: 104 to ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78: 120 to " */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x80: 128 to g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88: h to 143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90: 144 to p */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98: q to 159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA0: 160 to x */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8: y to 175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0: ^ to 183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0xC0: { to G   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8: H to 207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD0: } to P   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8: Q to 223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE0: \ to X   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8: Y to 239 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0xF0: 0 to 7   */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xF8: 8 to 255 */
  };

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /* 0x08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40: space to 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /* 0x48: 72 to |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50: & to 87  */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /* 0x58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60: - to 103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 0x68: 104 to ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78: 120 to " */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0x80: 128 to g */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88: h to 143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0x90: 144 to p */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98: q to 159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xA0: 160 to x */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8: y to 175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0: ^ to 183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0xC0: { to G   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8: H to 207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xD0: } to P   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8: Q to 223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xE0: \ to X   */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8: Y to 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /* 0xF0: 0 to 7   */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xF8: 8 to 255 */
  };
#endif
428
429
430/* Definition to allow mutual recursion */
431
432static BOOL
433  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434    int *, int *, branch_chain *, compile_data *, int *);
435
436
437
438/*************************************************
439*            Find an error text                  *
440*************************************************/
441
442/* The error texts are now all in one long string, to save on relocations. As
443some of the text is of unknown length, we can't use a table of offsets.
444Instead, just count through the strings. This is not a performance issue
445because it happens only when there has been a compilation error.
446
447Argument:   the error number
448Returns:    pointer to the error string
449*/
450
451static const char *
452find_error_text(int n)
453{
454const char *s = error_texts;
455for (; n > 0; n--) while (*s++ != 0);
456return s;
457}
458
459
460/*************************************************
461*            Handle escapes                      *
462*************************************************/
463
464/* This function is called when a \ has been encountered. It either returns a
465positive value for a simple escape such as \n, or a negative value which
466encodes one of the more complicated things such as \d. A backreference to group
467n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469ptr is pointing at the \. On exit, it is on the final character of the escape
470sequence.
471
472Arguments:
473  ptrptr         points to the pattern position pointer
474  errorcodeptr   points to the errorcode variable
475  bracount       number of previous extracting brackets
476  options        the options bits
477  isclass        TRUE if inside a character class
478
479Returns:         zero or positive => a data character
480                 negative => a special escape sequence
481                 on error, errorcodeptr is set
482*/
483
484static int
485check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486  int options, BOOL isclass)
487{
488BOOL utf8 = (options & PCRE_UTF8) != 0;
489const uschar *ptr = *ptrptr + 1;
490int c, i;
491
492GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
493ptr--;                            /* Set pointer back to the last byte */
494
495/* If backslash is at the end of the pattern, it's an error. */
496
497if (c == 0) *errorcodeptr = ERR1;
498
499/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
500a table. A non-zero result is something that can be returned immediately.
501Otherwise further processing may be required. */
502
503#ifndef EBCDIC  /* ASCII coding */
504else if (c < '0' || c > 'z') {}                           /* Not alphameric */
505else if ((i = escapes[c - '0']) != 0) c = i;
506
507#else           /* EBCDIC coding */
508else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
509else if ((i = escapes[c - 0x48]) != 0)  c = i;
510#endif
511
512/* Escapes that need further processing, or are illegal. */
513
514else
515  {
516  const uschar *oldptr;
517  BOOL braced, negated;
518
519  switch (c)
520    {
521    /* A number of Perl escapes are not handled by PCRE. We give an explicit
522    error. */
523
524    case 'l':
525    case 'L':
526    case 'N':
527    case 'u':
528    case 'U':
529    *errorcodeptr = ERR37;
530    break;
531
532    /* \g must be followed by a number, either plain or braced. If positive, it
533    is an absolute backreference. If negative, it is a relative backreference.
534    This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535    reference to a named group. This is part of Perl's movement towards a
536    unified syntax for back references. As this is synonymous with \k{name}, we
537    fudge it up by pretending it really was \k. */
538
539    case 'g':
540    if (ptr[1] == '{')
541      {
542      const uschar *p;
543      for (p = ptr+2; *p != 0 && *p != '}'; p++)
544        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545      if (*p != 0 && *p != '}')
546        {
547        c = -ESC_k;
548        break;
549        }
550      braced = TRUE;
551      ptr++;
552      }
553    else braced = FALSE;
554
555    if (ptr[1] == '-')
556      {
557      negated = TRUE;
558      ptr++;
559      }
560    else negated = FALSE;
561
562    c = 0;
563    while ((digitab[ptr[1]] & ctype_digit) != 0)
564      c = c * 10 + *(++ptr) - '0';
565
566    if (c < 0)
567      {
568      *errorcodeptr = ERR61;
569      break;
570      }
571
572    if (c == 0 || (braced && *(++ptr) != '}'))
573      {
574      *errorcodeptr = ERR57;
575      break;
576      }
577
578    if (negated)
579      {
580      if (c > bracount)
581        {
582        *errorcodeptr = ERR15;
583        break;
584        }
585      c = bracount - (c - 1);
586      }
587
588    c = -(ESC_REF + c);
589    break;
590
591    /* The handling of escape sequences consisting of a string of digits
592    starting with one that is not zero is not straightforward. By experiment,
593    the way Perl works seems to be as follows:
594
595    Outside a character class, the digits are read as a decimal number. If the
596    number is less than 10, or if there are that many previous extracting
597    left brackets, then it is a back reference. Otherwise, up to three octal
598    digits are read to form an escaped byte. Thus \123 is likely to be octal
599    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600    value is greater than 377, the least significant 8 bits are taken. Inside a
601    character class, \ followed by a digit is always an octal number. */
602
603    case '1': case '2': case '3': case '4': case '5':
604    case '6': case '7': case '8': case '9':
605
606    if (!isclass)
607      {
608      oldptr = ptr;
609      c -= '0';
610      while ((digitab[ptr[1]] & ctype_digit) != 0)
611        c = c * 10 + *(++ptr) - '0';
612      if (c < 0)
613        {
614        *errorcodeptr = ERR61;
615        break;
616        }
617      if (c < 10 || c <= bracount)
618        {
619        c = -(ESC_REF + c);
620        break;
621        }
622      ptr = oldptr;      /* Put the pointer back and fall through */
623      }
624
625    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626    generates a binary zero byte and treats the digit as a following literal.
627    Thus we have to pull back the pointer by one. */
628
629    if ((c = *ptr) >= '8')
630      {
631      ptr--;
632      c = 0;
633      break;
634      }
635
636    /* \0 always starts an octal number, but we may drop through to here with a
637    larger first octal digit. The original code used just to take the least
638    significant 8 bits of octal numbers (I think this is what early Perls used
639    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640    than 3 octal digits. */
641
642    case '0':
643    c -= '0';
644    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645        c = c * 8 + *(++ptr) - '0';
646    if (!utf8 && c > 255) *errorcodeptr = ERR51;
647    break;
648
649    /* \x is complicated. \x{ddd} is a character number which can be greater
650    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651    treated as a data character. */
652
653    case 'x':
654    if (ptr[1] == '{')
655      {
656      const uschar *pt = ptr + 2;
657      int count = 0;
658
659      c = 0;
660      while ((digitab[*pt] & ctype_xdigit) != 0)
661        {
662        register int cc = *pt++;
663        if (c == 0 && cc == '0') continue;     /* Leading zeroes */
664        count++;
665
666#ifndef EBCDIC  /* ASCII coding */
667        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
668        c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669#else           /* EBCDIC coding */
670        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
671        c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672#endif
673        }
674
675      if (*pt == '}')
676        {
677        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678        ptr = pt;
679        break;
680        }
681
682      /* If the sequence of hex digits does not end with '}', then we don't
683      recognize this construct; fall through to the normal \x handling. */
684      }
685
686    /* Read just a single-byte hex-defined char */
687
688    c = 0;
689    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690      {
691      int cc;                               /* Some compilers don't like ++ */
692      cc = *(++ptr);                        /* in initializers */
693#ifndef EBCDIC  /* ASCII coding */
694      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
695      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696#else           /* EBCDIC coding */
697      if (cc <= 'z') cc += 64;              /* Convert to upper case */
698      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699#endif
700      }
701    break;
702
703    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704    This coding is ASCII-specific, but then the whole concept of \cx is
705    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706
707    case 'c':
708    c = *(++ptr);
709    if (c == 0)
710      {
711      *errorcodeptr = ERR2;
712      break;
713      }
714
715#ifndef EBCDIC  /* ASCII coding */
716    if (c >= 'a' && c <= 'z') c -= 32;
717    c ^= 0x40;
718#else           /* EBCDIC coding */
719    if (c >= 'a' && c <= 'z') c += 64;
720    c ^= 0xC0;
721#endif
722    break;
723
724    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
726    for Perl compatibility, it is a literal. This code looks a bit odd, but
727    there used to be some cases other than the default, and there may be again
728    in future, so I haven't "optimized" it. */
729
730    default:
731    if ((options & PCRE_EXTRA) != 0) switch(c)
732      {
733      default:
734      *errorcodeptr = ERR3;
735      break;
736      }
737    break;
738    }
739  }
740
741*ptrptr = ptr;
742return c;
743}
744
745
746
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either the single character that follows,
or a name in braces, optionally preceded by ^ for negation. A braced name is
limited by the size of the local buffer (at most 30 characters); anything
longer is reported as malformed. The name is then looked up in the table of
property names by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
                 (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* A single character that is not { is the whole name. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise collect the name up to the closing brace, noting a leading ^. */

else
  {
  if (ptr[1] == '^')
    {
    ptr++;
    *negptr = TRUE;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto ERROR_RETURN;   /* Name too long */
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;                        /* No closing } */
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }

  if (cmp > 0) lo = mid + 1;
    else hi = mid;
  }

/* The name is well-formed, but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence is malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
836
837
838
839
840/*************************************************
841*            Check for counted repeat            *
842*************************************************/
843
844/* This function is called when a '{' is encountered in a place where it might
845start a quantifier. It looks ahead to see if it really is a quantifier or not.
846It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847where the ddds are digits.
848
849Arguments:
850  p         pointer to the first char after '{'
851
852Returns:    TRUE or FALSE
853*/
854
855static BOOL
856is_counted_repeat(const uschar *p)
857{
858if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859while ((digitab[*p] & ctype_digit) != 0) p++;
860if (*p == '}') return TRUE;
861
862if (*p++ != ',') return FALSE;
863if (*p == '}') return TRUE;
864
865if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866while ((digitab[*p] & ctype_digit) != 0) p++;
867
868return (*p == '}');
869}
870
871
872
873/*************************************************
874*         Read repeat counts                     *
875*************************************************/
876
877/* Read an item of the form {n,m} and return the values. This is called only
878after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879so the syntax is guaranteed to be correct, but we need to check the values.
880
881Arguments:
882  p              pointer to first char after '{'
883  minp           pointer to int for min
884  maxp           pointer to int for max
885                 returned as -1 if no max
886  errorcodeptr   points to error code variable
887
888Returns:         pointer to '}' on success;
889                 current ptr on error, with errorcodeptr set non-zero
890*/
891
892static const uschar *
893read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894{
895int min = 0;
896int max = -1;
897
898/* Read the minimum value and do a paranoid check: a negative value indicates
899an integer overflow. */
900
901while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902if (min < 0 || min > 65535)
903  {
904  *errorcodeptr = ERR5;
905  return p;
906  }
907
908/* Read the maximum value if there is one, and again do a paranoid on its size.
909Also, max must not be less than min. */
910
911if (*p == '}') max = min; else
912  {
913  if (*(++p) != '}')
914    {
915    max = 0;
916    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917    if (max < 0 || max > 65535)
918      {
919      *errorcodeptr = ERR5;
920      return p;
921      }
922    if (max < min)
923      {
924      *errorcodeptr = ERR4;
925      return p;
926      }
927    }
928  }
929
930/* Fill in the required variables, and pass back the pointer to the terminating
931'}'. */
932
933*minp = min;
934*maxp = max;
935return p;
936}
937
938
939
940/*************************************************
941*       Find forward referenced subpattern       *
942*************************************************/
943
944/* This function scans along a pattern's text looking for capturing
945subpatterns, and counting them. If it finds a named pattern that matches the
946name it is given, it returns its number. Alternatively, if the name is NULL, it
947returns when it reaches a given numbered subpattern. This is used for forward
948references to subpatterns. We know that if (?P< is encountered, the name will
949be terminated by '>' because that is checked in the first pass.
950
951Arguments:
952  ptr          current position in the pattern
953  count        current count of capturing parens so far encountered
954  name         name to seek, or NULL if seeking a numbered subpattern
955  lorn         name length, or subpattern number if name is NULL
956  xmode        TRUE if we are in /x mode
957
958Returns:       the number of the named subpattern, or -1 if not found
959*/
960
961static int
962find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963  BOOL xmode)
964{
965const uschar *thisname;
966
967for (; *ptr != 0; ptr++)
968  {
969  int term;
970
971  /* Skip over backslashed characters and also entire \Q...\E */
972
973  if (*ptr == '\\')
974    {
975    if (*(++ptr) == 0) return -1;
976    if (*ptr == 'Q') for (;;)
977      {
978      while (*(++ptr) != 0 && *ptr != '\\');
979      if (*ptr == 0) return -1;
980      if (*(++ptr) == 'E') break;
981      }
982    continue;
983    }
984
985  /* Skip over character classes */
986
987  if (*ptr == '[')
988    {
989    while (*(++ptr) != ']')
990      {
991      if (*ptr == 0) return -1;
992      if (*ptr == '\\')
993        {
994        if (*(++ptr) == 0) return -1;
995        if (*ptr == 'Q') for (;;)
996          {
997          while (*(++ptr) != 0 && *ptr != '\\');
998          if (*ptr == 0) return -1;
999          if (*(++ptr) == 'E') break;
1000          }
1001        continue;
1002        }
1003      }
1004    continue;
1005    }
1006
1007  /* Skip comments in /x mode */
1008
1009  if (xmode && *ptr == '#')
1010    {
1011    while (*(++ptr) != 0 && *ptr != '\n');
1012    if (*ptr == 0) return -1;
1013    continue;
1014    }
1015
1016  /* An opening parens must now be a real metacharacter */
1017
1018  if (*ptr != '(') continue;
1019  if (ptr[1] != '?' && ptr[1] != '*')
1020    {
1021    count++;
1022    if (name == NULL && count == lorn) return count;
1023    continue;
1024    }
1025
1026  ptr += 2;
1027  if (*ptr == 'P') ptr++;                      /* Allow optional P */
1028
1029  /* We have to disambiguate (?<! and (?<= from (?<name> */
1030
1031  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032       *ptr != '\'')
1033    continue;
1034
1035  count++;
1036
1037  if (name == NULL && count == lorn) return count;
1038  term = *ptr++;
1039  if (term == '<') term = '>';
1040  thisname = ptr;
1041  while (*ptr != term) ptr++;
1042  if (name != NULL && lorn == ptr - thisname &&
1043      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044    return count;
1045  }
1046
1047return -1;
1048}
1049
1050
1051
1052/*************************************************
1053*      Find first significant op code            *
1054*************************************************/
1055
1056/* This is called by several functions that scan a compiled expression looking
1057for a fixed first character, or an anchoring op code etc. It skips over things
1058that do not influence this. For some calls, a change of option is important.
1059For some calls, it makes sense to skip negative forward and all backward
1060assertions, and also the \b assertion; for others it does not.
1061
1062Arguments:
1063  code         pointer to the start of the group
1064  options      pointer to external options
1065  optbit       the option bit whose changing is significant, or
1066                 zero if none are
1067  skipassert   TRUE if certain assertions are to be skipped
1068
1069Returns:       pointer to the first significant opcode
1070*/
1071
1072static const uschar*
1073first_significant_code(const uschar *code, int *options, int optbit,
1074  BOOL skipassert)
1075{
1076for (;;)
1077  {
1078  switch ((int)*code)
1079    {
1080    case OP_OPT:
1081    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082      *options = (int)code[1];
1083    code += 2;
1084    break;
1085
1086    case OP_ASSERT_NOT:
1087    case OP_ASSERTBACK:
1088    case OP_ASSERTBACK_NOT:
1089    if (!skipassert) return code;
1090    do code += GET(code, 1); while (*code == OP_ALT);
1091    code += _pcre_OP_lengths[*code];
1092    break;
1093
1094    case OP_WORD_BOUNDARY:
1095    case OP_NOT_WORD_BOUNDARY:
1096    if (!skipassert) return code;
1097    /* Fall through */
1098
1099    case OP_CALLOUT:
1100    case OP_CREF:
1101    case OP_RREF:
1102    case OP_DEF:
1103    code += _pcre_OP_lengths[*code];
1104    break;
1105
1106    default:
1107    return code;
1108    }
1109  }
1110/* Control never reaches here */
1111}
1112
1113
1114
1115
1116/*************************************************
1117*        Find the fixed length of a pattern      *
1118*************************************************/
1119
1120/* Scan a pattern and compute the fixed length of subject that will match it,
1121if the length is fixed. This is needed for dealing with backward assertions.
1122In UTF8 mode, the result is in characters rather than bytes.
1123
1124Arguments:
1125  code     points to the start of the pattern (the bracket)
1126  options  the compiling options
1127
1128Returns:   the fixed length, or -1 if there is no fixed length,
1129             or -2 if \C was encountered
1130*/
1131
1132static int
1133find_fixedlength(uschar *code, int options)
1134{
1135int length = -1;
1136
1137register int branchlength = 0;
1138register uschar *cc = code + 1 + LINK_SIZE;
1139
1140/* Scan along the opcodes for this branch. If we get to the end of the
1141branch, check the length against that of the other branches. */
1142
1143for (;;)
1144  {
1145  int d;
1146  register int op = *cc;
1147  switch (op)
1148    {
1149    case OP_CBRA:
1150    case OP_BRA:
1151    case OP_ONCE:
1152    case OP_COND:
1153    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154    if (d < 0) return d;
1155    branchlength += d;
1156    do cc += GET(cc, 1); while (*cc == OP_ALT);
1157    cc += 1 + LINK_SIZE;
1158    break;
1159
1160    /* Reached end of a branch; if it's a ket it is the end of a nested
1161    call. If it's ALT it is an alternation in a nested call. If it is
1162    END it's the end of the outer call. All can be handled by the same code. */
1163
1164    case OP_ALT:
1165    case OP_KET:
1166    case OP_KETRMAX:
1167    case OP_KETRMIN:
1168    case OP_END:
1169    if (length < 0) length = branchlength;
1170      else if (length != branchlength) return -1;
1171    if (*cc != OP_ALT) return length;
1172    cc += 1 + LINK_SIZE;
1173    branchlength = 0;
1174    break;
1175
1176    /* Skip over assertive subpatterns */
1177
1178    case OP_ASSERT:
1179    case OP_ASSERT_NOT:
1180    case OP_ASSERTBACK:
1181    case OP_ASSERTBACK_NOT:
1182    do cc += GET(cc, 1); while (*cc == OP_ALT);
1183    /* Fall through */
1184
1185    /* Skip over things that don't match chars */
1186
1187    case OP_REVERSE:
1188    case OP_CREF:
1189    case OP_RREF:
1190    case OP_DEF:
1191    case OP_OPT:
1192    case OP_CALLOUT:
1193    case OP_SOD:
1194    case OP_SOM:
1195    case OP_EOD:
1196    case OP_EODN:
1197    case OP_CIRC:
1198    case OP_DOLL:
1199    case OP_NOT_WORD_BOUNDARY:
1200    case OP_WORD_BOUNDARY:
1201    cc += _pcre_OP_lengths[*cc];
1202    break;
1203
1204    /* Handle literal characters */
1205
1206    case OP_CHAR:
1207    case OP_CHARNC:
1208    case OP_NOT:
1209    branchlength++;
1210    cc += 2;
1211#ifdef SUPPORT_UTF8
1212    if ((options & PCRE_UTF8) != 0)
1213      {
1214      while ((*cc & 0xc0) == 0x80) cc++;
1215      }
1216#endif
1217    break;
1218
1219    /* Handle exact repetitions. The count is already in characters, but we
1220    need to skip over a multibyte character in UTF8 mode.  */
1221
1222    case OP_EXACT:
1223    branchlength += GET2(cc,1);
1224    cc += 4;
1225#ifdef SUPPORT_UTF8
1226    if ((options & PCRE_UTF8) != 0)
1227      {
1228      while((*cc & 0x80) == 0x80) cc++;
1229      }
1230#endif
1231    break;
1232
1233    case OP_TYPEEXACT:
1234    branchlength += GET2(cc,1);
1235    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236    cc += 4;
1237    break;
1238
1239    /* Handle single-char matchers */
1240
1241    case OP_PROP:
1242    case OP_NOTPROP:
1243    cc += 2;
1244    /* Fall through */
1245
1246    case OP_NOT_DIGIT:
1247    case OP_DIGIT:
1248    case OP_NOT_WHITESPACE:
1249    case OP_WHITESPACE:
1250    case OP_NOT_WORDCHAR:
1251    case OP_WORDCHAR:
1252    case OP_ANY:
1253    branchlength++;
1254    cc++;
1255    break;
1256
1257    /* The single-byte matcher isn't allowed */
1258
1259    case OP_ANYBYTE:
1260    return -2;
1261
1262    /* Check a class for variable quantification */
1263
1264#ifdef SUPPORT_UTF8
1265    case OP_XCLASS:
1266    cc += GET(cc, 1) - 33;
1267    /* Fall through */
1268#endif
1269
1270    case OP_CLASS:
1271    case OP_NCLASS:
1272    cc += 33;
1273
1274    switch (*cc)
1275      {
1276      case OP_CRSTAR:
1277      case OP_CRMINSTAR:
1278      case OP_CRQUERY:
1279      case OP_CRMINQUERY:
1280      return -1;
1281
1282      case OP_CRRANGE:
1283      case OP_CRMINRANGE:
1284      if (GET2(cc,1) != GET2(cc,3)) return -1;
1285      branchlength += GET2(cc,1);
1286      cc += 5;
1287      break;
1288
1289      default:
1290      branchlength++;
1291      }
1292    break;
1293
1294    /* Anything else is variable length */
1295
1296    default:
1297    return -1;
1298    }
1299  }
1300/* Control never gets here */
1301}
1302
1303
1304
1305
1306/*************************************************
1307*    Scan compiled regex for numbered bracket    *
1308*************************************************/
1309
1310/* This little function scans through a compiled pattern until it finds a
1311capturing bracket with the given number.
1312
1313Arguments:
1314  code        points to start of expression
1315  utf8        TRUE in UTF-8 mode
1316  number      the required bracket number
1317
1318Returns:      pointer to the opcode for the bracket, or NULL if not found
1319*/
1320
1321static const uschar *
1322find_bracket(const uschar *code, BOOL utf8, int number)
1323{
1324for (;;)
1325  {
1326  register int c = *code;
1327  if (c == OP_END) return NULL;
1328
1329  /* XCLASS is used for classes that cannot be represented just by a bit
1330  map. This includes negated single high-valued characters. The length in
1331  the table is zero; the actual length is stored in the compiled code. */
1332
1333  if (c == OP_XCLASS) code += GET(code, 1);
1334
1335  /* Handle capturing bracket */
1336
1337  else if (c == OP_CBRA)
1338    {
1339    int n = GET2(code, 1+LINK_SIZE);
1340    if (n == number) return (uschar *)code;
1341    code += _pcre_OP_lengths[c];
1342    }
1343
1344  /* Otherwise, we can get the item's length from the table, except that for
1345  repeated character types, we have to test for \p and \P, which have an extra
1346  two bytes of parameters. */
1347
1348  else
1349    {
1350    switch(c)
1351      {
1352      case OP_TYPESTAR:
1353      case OP_TYPEMINSTAR:
1354      case OP_TYPEPLUS:
1355      case OP_TYPEMINPLUS:
1356      case OP_TYPEQUERY:
1357      case OP_TYPEMINQUERY:
1358      case OP_TYPEPOSSTAR:
1359      case OP_TYPEPOSPLUS:
1360      case OP_TYPEPOSQUERY:
1361      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362      break;
1363
1364      case OP_TYPEUPTO:
1365      case OP_TYPEMINUPTO:
1366      case OP_TYPEEXACT:
1367      case OP_TYPEPOSUPTO:
1368      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369      break;
1370      }
1371
1372    /* Add in the fixed length from the table */
1373
1374    code += _pcre_OP_lengths[c];
1375
1376  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377  a multi-byte character. The length in the table is a minimum, so we have to
1378  arrange to skip the extra bytes. */
1379
1380#ifdef SUPPORT_UTF8
1381    if (utf8) switch(c)
1382      {
1383      case OP_CHAR:
1384      case OP_CHARNC:
1385      case OP_EXACT:
1386      case OP_UPTO:
1387      case OP_MINUPTO:
1388      case OP_POSUPTO:
1389      case OP_STAR:
1390      case OP_MINSTAR:
1391      case OP_POSSTAR:
1392      case OP_PLUS:
1393      case OP_MINPLUS:
1394      case OP_POSPLUS:
1395      case OP_QUERY:
1396      case OP_MINQUERY:
1397      case OP_POSQUERY:
1398      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399      break;
1400      }
1401#endif
1402    }
1403  }
1404}
1405
1406
1407
1408/*************************************************
1409*   Scan compiled regex for recursion reference  *
1410*************************************************/
1411
1412/* This little function scans through a compiled pattern until it finds an
1413instance of OP_RECURSE.
1414
1415Arguments:
1416  code        points to start of expression
1417  utf8        TRUE in UTF-8 mode
1418
1419Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1420*/
1421
1422static const uschar *
1423find_recurse(const uschar *code, BOOL utf8)
1424{
1425for (;;)
1426  {
1427  register int c = *code;
1428  if (c == OP_END) return NULL;
1429  if (c == OP_RECURSE) return code;
1430
1431  /* XCLASS is used for classes that cannot be represented just by a bit
1432  map. This includes negated single high-valued characters. The length in
1433  the table is zero; the actual length is stored in the compiled code. */
1434
1435  if (c == OP_XCLASS) code += GET(code, 1);
1436
1437  /* Otherwise, we can get the item's length from the table, except that for
1438  repeated character types, we have to test for \p and \P, which have an extra
1439  two bytes of parameters. */
1440
1441  else
1442    {
1443    switch(c)
1444      {
1445      case OP_TYPESTAR:
1446      case OP_TYPEMINSTAR:
1447      case OP_TYPEPLUS:
1448      case OP_TYPEMINPLUS:
1449      case OP_TYPEQUERY:
1450      case OP_TYPEMINQUERY:
1451      case OP_TYPEPOSSTAR:
1452      case OP_TYPEPOSPLUS:
1453      case OP_TYPEPOSQUERY:
1454      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455      break;
1456
1457      case OP_TYPEPOSUPTO:
1458      case OP_TYPEUPTO:
1459      case OP_TYPEMINUPTO:
1460      case OP_TYPEEXACT:
1461      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462      break;
1463      }
1464
1465    /* Add in the fixed length from the table */
1466
1467    code += _pcre_OP_lengths[c];
1468
1469    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470    by a multi-byte character. The length in the table is a minimum, so we have
1471    to arrange to skip the extra bytes. */
1472
1473#ifdef SUPPORT_UTF8
1474    if (utf8) switch(c)
1475      {
1476      case OP_CHAR:
1477      case OP_CHARNC:
1478      case OP_EXACT:
1479      case OP_UPTO:
1480      case OP_MINUPTO:
1481      case OP_POSUPTO:
1482      case OP_STAR:
1483      case OP_MINSTAR:
1484      case OP_POSSTAR:
1485      case OP_PLUS:
1486      case OP_MINPLUS:
1487      case OP_POSPLUS:
1488      case OP_QUERY:
1489      case OP_MINQUERY:
1490      case OP_POSQUERY:
1491      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492      break;
1493      }
1494#endif
1495    }
1496  }
1497}
1498
1499
1500
1501/*************************************************
1502*    Scan compiled branch for non-emptiness      *
1503*************************************************/
1504
1505/* This function scans through a branch of a compiled pattern to see whether it
1506can match the empty string or not. It is called from could_be_empty()
1507below and from compile_branch() when checking for an unlimited repeat of a
1508group that can match nothing. Note that first_significant_code() skips over
1509assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1510struck an inner bracket whose current branch will already have been scanned.
1511
1512Arguments:
1513  code        points to start of search
1514  endcode     points to where to stop
1515  utf8        TRUE if in UTF8 mode
1516
1517Returns:      TRUE if what is matched could be empty
1518*/
1519
1520static BOOL
1521could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1522{
1523register int c;
1524for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1525     code < endcode;
1526     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1527  {
1528  const uschar *ccode;
1529
1530  c = *code;
1531
1532  /* Groups with zero repeats can of course be empty; skip them. */
1533
1534  if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1535    {
1536    code += _pcre_OP_lengths[c];
1537    do code += GET(code, 1); while (*code == OP_ALT);
1538    c = *code;
1539    continue;
1540    }
1541
1542  /* For other groups, scan the branches. */
1543
1544  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1545    {
1546    BOOL empty_branch;
1547    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1548
1549    /* Scan a closed bracket */
1550
1551    empty_branch = FALSE;
1552    do
1553      {
1554      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1555        empty_branch = TRUE;
1556      code += GET(code, 1);
1557      }
1558    while (*code == OP_ALT);
1559    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1560    c = *code;
1561    continue;
1562    }
1563
1564  /* Handle the other opcodes */
1565
1566  switch (c)
1567    {
1568    /* Check for quantifiers after a class. XCLASS is used for classes that
1569    cannot be represented just by a bit map. This includes negated single
1570    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1571    actual length is stored in the compiled code, so we must update "code"
1572    here. */
1573
1574#ifdef SUPPORT_UTF8
1575    case OP_XCLASS:
1576    ccode = code += GET(code, 1);
1577    goto CHECK_CLASS_REPEAT;
1578#endif
1579
1580    case OP_CLASS:
1581    case OP_NCLASS:
1582    ccode = code + 33;
1583
1584#ifdef SUPPORT_UTF8
1585    CHECK_CLASS_REPEAT:
1586#endif
1587
1588    switch (*ccode)
1589      {
1590      case OP_CRSTAR:            /* These could be empty; continue */
1591      case OP_CRMINSTAR:
1592      case OP_CRQUERY:
1593      case OP_CRMINQUERY:
1594      break;
1595
1596      default:                   /* Non-repeat => class must match */
1597      case OP_CRPLUS:            /* These repeats aren't empty */
1598      case OP_CRMINPLUS:
1599      return FALSE;
1600
1601      case OP_CRRANGE:
1602      case OP_CRMINRANGE:
1603      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1604      break;
1605      }
1606    break;
1607
1608    /* Opcodes that must match a character */
1609
1610    case OP_PROP:
1611    case OP_NOTPROP:
1612    case OP_EXTUNI:
1613    case OP_NOT_DIGIT:
1614    case OP_DIGIT:
1615    case OP_NOT_WHITESPACE:
1616    case OP_WHITESPACE:
1617    case OP_NOT_WORDCHAR:
1618    case OP_WORDCHAR:
1619    case OP_ANY:
1620    case OP_ANYBYTE:
1621    case OP_CHAR:
1622    case OP_CHARNC:
1623    case OP_NOT:
1624    case OP_PLUS:
1625    case OP_MINPLUS:
1626    case OP_POSPLUS:
1627    case OP_EXACT:
1628    case OP_NOTPLUS:
1629    case OP_NOTMINPLUS:
1630    case OP_NOTPOSPLUS:
1631    case OP_NOTEXACT:
1632    case OP_TYPEPLUS:
1633    case OP_TYPEMINPLUS:
1634    case OP_TYPEPOSPLUS:
1635    case OP_TYPEEXACT:
1636    return FALSE;
1637
1638    /* These are going to continue, as they may be empty, but we have to
1639    fudge the length for the \p and \P cases. */
1640
1641    case OP_TYPESTAR:
1642    case OP_TYPEMINSTAR:
1643    case OP_TYPEPOSSTAR:
1644    case OP_TYPEQUERY:
1645    case OP_TYPEMINQUERY:
1646    case OP_TYPEPOSQUERY:
1647    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1648    break;
1649
1650    /* Same for these */
1651
1652    case OP_TYPEUPTO:
1653    case OP_TYPEMINUPTO:
1654    case OP_TYPEPOSUPTO:
1655    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1656    break;
1657
1658    /* End of branch */
1659
1660    case OP_KET:
1661    case OP_KETRMAX:
1662    case OP_KETRMIN:
1663    case OP_ALT:
1664    return TRUE;
1665
1666    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1667    MINUPTO, and POSUPTO may be followed by a multibyte character */
1668
1669#ifdef SUPPORT_UTF8
1670    case OP_STAR:
1671    case OP_MINSTAR:
1672    case OP_POSSTAR:
1673    case OP_QUERY:
1674    case OP_MINQUERY:
1675    case OP_POSQUERY:
1676    case OP_UPTO:
1677    case OP_MINUPTO:
1678    case OP_POSUPTO:
1679    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1680    break;
1681#endif
1682    }
1683  }
1684
1685return TRUE;
1686}
1687
1688
1689
1690/*************************************************
1691*    Scan compiled regex for non-emptiness       *
1692*************************************************/
1693
1694/* This function is called to check for left recursive calls. We want to check
1695the current branch of the current pattern to see if it could match the empty
1696string. If it could, we must look outwards for branches at other levels,
1697stopping when we pass beyond the bracket which is the subject of the recursion.
1698
1699Arguments:
1700  code        points to start of the recursion
1701  endcode     points to where to stop (current RECURSE item)
1702  bcptr       points to the chain of current (unclosed) branch starts
1703  utf8        TRUE if in UTF-8 mode
1704
1705Returns:      TRUE if what is matched could be empty
1706*/
1707
1708static BOOL
1709could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1710  BOOL utf8)
1711{
1712while (bcptr != NULL && bcptr->current >= code)
1713  {
1714  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1715  bcptr = bcptr->outer;
1716  }
1717return TRUE;
1718}
1719
1720
1721
1722/*************************************************
1723*           Check for POSIX class syntax         *
1724*************************************************/
1725
1726/* This function is called when the sequence "[:" or "[." or "[=" is
1727encountered in a character class. It checks whether this is followed by an
1728optional ^ and then a sequence of letters, terminated by a matching ":]" or
1729".]" or "=]".
1730
1731Argument:
1732  ptr      pointer to the initial [
1733  endptr   where to return the end pointer
1734  cd       pointer to compile data
1735
1736Returns:   TRUE or FALSE
1737*/
1738
1739static BOOL
1740check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1741{
1742int terminator;          /* Don't combine these lines; the Solaris cc */
1743terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1744if (*(++ptr) == '^') ptr++;
1745while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1746if (*ptr == terminator && ptr[1] == ']')
1747  {
1748  *endptr = ptr;
1749  return TRUE;
1750  }
1751return FALSE;
1752}
1753
1754
1755
1756
1757/*************************************************
1758*          Check POSIX class name                *
1759*************************************************/
1760
1761/* This function is called to check the name given in a POSIX-style class entry
1762such as [:alnum:].
1763
1764Arguments:
1765  ptr        points to the first letter
1766  len        the length of the name
1767
1768Returns:     a value representing the name, or -1 if unknown
1769*/
1770
1771static int
1772check_posix_name(const uschar *ptr, int len)
1773{
1774const char *pn = posix_names;
1775register int yield = 0;
1776while (posix_name_lengths[yield] != 0)
1777  {
1778  if (len == posix_name_lengths[yield] &&
1779    strncmp((const char *)ptr, pn, len) == 0) return yield;
1780  pn += posix_name_lengths[yield] + 1;
1781  yield++;
1782  }
1783return -1;
1784}
1785
1786
1787/*************************************************
1788*    Adjust OP_RECURSE items in repeated group   *
1789*************************************************/
1790
1791/* OP_RECURSE items contain an offset from the start of the regex to the group
1792that is referenced. This means that groups can be replicated for fixed
1793repetition simply by copying (because the recursion is allowed to refer to
1794earlier groups that are outside the current group). However, when a group is
1795optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1796it, after it has been compiled. This means that any OP_RECURSE items within it
1797that refer to the group itself or any contained groups have to have their
1798offsets adjusted. That is one of the jobs of this function. Before it is called,
1799the partially compiled regex must be temporarily terminated with OP_END.
1800
1801This function has been extended with the possibility of forward references for
1802recursions and subroutine calls. It must also check the list of such references
1803for the group we are dealing with. If it finds that one of the recursions in
1804the current group is on this list, it adjusts the offset in the list, not the
1805value in the reference (which is a group number).
1806
1807Arguments:
1808  group      points to the start of the group
1809  adjust     the amount by which the group is to be moved
1810  utf8       TRUE in UTF-8 mode
1811  cd         contains pointers to tables etc.
1812  save_hwm   the hwm forward reference pointer at the start of the group
1813
1814Returns:     nothing
1815*/
1816
1817static void
1818adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1819  uschar *save_hwm)
1820{
1821uschar *ptr = group;
1822
1823while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1824  {
1825  int offset;
1826  uschar *hc;
1827
1828  /* See if this recursion is on the forward reference list. If so, adjust the
1829  reference. */
1830
1831  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1832    {
1833    offset = GET(hc, 0);
1834    if (cd->start_code + offset == ptr + 1)
1835      {
1836      PUT(hc, 0, offset + adjust);
1837      break;
1838      }
1839    }
1840
1841  /* Otherwise, adjust the recursion offset if it's after the start of this
1842  group. */
1843
1844  if (hc >= cd->hwm)
1845    {
1846    offset = GET(ptr, 1);
1847    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1848    }
1849
1850  ptr += 1 + LINK_SIZE;
1851  }
1852}
1853
1854
1855
1856/*************************************************
1857*        Insert an automatic callout point       *
1858*************************************************/
1859
1860/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1861callout points before each pattern item.
1862
1863Arguments:
1864  code           current code pointer
1865  ptr            current pattern pointer
1866  cd             pointers to tables etc
1867
1868Returns:         new code pointer
1869*/
1870
1871static uschar *
1872auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1873{
1874*code++ = OP_CALLOUT;
1875*code++ = 255;
1876PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1877PUT(code, LINK_SIZE, 0);                /* Default length */
1878return code + 2*LINK_SIZE;
1879}
1880
1881
1882
1883/*************************************************
1884*         Complete a callout item                *
1885*************************************************/
1886
1887/* A callout item contains the length of the next item in the pattern, which
1888we can't fill in till after we have reached the relevant point. This is used
1889for both automatic and manual callouts.
1890
1891Arguments:
1892  previous_callout   points to previous callout item
1893  ptr                current pattern pointer
1894  cd                 pointers to tables etc
1895
1896Returns:             nothing
1897*/
1898
1899static void
1900complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1901{
1902int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1903PUT(previous_callout, 2 + LINK_SIZE, length);
1904}
1905
1906
1907
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the remaining part of a class range (*cptr to d inclusive), find the
next run of consecutive characters whose other-case values, as reported by
_pcre_ucp_othercase(), are themselves consecutive. The run of other-case values
is handed back through ocptr/odptr, and *cptr is advanced past the characters
that were consumed so that the caller can simply call again for the next run.
When FALSE is returned, *cptr, *ocptr and *odptr are not written.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_oc = NOTACHAR;
unsigned int expected;

/* Skip characters that have no other case. */

while (ch <= d)
  {
  first_oc = _pcre_ucp_othercase(ch);
  if (first_oc != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run for as long as the other-case values stay contiguous. */

*ocptr = first_oc;
expected = first_oc + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;
return TRUE;
}
#endif  /* SUPPORT_UCP */
1953
1954
1955
1956/*************************************************
1957*     Check if auto-possessifying is possible    *
1958*************************************************/
1959
1960/* This function is called for unlimited repeats of certain items, to see
1961whether the next thing could possibly match the repeated item. If not, it makes
1962sense to automatically possessify the repeated item.
1963
1964Arguments:
1965  op_code       the repeated op code
1966  this          data for this item, depends on the opcode
1967  utf8          TRUE in UTF-8 mode
1968  utf8_char     used for utf8 character bytes, NULL if not relevant
1969  ptr           next character in pattern
1970  options       options bits
1971  cd            contains pointers to tables etc.
1972
1973Returns:        TRUE if possessifying is wanted
1974*/
1975
1976static BOOL
1977check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1978  const uschar *ptr, int options, compile_data *cd)
1979{
1980int next;
1981
1982/* Skip whitespace and comments in extended mode */
1983
1984if ((options & PCRE_EXTENDED) != 0)
1985  {
1986  for (;;)
1987    {
1988    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1989    if (*ptr == '#')
1990      {
1991      while (*(++ptr) != 0)
1992        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1993      }
1994    else break;
1995    }
1996  }
1997
1998/* If the next item is one that we can handle, get its value. A non-negative
1999value is a character, a negative value is an escape value. */
2000
2001if (*ptr == '\\')
2002  {
2003  int temperrorcode = 0;
2004  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2005  if (temperrorcode != 0) return FALSE;
2006  ptr++;    /* Point after the escape sequence */
2007  }
2008
2009else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2010  {
2011#ifdef SUPPORT_UTF8
2012  if (utf8) { GETCHARINC(next, ptr); } else
2013#endif
2014  next = *ptr++;
2015  }
2016
2017else return FALSE;
2018
2019/* Skip whitespace and comments in extended mode */
2020
2021if ((options & PCRE_EXTENDED) != 0)
2022  {
2023  for (;;)
2024    {
2025    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2026    if (*ptr == '#')
2027      {
2028      while (*(++ptr) != 0)
2029        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2030      }
2031    else break;
2032    }
2033  }
2034
2035/* If the next thing is itself optional, we have to give up. */
2036
2037if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2038  return FALSE;
2039
2040/* Now compare the next item with the previous opcode. If the previous is a
2041positive single character match, "item" either contains the character or, if
2042"item" is greater than 127 in utf8 mode, the character's bytes are in
2043utf8_char. */
2044
2045
2046/* Handle cases when the next item is a character. */
2047
2048if (next >= 0) switch(op_code)
2049  {
2050  case OP_CHAR:
2051#ifdef SUPPORT_UTF8
2052  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2053#endif
2054  return item != next;
2055
2056  /* For CHARNC (caseless character) we must check the other case. If we have
2057  Unicode property support, we can use it to test the other case of
2058  high-valued characters. */
2059
2060  case OP_CHARNC:
2061#ifdef SUPPORT_UTF8
2062  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2063#endif
2064  if (item == next) return FALSE;
2065#ifdef SUPPORT_UTF8
2066  if (utf8)
2067    {
2068    unsigned int othercase;
2069    if (next < 128) othercase = cd->fcc[next]; else
2070#ifdef SUPPORT_UCP
2071    othercase = _pcre_ucp_othercase((unsigned int)next);
2072#else
2073    othercase = NOTACHAR;
2074#endif
2075    return (unsigned int)item != othercase;
2076    }
2077  else
2078#endif  /* SUPPORT_UTF8 */
2079  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2080
2081  /* For OP_NOT, "item" must be a single-byte character. */
2082
2083  case OP_NOT:
2084  if (next < 0) return FALSE;  /* Not a character */
2085  if (item == next) return TRUE;
2086  if ((options & PCRE_CASELESS) == 0) return FALSE;
2087#ifdef SUPPORT_UTF8
2088  if (utf8)
2089    {
2090    unsigned int othercase;
2091    if (next < 128) othercase = cd->fcc[next]; else
2092#ifdef SUPPORT_UCP
2093    othercase = _pcre_ucp_othercase(next);
2094#else
2095    othercase = NOTACHAR;
2096#endif
2097    return (unsigned int)item == othercase;
2098    }
2099  else
2100#endif  /* SUPPORT_UTF8 */
2101  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2102
2103  case OP_DIGIT:
2104  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2105
2106  case OP_NOT_DIGIT:
2107  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2108
2109  case OP_WHITESPACE:
2110  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2111
2112  case OP_NOT_WHITESPACE:
2113  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2114
2115  case OP_WORDCHAR:
2116  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2117
2118  case OP_NOT_WORDCHAR:
2119  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2120
2121  case OP_HSPACE:
2122  case OP_NOT_HSPACE:
2123  switch(next)
2124    {
2125    case 0x09:
2126    case 0x20:
2127    case 0xa0:
2128    case 0x1680:
2129    case 0x180e:
2130    case 0x2000:
2131    case 0x2001:
2132    case 0x2002:
2133    case 0x2003:
2134    case 0x2004:
2135    case 0x2005:
2136    case 0x2006:
2137    case 0x2007:
2138    case 0x2008:
2139    case 0x2009:
2140    case 0x200A:
2141    case 0x202f:
2142    case 0x205f:
2143    case 0x3000:
2144    return op_code != OP_HSPACE;
2145    default:
2146    return op_code == OP_HSPACE;
2147    }
2148
2149  case OP_VSPACE:
2150  case OP_NOT_VSPACE:
2151  switch(next)
2152    {
2153    case 0x0a:
2154    case 0x0b:
2155    case 0x0c:
2156    case 0x0d:
2157    case 0x85:
2158    case 0x2028:
2159    case 0x2029:
2160    return op_code != OP_VSPACE;
2161    default:
2162    return op_code == OP_VSPACE;
2163    }
2164
2165  default:
2166  return FALSE;
2167  }
2168
2169
2170/* Handle the case when the next item is \d, \s, etc. */
2171
2172switch(op_code)
2173  {
2174  case OP_CHAR:
2175  case OP_CHARNC:
2176#ifdef SUPPORT_UTF8
2177  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2178#endif
2179  switch(-next)
2180    {
2181    case ESC_d:
2182    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2183
2184    case ESC_D:
2185    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2186
2187    case ESC_s:
2188    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2189
2190    case ESC_S:
2191    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2192
2193    case ESC_w:
2194    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2195
2196    case ESC_W:
2197    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2198
2199    case ESC_h:
2200    case ESC_H:
2201    switch(item)
2202      {
2203      case 0x09:
2204      case 0x20:
2205      case 0xa0:
2206      case 0x1680:
2207      case 0x180e:
2208      case 0x2000:
2209      case 0x2001:
2210      case 0x2002:
2211      case 0x2003:
2212      case 0x2004:
2213      case 0x2005:
2214      case 0x2006:
2215      case 0x2007:
2216      case 0x2008:
2217      case 0x2009:
2218      case 0x200A:
2219      case 0x202f:
2220      case 0x205f:
2221      case 0x3000:
2222      return -next != ESC_h;
2223      default:
2224      return -next == ESC_h;
2225      }
2226
2227    case ESC_v:
2228    case ESC_V:
2229    switch(item)
2230      {
2231      case 0x0a:
2232      case 0x0b:
2233      case 0x0c:
2234      case 0x0d:
2235      case 0x85:
2236      case 0x2028:
2237      case 0x2029:
2238      return -next != ESC_v;
2239      default:
2240      return -next == ESC_v;
2241      }
2242
2243    default:
2244    return FALSE;
2245    }
2246
2247  case OP_DIGIT:
2248  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2249         next == -ESC_h || next == -ESC_v;
2250
2251  case OP_NOT_DIGIT:
2252  return next == -ESC_d;
2253
2254  case OP_WHITESPACE:
2255  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2256
2257  case OP_NOT_WHITESPACE:
2258  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2259
2260  case OP_HSPACE:
2261  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2262
2263  case OP_NOT_HSPACE:
2264  return next == -ESC_h;
2265
2266  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2267  case OP_VSPACE:
2268  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2269
2270  case OP_NOT_VSPACE:
2271  return next == -ESC_v;
2272
2273  case OP_WORDCHAR:
2274  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2275
2276  case OP_NOT_WORDCHAR:
2277  return next == -ESC_w || next == -ESC_d;
2278
2279  default:
2280  return FALSE;
2281  }
2282
2283/* Control does not reach here */
2284}
2285
2286
2287
2288/*************************************************
2289*           Compile one branch                   *
2290*************************************************/
2291
/* Scan the pattern, compiling it into a vector. If the options are
changed during the branch, the pointer is used to change the external options
bits. This function is used during the pre-compile phase when we are trying
to find out the amount of memory needed, as well as during the real compile
phase. The value of lengthptr distinguishes the two phases.
2297
2298Arguments:
2299  optionsptr     pointer to the option bits
2300  codeptr        points to the pointer to the current code point
2301  ptrptr         points to the current pattern pointer
2302  errorcodeptr   points to error code variable
2303  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2304  reqbyteptr     set to the last literal character required, else < 0
2305  bcptr          points to current branch chain
2306  cd             contains pointers to tables etc.
2307  lengthptr      NULL during the real compile phase
2308                 points to length accumulator during pre-compile phase
2309
2310Returns:         TRUE on success
2311                 FALSE, with *errorcodeptr set non-zero on error
2312*/
2313
2314static BOOL
2315compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2316  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2317  compile_data *cd, int *lengthptr)
2318{
2319int repeat_type, op_type;
2320int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2321int bravalue = 0;
2322int greedy_default, greedy_non_default;
2323int firstbyte, reqbyte;
2324int zeroreqbyte, zerofirstbyte;
2325int req_caseopt, reqvary, tempreqvary;
2326int options = *optionsptr;
2327int after_manual_callout = 0;
2328int length_prevgroup = 0;
2329register int c;
2330register uschar *code = *codeptr;
2331uschar *last_code = code;
2332uschar *orig_code = code;
2333uschar *tempcode;
2334BOOL inescq = FALSE;
2335BOOL groupsetfirstbyte = FALSE;
2336const uschar *ptr = *ptrptr;
2337const uschar *tempptr;
2338uschar *previous = NULL;
2339uschar *previous_callout = NULL;
2340uschar *save_hwm = NULL;
2341uschar classbits[32];
2342
2343#ifdef SUPPORT_UTF8
2344BOOL class_utf8;
2345BOOL utf8 = (options & PCRE_UTF8) != 0;
2346uschar *class_utf8data;
2347uschar utf8_char[6];
2348#else
2349BOOL utf8 = FALSE;
2350uschar *utf8_char = NULL;
2351#endif
2352
2353#ifdef DEBUG
2354if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2355#endif
2356
2357/* Set up the default and non-default settings for greediness */
2358
2359greedy_default = ((options & PCRE_UNGREEDY) != 0);
2360greedy_non_default = greedy_default ^ 1;
2361
2362/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2363matching encountered yet". It gets changed to REQ_NONE if we hit something that
2364matches a non-fixed char first char; reqbyte just remains unset if we never
2365find one.
2366
2367When we hit a repeat whose minimum is zero, we may have to adjust these values
2368to take the zero repeat into account. This is implemented by setting them to
2369zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2370item types that can be repeated set these backoff variables appropriately. */
2371
2372firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2373
2374/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2375according to the current setting of the caseless flag. REQ_CASELESS is a bit
2376value > 255. It is added into the firstbyte or reqbyte variables to record the
2377case status of the value. This is used only for ASCII characters. */
2378
2379req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2380
2381/* Switch on next character until the end of the branch */
2382
2383for (;; ptr++)
2384  {
2385  BOOL negate_class;
2386  BOOL possessive_quantifier;
2387  BOOL is_quantifier;
2388  BOOL is_recurse;
2389  BOOL reset_bracount;
2390  int class_charcount;
2391  int class_lastchar;
2392  int newoptions;
2393  int recno;
2394  int refsign;
2395  int skipbytes;
2396  int subreqbyte;
2397  int subfirstbyte;
2398  int terminator;
2399  int mclength;
2400  uschar mcbuffer[8];
2401
2402  /* Get next byte in the pattern */
2403
2404  c = *ptr;
2405
2406  /* If we are in the pre-compile phase, accumulate the length used for the
2407  previous cycle of this loop. */
2408
2409  if (lengthptr != NULL)
2410    {
2411#ifdef DEBUG
2412    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2413#endif
2414    if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2415      {
2416      *errorcodeptr = ERR52;
2417      goto FAILED;
2418      }
2419
2420    /* There is at least one situation where code goes backwards: this is the
2421    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2422    the class is simply eliminated. However, it is created first, so we have to
2423    allow memory for it. Therefore, don't ever reduce the length at this point.
2424    */
2425
2426    if (code < last_code) code = last_code;
2427
2428    /* Paranoid check for integer overflow */
2429
2430    if (OFLOW_MAX - *lengthptr < code - last_code)
2431      {
2432      *errorcodeptr = ERR20;
2433      goto FAILED;
2434      }
2435
2436    *lengthptr += code - last_code;
2437    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2438
2439    /* If "previous" is set and it is not at the start of the work space, move
2440    it back to there, in order to avoid filling up the work space. Otherwise,
2441    if "previous" is NULL, reset the current code pointer to the start. */
2442
2443    if (previous != NULL)
2444      {
2445      if (previous > orig_code)
2446        {
2447        memmove(orig_code, previous, code - previous);
2448        code -= previous - orig_code;
2449        previous = orig_code;
2450        }
2451      }
2452    else code = orig_code;
2453
2454    /* Remember where this code item starts so we can pick up the length
2455    next time round. */
2456
2457    last_code = code;
2458    }
2459
2460  /* In the real compile phase, just check the workspace used by the forward
2461  reference list. */
2462
2463  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2464    {
2465    *errorcodeptr = ERR52;
2466    goto FAILED;
2467    }
2468
2469  /* If in \Q...\E, check for the end; if not, we have a literal */
2470
2471  if (inescq && c != 0)
2472    {
2473    if (c == '\\' && ptr[1] == 'E')
2474      {
2475      inescq = FALSE;
2476      ptr++;
2477      continue;
2478      }
2479    else
2480      {
2481      if (previous_callout != NULL)
2482        {
2483        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2484          complete_callout(previous_callout, ptr, cd);
2485        previous_callout = NULL;
2486        }
2487      if ((options & PCRE_AUTO_CALLOUT) != 0)
2488        {
2489        previous_callout = code;
2490        code = auto_callout(code, ptr, cd);
2491        }
2492      goto NORMAL_CHAR;
2493      }
2494    }
2495
2496  /* Fill in length of a previous callout, except when the next thing is
2497  a quantifier. */
2498
2499  is_quantifier = c == '*' || c == '+' || c == '?' ||
2500    (c == '{' && is_counted_repeat(ptr+1));
2501
2502  if (!is_quantifier && previous_callout != NULL &&
2503       after_manual_callout-- <= 0)
2504    {
2505    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2506      complete_callout(previous_callout, ptr, cd);
2507    previous_callout = NULL;
2508    }
2509
2510  /* In extended mode, skip white space and comments */
2511
2512  if ((options & PCRE_EXTENDED) != 0)
2513    {
2514    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2515    if (c == '#')
2516      {
2517      while (*(++ptr) != 0)
2518        {
2519        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2520        }
2521      if (*ptr != 0) continue;
2522
2523      /* Else fall through to handle end of string */
2524      c = 0;
2525      }
2526    }
2527
2528  /* No auto callout for quantifiers. */
2529
2530  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2531    {
2532    previous_callout = code;
2533    code = auto_callout(code, ptr, cd);
2534    }
2535
2536  switch(c)
2537    {
2538    /* ===================================================================*/
2539    case 0:                        /* The branch terminates at string end */
2540    case '|':                      /* or | or ) */
2541    case ')':
2542    *firstbyteptr = firstbyte;
2543    *reqbyteptr = reqbyte;
2544    *codeptr = code;
2545    *ptrptr = ptr;
2546    if (lengthptr != NULL)
2547      {
2548      if (OFLOW_MAX - *lengthptr < code - last_code)
2549        {
2550        *errorcodeptr = ERR20;
2551        goto FAILED;
2552        }
2553      *lengthptr += code - last_code;   /* To include callout length */
2554      DPRINTF((">> end branch\n"));
2555      }
2556    return TRUE;
2557
2558
2559    /* ===================================================================*/
2560    /* Handle single-character metacharacters. In multiline mode, ^ disables
2561    the setting of any following char as a first character. */
2562
2563    case '^':
2564    if ((options & PCRE_MULTILINE) != 0)
2565      {
2566      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2567      }
2568    previous = NULL;
2569    *code++ = OP_CIRC;
2570    break;
2571
2572    case '$':
2573    previous = NULL;
2574    *code++ = OP_DOLL;
2575    break;
2576
2577    /* There can never be a first char if '.' is first, whatever happens about
2578    repeats. The value of reqbyte doesn't change either. */
2579
2580    case '.':
2581    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2582    zerofirstbyte = firstbyte;
2583    zeroreqbyte = reqbyte;
2584    previous = code;
2585    *code++ = OP_ANY;
2586    break;
2587
2588
2589    /* ===================================================================*/
2590    /* Character classes. If the included characters are all < 256, we build a
2591    32-byte bitmap of the permitted characters, except in the special case
2592    where there is only one such character. For negated classes, we build the
2593    map as usual, then invert it at the end. However, we use a different opcode
2594    so that data characters > 255 can be handled correctly.
2595
    If the class contains characters outside the 0-255 range, a different
    opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
    whether the bitmap is present, and whether this is a negated class or not.
2600    */
2601
2602    case '[':
2603    previous = code;
2604
2605    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2606    they are encountered at the top level, so we'll do that too. */
2607
2608    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2609        check_posix_syntax(ptr, &tempptr, cd))
2610      {
2611      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2612      goto FAILED;
2613      }
2614
2615    /* If the first character is '^', set the negation flag and skip it. Also,
2616    if the first few characters (either before or after ^) are \Q\E or \E we
2617    skip them too. This makes for compatibility with Perl. */
2618
2619    negate_class = FALSE;
2620    for (;;)
2621      {
2622      c = *(++ptr);
2623      if (c == '\\')
2624        {
2625        if (ptr[1] == 'E') ptr++;
2626          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2627            else break;
2628        }
2629      else if (!negate_class && c == '^')
2630        negate_class = TRUE;
2631      else break;
2632      }
2633
2634    /* Keep a count of chars with values < 256 so that we can optimize the case
2635    of just a single character (as long as it's < 256). However, For higher
2636    valued UTF-8 characters, we don't yet do any optimization. */
2637
2638    class_charcount = 0;
2639    class_lastchar = -1;
2640
2641    /* Initialize the 32-char bit map to all zeros. We build the map in a
2642    temporary bit of memory, in case the class contains only 1 character (less
2643    than 256), because in that case the compiled code doesn't use the bit map.
2644    */
2645
2646    memset(classbits, 0, 32 * sizeof(uschar));
2647
2648#ifdef SUPPORT_UTF8
2649    class_utf8 = FALSE;                       /* No chars >= 256 */
2650    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2651#endif
2652
2653    /* Process characters until ] is reached. By writing this as a "do" it
2654    means that an initial ] is taken as a data character. At the start of the
2655    loop, c contains the first byte of the character. */
2656
2657    if (c != 0) do
2658      {
2659      const uschar *oldptr;
2660
2661#ifdef SUPPORT_UTF8
2662      if (utf8 && c > 127)
2663        {                           /* Braces are required because the */
2664        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2665        }
2666#endif
2667
2668      /* Inside \Q...\E everything is literal except \E */
2669
2670      if (inescq)
2671        {
2672        if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2673          {
2674          inescq = FALSE;                   /* Reset literal state */
2675          ptr++;                            /* Skip the 'E' */
2676          continue;                         /* Carry on with next */
2677          }
2678        goto CHECK_RANGE;                   /* Could be range if \E follows */
2679        }
2680
2681      /* Handle POSIX class names. Perl allows a negation extension of the
2682      form [:^name:]. A square bracket that doesn't match the syntax is
2683      treated as a literal. We also recognize the POSIX constructions
2684      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2685      5.6 and 5.8 do. */
2686
2687      if (c == '[' &&
2688          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2689          check_posix_syntax(ptr, &tempptr, cd))
2690        {
2691        BOOL local_negate = FALSE;
2692        int posix_class, taboffset, tabopt;
2693        register const uschar *cbits = cd->cbits;
2694        uschar pbits[32];
2695
2696        if (ptr[1] != ':')
2697          {
2698          *errorcodeptr = ERR31;
2699          goto FAILED;
2700          }
2701
2702        ptr += 2;
2703        if (*ptr == '^')
2704          {
2705          local_negate = TRUE;
2706          ptr++;
2707          }
2708
2709        posix_class = check_posix_name(ptr, tempptr - ptr);
2710        if (posix_class < 0)
2711          {
2712          *errorcodeptr = ERR30;
2713          goto FAILED;
2714          }
2715
2716        /* If matching is caseless, upper and lower are converted to
2717        alpha. This relies on the fact that the class table starts with
2718        alpha, lower, upper as the first 3 entries. */
2719
2720        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2721          posix_class = 0;
2722
2723        /* We build the bit map for the POSIX class in a chunk of local store
2724        because we may be adding and subtracting from it, and we don't want to
2725        subtract bits that may be in the main map already. At the end we or the
2726        result into the bit map that is being built. */
2727
2728        posix_class *= 3;
2729
2730        /* Copy in the first table (always present) */
2731
2732        memcpy(pbits, cbits + posix_class_maps[posix_class],
2733          32 * sizeof(uschar));
2734
2735        /* If there is a second table, add or remove it as required. */
2736
2737        taboffset = posix_class_maps[posix_class + 1];
2738        tabopt = posix_class_maps[posix_class + 2];
2739
2740        if (taboffset >= 0)
2741          {
2742          if (tabopt >= 0)
2743            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2744          else
2745            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2746          }
2747
        /* Now see if we need to remove any special characters. An option
        value of 1 removes vertical space and 2 removes underscore. */
2750
2751        if (tabopt < 0) tabopt = -tabopt;
2752        if (tabopt == 1) pbits[1] &= ~0x3c;
2753          else if (tabopt == 2) pbits[11] &= 0x7f;
2754
2755        /* Add the POSIX table or its complement into the main table that is
2756        being built and we are done. */
2757
2758        if (local_negate)
2759          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2760        else
2761          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2762
2763        ptr = tempptr + 1;
2764        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2765        continue;    /* End of POSIX syntax handling */
2766        }
2767
2768      /* Backslash may introduce a single character, or it may introduce one
2769      of the specials, which just set a flag. The sequence \b is a special
2770      case. Inside a class (and only there) it is treated as backspace.
2771      Elsewhere it marks a word boundary. Other escapes have preset maps ready
2772      to 'or' into the one we are building. We assume they have more than one
2773      character in them, so set class_charcount bigger than one. */
2774
2775      if (c == '\\')
2776        {
2777        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2778        if (*errorcodeptr != 0) goto FAILED;
2779
2780        if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2781        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2782        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2783        else if (-c == ESC_Q)            /* Handle start of quoted string */
2784          {
2785          if (ptr[1] == '\\' && ptr[2] == 'E')
2786            {
2787            ptr += 2; /* avoid empty string */
2788            }
2789          else inescq = TRUE;
2790          continue;
2791          }
2792        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2793
2794        if (c < 0)
2795          {
2796          register const uschar *cbits = cd->cbits;
2797          class_charcount += 2;     /* Greater than 1 is what matters */
2798
2799          /* Save time by not doing this in the pre-compile phase. */
2800
2801          if (lengthptr == NULL) switch (-c)
2802            {
2803            case ESC_d:
2804            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2805            continue;
2806
2807            case ESC_D:
2808            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2809            continue;
2810
2811            case ESC_w:
2812            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2813            continue;
2814
2815            case ESC_W:
2816            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2817            continue;
2818
2819            case ESC_s:
2820            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2821            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2822            continue;
2823
2824            case ESC_S:
2825            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2826            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2827            continue;
2828
2829            case ESC_E: /* Perl ignores an orphan \E */
2830            continue;
2831
2832            default:    /* Not recognized; fall through */
2833            break;      /* Need "default" setting to stop compiler warning. */
2834            }
2835
2836          /* In the pre-compile phase, just do the recognition. */
2837
2838          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2839                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2840
2841          /* We need to deal with \H, \h, \V, and \v in both phases because
2842          they use extra memory. */
2843
2844          if (-c == ESC_h)
2845            {
2846            SETBIT(classbits, 0x09); /* VT */
2847            SETBIT(classbits, 0x20); /* SPACE */
2848            SETBIT(classbits, 0xa0); /* NSBP */
2849#ifdef SUPPORT_UTF8
2850            if (utf8)
2851              {
2852              class_utf8 = TRUE;
2853              *class_utf8data++ = XCL_SINGLE;
2854              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2855              *class_utf8data++ = XCL_SINGLE;
2856              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2857              *class_utf8data++ = XCL_RANGE;
2858              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2859              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2860              *class_utf8data++ = XCL_SINGLE;
2861              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2862              *class_utf8data++ = XCL_SINGLE;
2863              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2864              *class_utf8data++ = XCL_SINGLE;
2865              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2866              }
2867#endif
2868            continue;
2869            }
2870
2871          if (-c == ESC_H)
2872            {
2873            for (c = 0; c < 32; c++)
2874              {
2875              int x = 0xff;
2876              switch (c)
2877                {
2878                case 0x09/8: x ^= 1 << (0x09%8); break;
2879                case 0x20/8: x ^= 1 << (0x20%8); break;
2880                case 0xa0/8: x ^= 1 << (0xa0%8); break;
2881                default: break;
2882                }
2883              classbits[c] |= x;
2884              }
2885
2886#ifdef SUPPORT_UTF8
2887            if (utf8)
2888              {
2889              class_utf8 = TRUE;
2890              *class_utf8data++ = XCL_RANGE;
2891              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2892              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2893              *class_utf8data++ = XCL_RANGE;
2894              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2895              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2896              *class_utf8data++ = XCL_RANGE;
2897              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2898              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2899              *class_utf8data++ = XCL_RANGE;
2900              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2901              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2902              *class_utf8data++ = XCL_RANGE;
2903              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2904              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2905              *class_utf8data++ = XCL_RANGE;
2906              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2907              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2908              *class_utf8data++ = XCL_RANGE;
2909              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2910              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2911              }
2912#endif
2913            continue;
2914            }
2915
2916          if (-c == ESC_v)
2917            {
2918            SETBIT(classbits, 0x0a); /* LF */
2919            SETBIT(classbits, 0x0b); /* VT */
2920            SETBIT(classbits, 0x0c); /* FF */
2921            SETBIT(classbits, 0x0d); /* CR */
2922            SETBIT(classbits, 0x85); /* NEL */
2923#ifdef SUPPORT_UTF8
2924            if (utf8)
2925              {
2926              class_utf8 = TRUE;
2927              *class_utf8data++ = XCL_RANGE;
2928              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2929              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2930              }
2931#endif
2932            continue;
2933            }
2934
2935          if (-c == ESC_V)
2936            {
2937            for (c = 0; c < 32; c++)
2938              {
2939              int x = 0xff;
2940              switch (c)
2941                {
2942                case 0x0a/8: x ^= 1 << (0x0a%8);
2943                             x ^= 1 << (0x0b%8);
2944                             x ^= 1 << (0x0c%8);
2945                             x ^= 1 << (0x0d%8);
2946                             break;
2947                case 0x85/8: x ^= 1 << (0x85%8); break;
2948                default: break;
2949                }
2950              classbits[c] |= x;
2951              }
2952
2953#ifdef SUPPORT_UTF8
2954            if (utf8)
2955              {
2956              class_utf8 = TRUE;
2957              *class_utf8data++ = XCL_RANGE;
2958              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2959              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2960              *class_utf8data++ = XCL_RANGE;
2961              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2962              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2963              }
2964#endif
2965            continue;
2966            }
2967
2968          /* We need to deal with \P and \p in both phases. */
2969
2970#ifdef SUPPORT_UCP
2971          if (-c == ESC_p || -c == ESC_P)
2972            {
2973            BOOL negated;
2974            int pdata;
2975            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2976            if (ptype < 0) goto FAILED;
2977            class_utf8 = TRUE;
2978            *class_utf8data++ = ((-c == ESC_p) != negated)?
2979              XCL_PROP : XCL_NOTPROP;
2980            *class_utf8data++ = ptype;
2981            *class_utf8data++ = pdata;
2982            class_charcount -= 2;   /* Not a < 256 character */
2983            continue;
2984            }
2985#endif
2986          /* Unrecognized escapes are faulted if PCRE is running in its
2987          strict mode. By default, for compatibility with Perl, they are
2988          treated as literals. */
2989
2990          if ((options & PCRE_EXTRA) != 0)
2991            {
2992            *errorcodeptr = ERR7;
2993            goto FAILED;
2994            }
2995
2996          class_charcount -= 2;  /* Undo the default count from above */
2997          c = *ptr;              /* Get the final character and fall through */
2998          }
2999
3000        /* Fall through if we have a single character (c >= 0). This may be
3001        greater than 256 in UTF-8 mode. */
3002
3003        }   /* End of backslash handling */
3004
3005      /* A single character may be followed by '-' to form a range. However,
3006      Perl does not permit ']' to be the end of the range. A '-' character
3007      at the end is treated as a literal. Perl ignores orphaned \E sequences
3008      entirely. The code for handling \Q and \E is messy. */
3009
3010      CHECK_RANGE:
3011      while (ptr[1] == '\\' && ptr[2] == 'E')
3012        {
3013        inescq = FALSE;
3014        ptr += 2;
3015        }
3016
3017      oldptr = ptr;
3018
3019      /* Remember \r or \n */
3020
3021      if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3022
3023      /* Check for range */
3024
3025      if (!inescq && ptr[1] == '-')
3026        {
3027        int d;
3028        ptr += 2;
3029        while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3030
3031        /* If we hit \Q (not followed by \E) at this point, go into escaped
3032        mode. */
3033
3034        while (*ptr == '\\' && ptr[1] == 'Q')
3035          {
3036          ptr += 2;
3037          if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3038          inescq = TRUE;
3039          break;
3040          }
3041
3042        if (*ptr == 0 || (!inescq && *ptr == ']'))
3043          {
3044          ptr = oldptr;
3045          goto LONE_SINGLE_CHARACTER;
3046          }
3047
3048#ifdef SUPPORT_UTF8
3049        if (utf8)
3050          {                           /* Braces are required because the */
3051          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3052          }
3053        else
3054#endif
3055        d = *ptr;  /* Not UTF-8 mode */
3056
3057        /* The second part of a range can be a single-character escape, but
3058        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3059        in such circumstances. */
3060
3061        if (!inescq && d == '\\')
3062          {
3063          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3064          if (*errorcodeptr != 0) goto FAILED;
3065
3066          /* \b is backspace; \X is literal X; \R is literal R; any other
3067          special means the '-' was literal */
3068
3069          if (d < 0)
3070            {
3071            if (d == -ESC_b) d = '\b';
3072            else if (d == -ESC_X) d = 'X';
3073            else if (d == -ESC_R) d = 'R'; else
3074              {
3075              ptr = oldptr;
3076              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3077              }
3078            }
3079          }
3080
3081        /* Check that the two values are in the correct order. Optimize
3082        one-character ranges */
3083
3084        if (d < c)
3085          {
3086          *errorcodeptr = ERR8;
3087          goto FAILED;
3088          }
3089
3090        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3091
3092        /* Remember \r or \n */
3093
3094        if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3095
3096        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3097        matching, we have to use an XCLASS with extra data items. Caseless
3098        matching for characters > 127 is available only if UCP support is
3099        available. */
3100
3101#ifdef SUPPORT_UTF8
3102        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3103          {
3104          class_utf8 = TRUE;
3105
3106          /* With UCP support, we can find the other case equivalents of
3107          the relevant characters. There may be several ranges. Optimize how
3108          they fit with the basic range. */
3109
3110#ifdef SUPPORT_UCP
3111          if ((options & PCRE_CASELESS) != 0)
3112            {
3113            unsigned int occ, ocd;
3114            unsigned int cc = c;
3115            unsigned int origd = d;
3116            while (get_othercase_range(&cc, origd, &occ, &ocd))
3117              {
3118              if (occ >= (unsigned int)c &&
3119                  ocd <= (unsigned int)d)
3120                continue;                          /* Skip embedded ranges */
3121
3122              if (occ < (unsigned int)&&
3123                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3124                {                                  /* if there is overlap,   */
3125                c = occ;                           /* noting that if occ < c */
3126                continue;                          /* we can't have ocd > d  */
3127                }                                  /* because a subrange is  */
3128              if (ocd > (unsigned int)d &&
3129                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3130                {                                  /* the basic range.       */
3131                d = ocd;
3132                continue;
3133                }
3134
3135              if (occ == ocd)
3136                {
3137                *class_utf8data++ = XCL_SINGLE;
3138                }
3139              else
3140                {
3141                *class_utf8data++ = XCL_RANGE;
3142                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3143                }
3144              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3145              }
3146            }
3147#endif  /* SUPPORT_UCP */
3148
3149          /* Now record the original range, possibly modified for UCP caseless
3150          overlapping ranges. */
3151
3152          *class_utf8data++ = XCL_RANGE;
3153          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3154          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3155
3156          /* With UCP support, we are done. Without UCP support, there is no
3157          caseless matching for UTF-8 characters > 127; we can use the bit map
3158          for the smaller ones. */
3159
3160#ifdef SUPPORT_UCP
3161          continue;    /* With next character in the class */
3162#else
3163          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3164
3165          /* Adjust upper limit and fall through to set up the map */
3166
3167          d = 127;
3168
3169#endif  /* SUPPORT_UCP */
3170          }
3171#endif  /* SUPPORT_UTF8 */
3172
3173        /* We use the bit map for all cases when not in UTF-8 mode; else
3174        ranges that lie entirely within 0-127 when there is UCP support; else
3175        for partial ranges without UCP support. */
3176
3177        class_charcount += d - c + 1;
3178        class_lastchar = d;
3179
3180        /* We can save a bit of time by skipping this in the pre-compile. */
3181
3182        if (lengthptr == NULL) for (; c <= d; c++)
3183          {
3184          classbits[c/8] |= (1 << (c&7));
3185          if ((options & PCRE_CASELESS) != 0)
3186            {
3187            int uc = cd->fcc[c];           /* flip case */
3188            classbits[uc/8] |= (1 << (uc&7));
3189            }
3190          }
3191
3192        continue;   /* Go get the next char in the class */
3193        }
3194
3195      /* Handle a lone single character - we can get here for a normal
3196      non-escape char, or after \ that introduces a single character or for an
3197      apparent range that isn't. */
3198
3199      LONE_SINGLE_CHARACTER:
3200
3201      /* Handle a character that cannot go in the bit map */
3202
3203#ifdef SUPPORT_UTF8
3204      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3205        {
3206        class_utf8 = TRUE;
3207        *class_utf8data++ = XCL_SINGLE;
3208        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3209
3210#ifdef SUPPORT_UCP
3211        if ((options & PCRE_CASELESS) != 0)
3212          {
3213          unsigned int othercase;
3214          if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3215            {
3216            *class_utf8data++ = XCL_SINGLE;
3217            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3218            }
3219          }
3220#endif  /* SUPPORT_UCP */
3221
3222        }
3223      else
3224#endif  /* SUPPORT_UTF8 */
3225
3226      /* Handle a single-byte character */
3227        {
3228        classbits[c/8] |= (1 << (c&7));
3229        if ((options & PCRE_CASELESS) != 0)
3230          {
3231          c = cd->fcc[c];   /* flip case */
3232          classbits[c/8] |= (1 << (c&7));
3233          }
3234        class_charcount++;
3235        class_lastchar = c;
3236        }
3237      }
3238
3239    /* Loop until ']' reached. This "while" is the end of the "do" above. */
3240
3241    while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3242
3243    if (c == 0)                          /* Missing terminating ']' */
3244      {
3245      *errorcodeptr = ERR6;
3246      goto FAILED;
3247      }
3248
3249
3250/* This code has been disabled because it would mean that \s counts as
3251an explicit \r or \n reference, and that's not really what is wanted. Now
3252we set the flag only if there is a literal "\r" or "\n" in the class. */
3253
3254#if 0
3255    /* Remember whether \r or \n are in this class */
3256
3257    if (negate_class)
3258      {
3259      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3260      }
3261    else
3262      {
3263      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3264      }
3265#endif
3266
3267
3268    /* If class_charcount is 1, we saw precisely one character whose value is
3269    less than 256. As long as there were no characters >= 128 and there was no
3270    use of \p or \P, in other words, no use of any XCLASS features, we can
3271    optimize.
3272
3273    In UTF-8 mode, we can optimize the negative case only if there were no
3274    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3275    operate on single-bytes only. This is an historical hangover. Maybe one day
3276    we can tidy these opcodes to handle multi-byte characters.
3277
3278    The optimization throws away the bit map. We turn the item into a
3279    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3280    that OP_NOT does not support multibyte characters. In the positive case, it
3281    can cause firstbyte to be set. Otherwise, there can be no first char if
3282    this item is first, whatever repeat count may follow. In the case of
3283    reqbyte, save the previous value for reinstating. */
3284
3285#ifdef SUPPORT_UTF8
3286    if (class_charcount == 1 && !class_utf8 &&
3287      (!utf8 || !negate_class || class_lastchar < 128))
3288#else
3289    if (class_charcount == 1)
3290#endif
3291      {
3292      zeroreqbyte = reqbyte;
3293
3294      /* The OP_NOT opcode works on one-byte characters only. */
3295
3296      if (negate_class)
3297        {
3298        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3299        zerofirstbyte = firstbyte;
3300        *code++ = OP_NOT;
3301        *code++ = class_lastchar;
3302        break;
3303        }
3304
3305      /* For a single, positive character, get the value into mcbuffer, and
3306      then we can handle this with the normal one-character code. */
3307
3308#ifdef SUPPORT_UTF8
3309      if (utf8 && class_lastchar > 127)
3310        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3311      else
3312#endif
3313        {
3314        mcbuffer[0] = class_lastchar;
3315        mclength = 1;
3316        }
3317      goto ONE_CHAR;
3318      }       /* End of 1-char optimization */
3319
3320    /* The general case - not the one-char optimization. If this is the first
3321    thing in the branch, there can be no first char setting, whatever the
3322    repeat count. Any reqbyte setting must remain unchanged after any kind of
3323    repeat. */
3324
3325    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3326    zerofirstbyte = firstbyte;
3327    zeroreqbyte = reqbyte;
3328
3329    /* If there are characters with values > 255, we have to compile an
3330    extended class, with its own opcode. If there are no characters < 256,
3331    we can omit the bitmap in the actual compiled code. */
3332
3333#ifdef SUPPORT_UTF8
3334    if (class_utf8)
3335      {
3336      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3337      *code++ = OP_XCLASS;
3338      code += LINK_SIZE;
3339      *code = negate_class? XCL_NOT : 0;
3340
3341      /* If the map is required, move up the extra data to make room for it;
3342      otherwise just move the code pointer to the end of the extra data. */
3343
3344      if (class_charcount > 0)
3345        {
3346        *code++ |= XCL_MAP;
3347        memmove(code + 32, code, class_utf8data - code);
3348        memcpy(code, classbits, 32);
3349        code = class_utf8data + 32;
3350        }
3351      else code = class_utf8data;
3352
3353      /* Now fill in the complete length of the item */
3354
3355      PUT(previous, 1, code - previous);
3356      break;   /* End of class handling */
3357      }
3358#endif
3359
3360    /* If there are no characters > 255, negate the 32-byte map if necessary,
3361    and copy it into the code vector. If this is the first thing in the branch,
3362    there can be no first char setting, whatever the repeat count. Any reqbyte
3363    setting must remain unchanged after any kind of repeat. */
3364
3365    if (negate_class)
3366      {
3367      *code++ = OP_NCLASS;
3368      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3369        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3370      }
3371    else
3372      {
3373      *code++ = OP_CLASS;
3374      memcpy(code, classbits, 32);
3375      }
3376    code += 32;
3377    break;
3378
3379
3380    /* ===================================================================*/
3381    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3382    has been tested above. */
3383
3384    case '{':
3385    if (!is_quantifier) goto NORMAL_CHAR;
3386    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3387    if (*errorcodeptr != 0) goto FAILED;
3388    goto REPEAT;
3389
3390    case '*':
3391    repeat_min = 0;
3392    repeat_max = -1;
3393    goto REPEAT;
3394
3395    case '+':
3396    repeat_min = 1;
3397    repeat_max = -1;
3398    goto REPEAT;
3399
3400    case '?':
3401    repeat_min = 0;
3402    repeat_max = 1;
3403
3404    REPEAT:
3405    if (previous == NULL)
3406      {
3407      *errorcodeptr = ERR9;
3408      goto FAILED;
3409      }
3410
3411    if (repeat_min == 0)
3412      {
3413      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3414      reqbyte = zeroreqbyte;        /* Ditto */
3415      }
3416
3417    /* Remember whether this is a variable length repeat */
3418
3419    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3420
3421    op_type = 0;                    /* Default single-char op codes */
3422    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3423
3424    /* Save start of previous item, in case we have to move it up to make space
3425    for an inserted OP_ONCE for the additional '+' extension. */
3426
3427    tempcode = previous;
3428
3429    /* If the next character is '+', we have a possessive quantifier. This
3430    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3431    If the next character is '?' this is a minimizing repeat, by default,
3432    but if PCRE_UNGREEDY is set, it works the other way round. We change the
3433    repeat type to the non-default. */
3434
3435    if (ptr[1] == '+')
3436      {
3437      repeat_type = 0;                  /* Force greedy */
3438      possessive_quantifier = TRUE;
3439      ptr++;
3440      }
3441    else if (ptr[1] == '?')
3442      {
3443      repeat_type = greedy_non_default;
3444      ptr++;
3445      }
3446    else repeat_type = greedy_default;
3447
3448    /* If previous was a character match, abolish the item and generate a
3449    repeat item instead. If a char item has a minimum of more than one, ensure
3450    that it is set in reqbyte - it might not be if a sequence such as x{3} is
3451    the first thing in a branch because the x will have gone into firstbyte
3452    instead.  */
3453
3454    if (*previous == OP_CHAR || *previous == OP_CHARNC)
3455      {
3456      /* Deal with UTF-8 characters that take up more than one byte. It's
3457      easier to write this out separately than try to macrify it. Use c to
3458      hold the length of the character in bytes, plus 0x80 to flag that it's a
3459      length rather than a small character. */
3460
3461#ifdef SUPPORT_UTF8
3462      if (utf8 && (code[-1] & 0x80) != 0)
3463        {
3464        uschar *lastchar = code - 1;
3465        while((*lastchar & 0xc0) == 0x80) lastchar--;
3466        c = code - lastchar;            /* Length of UTF-8 character */
3467        memcpy(utf8_char, lastchar, c); /* Save the char */
3468        c |= 0x80;                      /* Flag c as a length */
3469        }
3470      else
3471#endif
3472
3473      /* Handle the case of a single byte - either with no UTF8 support, or
3474      with UTF-8 disabled, or for a UTF-8 character < 128. */
3475
3476        {
3477        c = code[-1];
3478        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3479        }
3480
3481      /* If the repetition is unlimited, it pays to see if the next thing on
3482      the line is something that cannot possibly match this character. If so,
3483      automatically possessifying this item gains some performance in the case
3484      where the match fails. */
3485
3486      if (!possessive_quantifier &&
3487          repeat_max < 0 &&
3488          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3489            options, cd))
3490        {
3491        repeat_type = 0;    /* Force greedy */
3492        possessive_quantifier = TRUE;
3493        }
3494
3495      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3496      }
3497
3498    /* If previous was a single negated character ([^a] or similar), we use
3499    one of the special opcodes, replacing it. The code is shared with single-
3500    character repeats by setting op_type to add a suitable offset into
3501    repeat_type. We can also test for auto-possessification. OP_NOT is
3502    currently used only for single-byte chars. */
3503
3504    else if (*previous == OP_NOT)
3505      {
3506      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3507      c = previous[1];
3508      if (!possessive_quantifier &&
3509          repeat_max < 0 &&
3510          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3511        {
3512        repeat_type = 0;    /* Force greedy */
3513        possessive_quantifier = TRUE;
3514        }
3515      goto OUTPUT_SINGLE_REPEAT;
3516      }
3517
3518    /* If previous was a character type match (\d or similar), abolish it and
3519    create a suitable repeat item. The code is shared with single-character
3520    repeats by setting op_type to add a suitable offset into repeat_type. Note
3521    that the Unicode property types will be present only when SUPPORT_UCP is
3522    defined, but we don't wrap the little bits of code here because it just
3523    makes it horribly messy. */
3524
3525    else if (*previous < OP_EODN)
3526      {
3527      uschar *oldcode;
3528      int prop_type, prop_value;
3529      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3530      c = *previous;
3531
3532      if (!possessive_quantifier &&
3533          repeat_max < 0 &&
3534          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3535        {
3536        repeat_type = 0;    /* Force greedy */
3537        possessive_quantifier = TRUE;
3538        }
3539
3540      OUTPUT_SINGLE_REPEAT:
3541      if (*previous == OP_PROP || *previous == OP_NOTPROP)
3542        {
3543        prop_type = previous[1];
3544        prop_value = previous[2];
3545        }
3546      else prop_type = prop_value = -1;
3547
3548      oldcode = code;
3549      code = previous;                  /* Usually overwrite previous item */
3550
3551      /* If the maximum is zero then the minimum must also be zero; Perl allows
3552      this case, so we do too - by simply omitting the item altogether. */
3553
3554      if (repeat_max == 0) goto END_REPEAT;
3555
3556      /* All real repeats make it impossible to handle partial matching (maybe
3557      one day we will be able to remove this restriction). */
3558
3559      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3560
3561      /* Combine the op_type with the repeat_type */
3562
3563      repeat_type += op_type;
3564
3565      /* A minimum of zero is handled either as the special case * or ?, or as
3566      an UPTO, with the maximum given. */
3567
3568      if (repeat_min == 0)
3569        {
3570        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3571          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3572        else
3573          {
3574          *code++ = OP_UPTO + repeat_type;
3575          PUT2INC(code, 0, repeat_max);
3576          }
3577        }
3578
3579      /* A repeat minimum of 1 is optimized into some special cases. If the
3580      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3581      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3582      one less than the maximum. */
3583
3584      else if (repeat_min == 1)
3585        {
3586        if (repeat_max == -1)
3587          *code++ = OP_PLUS + repeat_type;
3588        else
3589          {
3590          code = oldcode;                 /* leave previous item in place */
3591          if (repeat_max == 1) goto END_REPEAT;
3592          *code++ = OP_UPTO + repeat_type;
3593          PUT2INC(code, 0, repeat_max - 1);
3594          }
3595        }
3596
3597      /* The case {n,n} is just an EXACT, while the general case {n,m} is
3598      handled as an EXACT followed by an UPTO. */
3599
3600      else
3601        {
3602        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3603        PUT2INC(code, 0, repeat_min);
3604
3605        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3606        we have to insert the character for the previous code. For a repeated
3607        Unicode property match, there are two extra bytes that define the
3608        required property. In UTF-8 mode, long characters have their length in
3609        c, with the 0x80 bit as a flag. */
3610
3611        if (repeat_max < 0)
3612          {
3613#ifdef SUPPORT_UTF8
3614          if (utf8 && c >= 128)
3615            {
3616            memcpy(code, utf8_char, c & 7);
3617            code += c & 7;
3618            }
3619          else
3620#endif
3621            {
3622            *code++ = c;
3623            if (prop_type >= 0)
3624              {
3625              *code++ = prop_type;
3626              *code++ = prop_value;
3627              }
3628            }
3629          *code++ = OP_STAR + repeat_type;
3630          }
3631
3632        /* Else insert an UPTO if the max is greater than the min, again
3633        preceded by the character, for the previously inserted code. If the
3634        UPTO is just for 1 instance, we can use QUERY instead. */
3635
3636        else if (repeat_max != repeat_min)
3637          {
3638#ifdef SUPPORT_UTF8
3639          if (utf8 && c >= 128)
3640            {
3641            memcpy(code, utf8_char, c & 7);
3642            code += c & 7;
3643            }
3644          else
3645#endif
3646          *code++ = c;
3647          if (prop_type >= 0)
3648            {
3649            *code++ = prop_type;
3650            *code++ = prop_value;
3651            }
3652          repeat_max -= repeat_min;
3653
3654          if (repeat_max == 1)
3655            {
3656            *code++ = OP_QUERY + repeat_type;
3657            }
3658          else
3659            {
3660            *code++ = OP_UPTO + repeat_type;
3661            PUT2INC(code, 0, repeat_max);
3662            }
3663          }
3664        }
3665
3666      /* The character or character type itself comes last in all cases. */
3667
3668#ifdef SUPPORT_UTF8
3669      if (utf8 && c >= 128)
3670        {
3671        memcpy(code, utf8_char, c & 7);
3672        code += c & 7;
3673        }
3674      else
3675#endif
3676      *code++ = c;
3677
3678      /* For a repeated Unicode property match, there are two extra bytes that
3679      define the required property. */
3680
3681#ifdef SUPPORT_UCP
3682      if (prop_type >= 0)
3683        {
3684        *code++ = prop_type;
3685        *code++ = prop_value;
3686        }
3687#endif
3688      }
3689
3690    /* If previous was a character class or a back reference, we put the repeat
3691    stuff after it, but just skip the item if the repeat was {0,0}. */
3692
3693    else if (*previous == OP_CLASS ||
3694             *previous == OP_NCLASS ||
3695#ifdef SUPPORT_UTF8
3696             *previous == OP_XCLASS ||
3697#endif
3698             *previous == OP_REF)
3699      {
3700      if (repeat_max == 0)
3701        {
3702        code = previous;
3703        goto END_REPEAT;
3704        }
3705
3706      /* All real repeats make it impossible to handle partial matching (maybe
3707      one day we will be able to remove this restriction). */
3708
3709      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3710
3711      if (repeat_min == 0 && repeat_max == -1)
3712        *code++ = OP_CRSTAR + repeat_type;
3713      else if (repeat_min == 1 && repeat_max == -1)
3714        *code++ = OP_CRPLUS + repeat_type;
3715      else if (repeat_min == 0 && repeat_max == 1)
3716        *code++ = OP_CRQUERY + repeat_type;
3717      else
3718        {
3719        *code++ = OP_CRRANGE + repeat_type;
3720        PUT2INC(code, 0, repeat_min);
3721        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3722        PUT2INC(code, 0, repeat_max);
3723        }
3724      }
3725
3726    /* If previous was a bracket group, we may have to replicate it in certain
3727    cases. */
3728
3729    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3730             *previous == OP_ONCE || *previous == OP_COND)
3731      {
3732      register int i;
3733      int ketoffset = 0;
3734      int len = code - previous;
3735      uschar *bralink = NULL;
3736
3737      /* Repeating a DEFINE group is pointless */
3738
3739      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3740        {
3741        *errorcodeptr = ERR55;
3742        goto FAILED;
3743        }
3744
3745      /* If the maximum repeat count is unlimited, find the end of the bracket
3746      by scanning through from the start, and compute the offset back to it
3747      from the current code pointer. There may be an OP_OPT setting following
3748      the final KET, so we can't find the end just by going back from the code
3749      pointer. */
3750
3751      if (repeat_max == -1)
3752        {
3753        register uschar *ket = previous;
3754        do ket += GET(ket, 1); while (*ket != OP_KET);
3755        ketoffset = code - ket;
3756        }
3757
3758      /* The case of a zero minimum is special because of the need to stick
3759      OP_BRAZERO in front of it, and because the group appears once in the
3760      data, whereas in other cases it appears the minimum number of times. For
3761      this reason, it is simplest to treat this case separately, as otherwise
3762      the code gets far too messy. There are several special subcases when the
3763      minimum is zero. */
3764
3765      if (repeat_min == 0)
3766        {
3767        /* If the maximum is also zero, we just omit the group from the output
3768        altogether. */
3769
3770        if (repeat_max == 0)
3771          {
3772          code = previous;
3773          goto END_REPEAT;
3774          }
3775
3776        /* If the maximum is 1 or unlimited, we just have to stick in the
3777        BRAZERO and do no more at this point. However, we do need to adjust
3778        any OP_RECURSE calls inside the group that refer to the group itself or
3779        any internal or forward referenced group, because the offset is from
3780        the start of the whole regex. Temporarily terminate the pattern while
3781        doing this. */
3782
3783        if (repeat_max <= 1)
3784          {
3785          *code = OP_END;
3786          adjust_recurse(previous, 1, utf8, cd, save_hwm);
3787          memmove(previous+1, previous, len);
3788          code++;
3789          *previous++ = OP_BRAZERO + repeat_type;
3790          }
3791
3792        /* If the maximum is greater than 1 and limited, we have to replicate
3793        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3794        The first one has to be handled carefully because it's the original
3795        copy, which has to be moved up. The remainder can be handled by code
3796        that is common with the non-zero minimum case below. We have to
3797        adjust the value of repeat_max, since one less copy is required. Once
3798        again, we may have to adjust any OP_RECURSE calls inside the group. */
3799
3800        else
3801          {
3802          int offset;
3803          *code = OP_END;
3804          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3805          memmove(previous + 2 + LINK_SIZE, previous, len);
3806          code += 2 + LINK_SIZE;
3807          *previous++ = OP_BRAZERO + repeat_type;
3808          *previous++ = OP_BRA;
3809
3810          /* We chain together the bracket offset fields that have to be
3811          filled in later when the ends of the brackets are reached. */
3812
3813          offset = (bralink == NULL)? 0 : previous - bralink;
3814          bralink = previous;
3815          PUTINC(previous, 0, offset);
3816          }
3817
3818        repeat_max--;
3819        }
3820
3821      /* If the minimum is greater than zero, replicate the group as many
3822      times as necessary, and adjust the maximum to the number of subsequent
3823      copies that we need. If we set a first char from the group, and didn't
3824      set a required char, copy the latter from the former. If there are any
3825      forward reference subroutine calls in the group, there will be entries on
3826      the workspace list; replicate these with an appropriate increment. */
3827
3828      else
3829        {
3830        if (repeat_min > 1)
3831          {
3832          /* In the pre-compile phase, we don't actually do the replication. We
3833          just adjust the length as if we had. Do some paranoid checks for
3834          potential integer overflow. */
3835
3836          if (lengthptr != NULL)
3837            {
3838            int delta = (repeat_min - 1)*length_prevgroup;
3839            if ((double)(repeat_min - 1)*(double)length_prevgroup >
3840                                                            (double)INT_MAX ||
3841                OFLOW_MAX - *lengthptr < delta)
3842              {
3843              *errorcodeptr = ERR20;
3844              goto FAILED;
3845              }
3846            *lengthptr += delta;
3847            }
3848
3849          /* This is compiling for real */
3850
3851          else
3852            {
3853            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3854            for (i = 1; i < repeat_min; i++)
3855              {
3856              uschar *hc;
3857              uschar *this_hwm = cd->hwm;
3858              memcpy(code, previous, len);
3859              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3860                {
3861                PUT(cd->hwm, 0, GET(hc, 0) + len);
3862                cd->hwm += LINK_SIZE;
3863                }
3864              save_hwm = this_hwm;
3865              code += len;
3866              }
3867            }
3868          }
3869
3870        if (repeat_max > 0) repeat_max -= repeat_min;
3871        }
3872
3873      /* This code is common to both the zero and non-zero minimum cases. If
3874      the maximum is limited, it replicates the group in a nested fashion,
3875      remembering the bracket starts on a stack. In the case of a zero minimum,
3876      the first one was set up above. In all cases the repeat_max now specifies
3877      the number of additional copies needed. Again, we must remember to
3878      replicate entries on the forward reference list. */
3879
3880      if (repeat_max >= 0)
3881        {
3882        /* In the pre-compile phase, we don't actually do the replication. We
3883        just adjust the length as if we had. For each repetition we must add 1
3884        to the length for BRAZERO and for all but the last repetition we must
3885        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3886        paranoid checks to avoid integer overflow. */
3887
3888        if (lengthptr != NULL && repeat_max > 0)
3889          {
3890          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3891                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3892          if ((double)repeat_max *
3893                (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3894                  > (double)INT_MAX ||
3895              OFLOW_MAX - *lengthptr < delta)
3896            {
3897            *errorcodeptr = ERR20;
3898            goto FAILED;
3899            }
3900          *lengthptr += delta;
3901          }
3902
3903        /* This is compiling for real */
3904
3905        else for (i = repeat_max - 1; i >= 0; i--)
3906          {
3907          uschar *hc;
3908          uschar *this_hwm = cd->hwm;
3909
3910          *code++ = OP_BRAZERO + repeat_type;
3911
3912          /* All but the final copy start a new nesting, maintaining the
3913          chain of brackets outstanding. */
3914
3915          if (i != 0)
3916            {
3917            int offset;
3918            *code++ = OP_BRA;
3919            offset = (bralink == NULL)? 0 : code - bralink;
3920            bralink = code;
3921            PUTINC(code, 0, offset);
3922            }
3923
3924          memcpy(code, previous, len);
3925          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3926            {
3927            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3928            cd->hwm += LINK_SIZE;
3929            }
3930          save_hwm = this_hwm;
3931          code += len;
3932          }
3933
3934        /* Now chain through the pending brackets, and fill in their length
3935        fields (which are holding the chain links pro tem). */
3936
3937        while (bralink != NULL)
3938          {
3939          int oldlinkoffset;
3940          int offset = code - bralink + 1;
3941          uschar *bra = code - offset;
3942          oldlinkoffset = GET(bra, 1);
3943          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3944          *code++ = OP_KET;
3945          PUTINC(code, 0, offset);
3946          PUT(bra, 1, offset);
3947          }
3948        }
3949
3950      /* If the maximum is unlimited, set a repeater in the final copy. We
3951      can't just offset backwards from the current code point, because we
3952      don't know if there's been an options resetting after the ket. The
3953      correct offset was computed above.
3954
3955      Then, when we are doing the actual compile phase, check to see whether
3956      this group is a non-atomic one that could match an empty string. If so,
3957      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3958      that runtime checking can be done. [This check is also applied to
3959      atomic groups at runtime, but in a different way.] */
3960
3961      else
3962        {
3963        uschar *ketcode = code - ketoffset;
3964        uschar *bracode = ketcode - GET(ketcode, 1);
3965        *ketcode = OP_KETRMAX + repeat_type;
3966        if (lengthptr == NULL && *bracode != OP_ONCE)
3967          {
3968          uschar *scode = bracode;
3969          do
3970            {
3971            if (could_be_empty_branch(scode, ketcode, utf8))
3972              {
3973              *bracode += OP_SBRA - OP_BRA;
3974              break;
3975              }
3976            scode += GET(scode, 1);
3977            }
3978          while (*scode == OP_ALT);
3979          }
3980        }
3981      }
3982
3983    /* Else there's some kind of shambles */
3984
3985    else
3986      {
3987      *errorcodeptr = ERR11;
3988      goto FAILED;
3989      }
3990
3991    /* If the character following a repeat is '+', or if certain optimization
3992    tests above succeeded, possessive_quantifier is TRUE. For some of the
3993    simpler opcodes, there is a special alternative opcode for this. For
3994    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3995    The '+' notation is just syntactic sugar, taken from Sun's Java package,
3996    but the special opcodes can optimize it a bit. The repeated item starts at
3997    tempcode, not at previous, which might be the first part of a string whose
3998    (former) last char we repeated.
3999
4000    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4001    an 'upto' may follow. We skip over an 'exact' item, and then test the
4002    length of what remains before proceeding. */
4003
4004    if (possessive_quantifier)
4005      {
4006      int len;
4007      if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4008          *tempcode == OP_NOTEXACT)
4009        tempcode += _pcre_OP_lengths[*tempcode];
4010      len = code - tempcode;
4011      if (len > 0) switch (*tempcode)
4012        {
4013        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4014        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4015        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4016        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4017
4018        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4019        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4020        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4021        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4022
4023        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4024        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4025        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4026        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4027
4028        default:
4029        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4030        code += 1 + LINK_SIZE;
4031        len += 1 + LINK_SIZE;
4032        tempcode[0] = OP_ONCE;
4033        *code++ = OP_KET;
4034        PUTINC(code, 0, len);
4035        PUT(tempcode, 1, len);
4036        break;
4037        }
4038      }
4039
4040    /* In all cases we no longer have a previous item. We also set the
4041    "follows varying string" flag for subsequently encountered reqbytes if
4042    it isn't already set and we have just passed a varying length item. */
4043
4044    END_REPEAT:
4045    previous = NULL;
4046    cd->req_varyopt |= reqvary;
4047    break;
4048
4049
4050    /* ===================================================================*/
4051    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4052    lookbehind or option setting or condition or all the other extended
4053    parenthesis forms.  */
4054
4055    case '(':
4056    newoptions = options;
4057    skipbytes = 0;
4058    bravalue = OP_CBRA;
4059    save_hwm = cd->hwm;
4060    reset_bracount = FALSE;
4061
4062    /* First deal with various "verbs" that can be introduced by '*'. */
4063
4064    if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4065      {
4066      int i, namelen;
4067      const char *vn = verbnames;
4068      const uschar *name = ++ptr;
4069      previous = NULL;
4070      while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4071      if (*ptr == ':')
4072        {
4073        *errorcodeptr = ERR59;   /* Not supported */
4074        goto FAILED;
4075        }
4076      if (*ptr != ')')
4077        {
4078        *errorcodeptr = ERR60;
4079        goto FAILED;
4080        }
4081      namelen = ptr - name;
4082      for (i = 0; i < verbcount; i++)
4083        {
4084        if (namelen == verbs[i].len &&
4085            strncmp((char *)name, vn, namelen) == 0)
4086          {
4087          *code = verbs[i].op;
4088          if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4089          break;
4090          }
4091        vn += verbs[i].len + 1;
4092        }
4093      if (i < verbcount) continue;
4094      *errorcodeptr = ERR60;
4095      goto FAILED;
4096      }
4097
4098    /* Deal with the extended parentheses; all are introduced by '?', and the
4099    appearance of any of them means that this is not a capturing group. */
4100
4101    else if (*ptr == '?')
4102      {
4103      int i, set, unset, namelen;
4104      int *optset;
4105      const uschar *name;
4106      uschar *slot;
4107
4108      switch (*(++ptr))
4109        {
4110        case '#':                 /* Comment; skip to ket */
4111        ptr++;
4112        while (*ptr != 0 && *ptr != ')') ptr++;
4113        if (*ptr == 0)
4114          {
4115          *errorcodeptr = ERR18;
4116          goto FAILED;
4117          }
4118        continue;
4119
4120
4121        /* ------------------------------------------------------------ */
4122        case '|':                 /* Reset capture count for each branch */
4123        reset_bracount = TRUE;
4124        /* Fall through */
4125
4126        /* ------------------------------------------------------------ */
4127        case ':':                 /* Non-capturing bracket */
4128        bravalue = OP_BRA;
4129        ptr++;
4130        break;
4131
4132
4133        /* ------------------------------------------------------------ */
4134        case '(':
4135        bravalue = OP_COND;       /* Conditional group */
4136
4137        /* A condition can be an assertion, a number (referring to a numbered
4138        group), a name (referring to a named group), or 'R', referring to
4139        recursion. R<digits> and R&name are also permitted for recursion tests.
4140
4141        There are several syntaxes for testing a named group: (?(name)) is used
4142        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4143
4144        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4145        be the recursive thing or the name 'R' (and similarly for 'R' followed
4146        by digits), and (b) a number could be a name that consists of digits.
4147        In both cases, we look for a name first; if not found, we try the other
4148        cases. */
4149
4150        /* For conditions that are assertions, check the syntax, and then exit
4151        the switch. This will take control down to where bracketed groups,
4152        including assertions, are processed. */
4153
4154        if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4155          break;
4156
4157        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4158        below), and all need to skip 3 bytes at the start of the group. */
4159
4160        code[1+LINK_SIZE] = OP_CREF;
4161        skipbytes = 3;
4162        refsign = -1;
4163
4164        /* Check for a test for recursion in a named group. */
4165
4166        if (ptr[1] == 'R' && ptr[2] == '&')
4167          {
4168          terminator = -1;
4169          ptr += 2;
4170          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4171          }
4172
4173        /* Check for a test for a named group's having been set, using the Perl
4174        syntax (?(<name>) or (?('name') */
4175
4176        else if (ptr[1] == '<')
4177          {
4178          terminator = '>';
4179          ptr++;
4180          }
4181        else if (ptr[1] == '\'')
4182          {
4183          terminator = '\'';
4184          ptr++;
4185          }
4186        else
4187          {
4188          terminator = 0;
4189          if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4190          }
4191
4192        /* We now expect to read a name; anything else is an error */
4193
4194        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4195          {
4196          ptr += 1;  /* To get the right offset */
4197          *errorcodeptr = ERR28;
4198          goto FAILED;
4199          }
4200
4201        /* Read the name, but also get it as a number if it's all digits */
4202
4203        recno = 0;
4204        name = ++ptr;
4205        while ((cd->ctypes[*ptr] & ctype_word) != 0)
4206          {
4207          if (recno >= 0)
4208            recno = ((digitab[*ptr] & ctype_digit) != 0)?
4209              recno * 10 + *ptr - '0' : -1;
4210          ptr++;
4211          }
4212        namelen = ptr - name;
4213
4214        if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4215          {
4216          ptr--;      /* Error offset */
4217          *errorcodeptr = ERR26;
4218          goto FAILED;
4219          }
4220
4221        /* Do no further checking in the pre-compile phase. */
4222
4223        if (lengthptr != NULL) break;
4224
4225        /* In the real compile we do the work of looking for the actual
4226        reference. If the string started with "+" or "-" we require the rest to
4227        be digits, in which case recno will be set. */
4228
4229        if (refsign > 0)
4230          {
4231          if (recno <= 0)
4232            {
4233            *errorcodeptr = ERR58;
4234            goto FAILED;
4235            }
4236          if (refsign == '-')
4237            {
4238            recno = cd->bracount - recno + 1;
4239            if (recno <= 0)
4240              {
4241              *errorcodeptr = ERR15;
4242              goto FAILED;
4243              }
4244            }
4245          else recno += cd->bracount;
4246          PUT2(code, 2+LINK_SIZE, recno);
4247          break;
4248          }
4249
4250        /* Otherwise (did not start with "+" or "-"), start by looking for the
4251        name. */
4252
4253        slot = cd->name_table;
4254        for (i = 0; i < cd->names_found; i++)
4255          {
4256          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4257          slot += cd->name_entry_size;
4258          }
4259
4260        /* Found a previous named subpattern */
4261
4262        if (i < cd->names_found)
4263          {
4264          recno = GET2(slot, 0);
4265          PUT2(code, 2+LINK_SIZE, recno);
4266          }
4267
4268        /* Search the pattern for a forward reference */
4269
4270        else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4271                        (options & PCRE_EXTENDED) != 0)) > 0)
4272          {
4273          PUT2(code, 2+LINK_SIZE, i);
4274          }
4275
4276        /* If terminator == 0 it means that the name followed directly after
4277        the opening parenthesis [e.g. (?(abc)...] and in this case there are
4278        some further alternatives to try. For the cases where terminator != 0
4279        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4280        now checked all the possibilities, so give an error. */
4281
4282        else if (terminator != 0)
4283          {
4284          *errorcodeptr = ERR15;
4285          goto FAILED;
4286          }
4287
4288        /* Check for (?(R) for recursion. Allow digits after R to specify a
4289        specific group number. */
4290
4291        else if (*name == 'R')
4292          {
4293          recno = 0;
4294          for (i = 1; i < namelen; i++)
4295            {
4296            if ((digitab[name[i]] & ctype_digit) == 0)
4297              {
4298              *errorcodeptr = ERR15;
4299              goto FAILED;
4300              }
4301            recno = recno * 10 + name[i] - '0';
4302            }
4303          if (recno == 0) recno = RREF_ANY;
4304          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4305          PUT2(code, 2+LINK_SIZE, recno);
4306          }
4307
4308        /* Similarly, check for the (?(DEFINE) "condition", which is always
4309        false. */
4310
4311        else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4312          {
4313          code[1+LINK_SIZE] = OP_DEF;
4314          skipbytes = 1;
4315          }
4316
4317        /* Check for the "name" actually being a subpattern number. */
4318
4319        else if (recno > 0)
4320          {
4321          PUT2(code, 2+LINK_SIZE, recno);
4322          }
4323
4324        /* Either an unidentified subpattern, or a reference to (?(0) */
4325
4326        else
4327          {
4328          *errorcodeptr = (recno == 0)? ERR35: ERR15;
4329          goto FAILED;
4330          }
4331        break;
4332
4333
4334        /* ------------------------------------------------------------ */
4335        case '=':                 /* Positive lookahead */
4336        bravalue = OP_ASSERT;
4337        ptr++;
4338        break;
4339
4340
4341        /* ------------------------------------------------------------ */
4342        case '!':                 /* Negative lookahead */
4343        ptr++;
4344        if (*ptr == ')')          /* Optimize (?!) */
4345          {
4346          *code++ = OP_FAIL;
4347          previous = NULL;
4348          continue;
4349          }
4350        bravalue = OP_ASSERT_NOT;
4351        break;
4352
4353
4354        /* ------------------------------------------------------------ */
4355        case '<':                 /* Lookbehind or named define */
4356        switch (ptr[1])
4357          {
4358          case '=':               /* Positive lookbehind */
4359          bravalue = OP_ASSERTBACK;
4360          ptr += 2;
4361          break;
4362
4363          case '!':               /* Negative lookbehind */
4364          bravalue = OP_ASSERTBACK_NOT;
4365          ptr += 2;
4366          break;
4367
4368          default:                /* Could be name define, else bad */
4369          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4370          ptr++;                  /* Correct offset for error */
4371          *errorcodeptr = ERR24;
4372          goto FAILED;
4373          }
4374        break;
4375
4376
4377        /* ------------------------------------------------------------ */
4378        case '>':                 /* One-time brackets */
4379        bravalue = OP_ONCE;
4380        ptr++;
4381        break;
4382
4383
4384        /* ------------------------------------------------------------ */
4385        case 'C':                 /* Callout - may be followed by digits; */
4386        previous_callout = code;  /* Save for later completion */
4387        after_manual_callout = 1; /* Skip one item before completing */
4388        *code++ = OP_CALLOUT;
4389          {
4390          int n = 0;
4391          while ((digitab[*(++ptr)] & ctype_digit) != 0)
4392            n = n * 10 + *ptr - '0';
4393          if (*ptr != ')')
4394            {
4395            *errorcodeptr = ERR39;
4396            goto FAILED;
4397            }
4398          if (n > 255)
4399            {
4400            *errorcodeptr = ERR38;
4401            goto FAILED;
4402            }
4403          *code++ = n;
4404          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4405          PUT(code, LINK_SIZE, 0);                    /* Default length */
4406          code += 2 * LINK_SIZE;
4407          }
4408        previous = NULL;
4409        continue;
4410
4411
4412        /* ------------------------------------------------------------ */
4413        case 'P':                 /* Python-style named subpattern handling */
4414        if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4415          {
4416          is_recurse = *ptr == '>';
4417          terminator = ')';
4418          goto NAMED_REF_OR_RECURSE;
4419          }
4420        else if (*ptr != '<')    /* Test for Python-style definition */
4421          {
4422          *errorcodeptr = ERR41;
4423          goto FAILED;
4424          }
4425        /* Fall through to handle (?P< as (?< is handled */
4426
4427
4428        /* ------------------------------------------------------------ */
4429        DEFINE_NAME:    /* Come here from (?< handling */
4430        case '\'':
4431          {
4432          terminator = (*ptr == '<')? '>' : '\'';
4433          name = ++ptr;
4434
4435          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4436          namelen = ptr - name;
4437
4438          /* In the pre-compile phase, just do a syntax check. */
4439
4440          if (lengthptr != NULL)
4441            {
4442            if (*ptr != terminator)
4443              {
4444              *errorcodeptr = ERR42;
4445              goto FAILED;
4446              }
4447            if (cd->names_found >= MAX_NAME_COUNT)
4448              {
4449              *errorcodeptr = ERR49;
4450              goto FAILED;
4451              }
4452            if (namelen + 3 > cd->name_entry_size)
4453              {
4454              cd->name_entry_size = namelen + 3;
4455              if (namelen > MAX_NAME_SIZE)
4456                {
4457                *errorcodeptr = ERR48;
4458                goto FAILED;
4459                }
4460              }
4461            }
4462
4463          /* In the real compile, create the entry in the table */
4464
4465          else
4466            {
4467            slot = cd->name_table;
4468            for (i = 0; i < cd->names_found; i++)
4469              {
4470              int crc = memcmp(name, slot+2, namelen);
4471              if (crc == 0)
4472                {
4473                if (slot[2+namelen] == 0)
4474                  {
4475                  if ((options & PCRE_DUPNAMES) == 0)
4476                    {
4477                    *errorcodeptr = ERR43;
4478                    goto FAILED;
4479                    }
4480                  }
4481                else crc = -1;      /* Current name is substring */
4482                }
4483              if (crc < 0)
4484                {
4485                memmove(slot + cd->name_entry_size, slot,
4486                  (cd->names_found - i) * cd->name_entry_size);
4487                break;
4488                }
4489              slot += cd->name_entry_size;
4490              }
4491
4492            PUT2(slot, 0, cd->bracount + 1);
4493            memcpy(slot + 2, name, namelen);
4494            slot[2+namelen] = 0;
4495            }
4496          }
4497
4498        /* In both cases, count the number of names we've encountered. */
4499
4500        ptr++;                    /* Move past > or ' */
4501        cd->names_found++;
4502        goto NUMBERED_GROUP;
4503
4504
4505        /* ------------------------------------------------------------ */
4506        case '&':                 /* Perl recursion/subroutine syntax */
4507        terminator = ')';
4508        is_recurse = TRUE;
4509        /* Fall through */
4510
4511        /* We come here from the Python syntax above that handles both
4512        references (?P=name) and recursion (?P>name), as well as falling
4513        through from the Perl recursion syntax (?&name). */
4514
4515        NAMED_REF_OR_RECURSE:
4516        name = ++ptr;
4517        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4518        namelen = ptr - name;
4519
4520        /* In the pre-compile phase, do a syntax check and set a dummy
4521        reference number. */
4522
4523        if (lengthptr != NULL)
4524          {
4525          if (*ptr != terminator)
4526            {
4527            *errorcodeptr = ERR42;
4528            goto FAILED;
4529            }
4530          if (namelen > MAX_NAME_SIZE)
4531            {
4532            *errorcodeptr = ERR48;
4533            goto FAILED;
4534            }
4535          recno = 0;
4536          }
4537
4538        /* In the real compile, seek the name in the table */
4539
4540        else
4541          {
4542          slot = cd->name_table;
4543          for (i = 0; i < cd->names_found; i++)
4544            {
4545            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4546            slot += cd->name_entry_size;
4547            }
4548
4549          if (i < cd->names_found)         /* Back reference */
4550            {
4551            recno = GET2(slot, 0);
4552            }
4553          else if ((recno =                /* Forward back reference */
4554                    find_parens(ptr, cd->bracount, name, namelen,
4555                      (options & PCRE_EXTENDED) != 0)) <= 0)
4556            {
4557            *errorcodeptr = ERR15;
4558            goto FAILED;
4559            }
4560          }
4561
4562        /* In both phases, we can now go to the code that handles numerical
4563        recursion or backreferences. */
4564
4565        if (is_recurse) goto HANDLE_RECURSION;
4566          else goto HANDLE_REFERENCE;
4567
4568
4569        /* ------------------------------------------------------------ */
4570        case 'R':                 /* Recursion */
4571        ptr++;                    /* Same as (?0)      */
4572        /* Fall through */
4573
4574
4575        /* ------------------------------------------------------------ */
4576        case '-': case '+':
4577        case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4578        case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4579          {
4580          const uschar *called;
4581
4582          if ((refsign = *ptr) == '+') ptr++;
4583          else if (refsign == '-')
4584            {
4585            if ((digitab[ptr[1]] & ctype_digit) == 0)
4586              goto OTHER_CHAR_AFTER_QUERY;
4587            ptr++;
4588            }
4589
4590          recno = 0;
4591          while((digitab[*ptr] & ctype_digit) != 0)
4592            recno = recno * 10 + *ptr++ - '0';
4593
4594          if (*ptr != ')')
4595            {
4596            *errorcodeptr = ERR29;
4597            goto FAILED;
4598            }
4599
4600          if (refsign == '-')
4601            {
4602            if (recno == 0)
4603              {
4604              *errorcodeptr = ERR58;
4605              goto FAILED;
4606              }
4607            recno = cd->bracount - recno + 1;
4608            if (recno <= 0)
4609              {
4610              *errorcodeptr = ERR15;
4611              goto FAILED;
4612              }
4613            }
4614          else if (refsign == '+')
4615            {
4616            if (recno == 0)
4617              {
4618              *errorcodeptr = ERR58;
4619              goto FAILED;
4620              }
4621            recno += cd->bracount;
4622            }
4623
4624          /* Come here from code above that handles a named recursion */
4625
4626          HANDLE_RECURSION:
4627
4628          previous = code;
4629          called = cd->start_code;
4630
4631          /* When we are actually compiling, find the bracket that is being
4632          referenced. Temporarily end the regex in case it doesn't exist before
4633          this point. If we end up with a forward reference, first check that
4634          the bracket does occur later so we can give the error (and position)
4635          now. Then remember this forward reference in the workspace so it can
4636          be filled in at the end. */
4637
4638          if (lengthptr == NULL)
4639            {
4640            *code = OP_END;
4641            if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4642
4643            /* Forward reference */
4644
4645            if (called == NULL)
4646              {
4647              if (find_parens(ptr, cd->bracount, NULL, recno,
4648                   (options & PCRE_EXTENDED) != 0) < 0)
4649                {
4650                *errorcodeptr = ERR15;
4651                goto FAILED;
4652                }
4653              called = cd->start_code + recno;
4654              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4655              }
4656
4657            /* If not a forward reference, and the subpattern is still open,
4658            this is a recursive call. We check to see if this is a left
4659            recursion that could loop for ever, and diagnose that case. */
4660
4661            else if (GET(called, 1) == 0 &&
4662                     could_be_empty(called, code, bcptr, utf8))
4663              {
4664              *errorcodeptr = ERR40;
4665              goto FAILED;
4666              }
4667            }
4668
4669          /* Insert the recursion/subroutine item, automatically wrapped inside
4670          "once" brackets. Set up a "previous group" length so that a
4671          subsequent quantifier will work. */
4672
4673          *code = OP_ONCE;
4674          PUT(code, 1, 2 + 2*LINK_SIZE);
4675          code += 1 + LINK_SIZE;
4676
4677          *code = OP_RECURSE;
4678          PUT(code, 1, called - cd->start_code);
4679          code += 1 + LINK_SIZE;
4680
4681          *code = OP_KET;
4682          PUT(code, 1, 2 + 2*LINK_SIZE);
4683          code += 1 + LINK_SIZE;
4684
4685          length_prevgroup = 3 + 3*LINK_SIZE;
4686          }
4687
4688        /* Can't determine a first byte now */
4689
4690        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4691        continue;
4692
4693
4694        /* ------------------------------------------------------------ */
4695        default:              /* Other characters: check option setting */
4696        OTHER_CHAR_AFTER_QUERY:
4697        set = unset = 0;
4698        optset = &set;
4699
4700        while (*ptr != ')' && *ptr != ':')
4701          {
4702          switch (*ptr++)
4703            {
4704            case '-': optset = &unset; break;
4705
4706            case 'J':    /* Record that it changed in the external options */
4707            *optset |= PCRE_DUPNAMES;
4708            cd->external_flags |= PCRE_JCHANGED;
4709            break;
4710
4711            case 'i': *optset |= PCRE_CASELESS; break;
4712            case 'm': *optset |= PCRE_MULTILINE; break;
4713            case 's': *optset |= PCRE_DOTALL; break;
4714            case 'x': *optset |= PCRE_EXTENDED; break;
4715            case 'U': *optset |= PCRE_UNGREEDY; break;
4716            case 'X': *optset |= PCRE_EXTRA; break;
4717
4718            default:  *errorcodeptr = ERR12;
4719                      ptr--;    /* Correct the offset */
4720                      goto FAILED;
4721            }
4722          }
4723
4724        /* Set up the changed option bits, but don't change anything yet. */
4725
4726        newoptions = (options | set) & (~unset);
4727
4728        /* If the options ended with ')' this is not the start of a nested
4729        group with option changes, so the options change at this level. If this
4730        item is right at the start of the pattern, the options can be
4731        abstracted and made external in the pre-compile phase, and ignored in
4732        the compile phase. This can be helpful when matching -- for instance in
4733        caseless checking of required bytes.
4734
4735        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4736        definitely *not* at the start of the pattern because something has been
4737        compiled. In the pre-compile phase, however, the code pointer can have
4738        that value after the start, because it gets reset as code is discarded
4739        during the pre-compile. However, this can happen only at top level - if
4740        we are within parentheses, the starting BRA will still be present. At
4741        any parenthesis level, the length value can be used to test if anything
4742        has been compiled at that level. Thus, a test for both these conditions
4743        is necessary to ensure we correctly detect the start of the pattern in
4744        both phases.
4745
4746        If we are not at the pattern start, compile code to change the ims
4747        options if this setting actually changes any of them. We also pass the
4748        new setting back so that it can be put at the start of any following
4749        branches, and when this group ends (if we are in a group), a resetting
4750        item can be compiled. */
4751
4752        if (*ptr == ')')
4753          {
4754          if (code == cd->start_code + 1 + LINK_SIZE &&
4755               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4756            {
4757            cd->external_options = newoptions;
4758            options = newoptions;
4759            }
4760         else
4761            {
4762            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4763              {
4764              *code++ = OP_OPT;
4765              *code++ = newoptions & PCRE_IMS;
4766              }
4767
4768            /* Change options at this level, and pass them back for use
4769            in subsequent branches. Reset the greedy defaults and the case
4770            value for firstbyte and reqbyte. */
4771
4772            *optionsptr = options = newoptions;
4773            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4774            greedy_non_default = greedy_default ^ 1;
4775            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4776            }
4777
4778          previous = NULL;       /* This item can't be repeated */
4779          continue;              /* It is complete */
4780          }
4781
4782        /* If the options ended with ':' we are heading into a nested group
4783        with possible change of options. Such groups are non-capturing and are
4784        not assertions of any kind. All we need to do is skip over the ':';
4785        the newoptions value is handled below. */
4786
4787        bravalue = OP_BRA;
4788        ptr++;
4789        }     /* End of switch for character following (? */
4790      }       /* End of (? handling */
4791
4792    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4793    all unadorned brackets become non-capturing and behave like (?:...)
4794    brackets. */
4795
4796    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4797      {
4798      bravalue = OP_BRA;
4799      }
4800
4801    /* Else we have a capturing group. */
4802
4803    else
4804      {
4805      NUMBERED_GROUP:
4806      cd->bracount += 1;
4807      PUT2(code, 1+LINK_SIZE, cd->bracount);
4808      skipbytes = 2;
4809      }
4810
4811    /* Process nested bracketed regex. Assertions may not be repeated, but
4812    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4813    non-register variable in order to be able to pass its address because some
4814    compilers complain otherwise. Pass in a new setting for the ims options if
4815    they have changed. */
4816
4817    previous = (bravalue >= OP_ONCE)? code : NULL;
4818    *code = bravalue;
4819    tempcode = code;
4820    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4821    length_prevgroup = 0;              /* Initialize for pre-compile phase */
4822
4823    if (!compile_regex(
4824         newoptions,                   /* The complete new option state */
4825         options & PCRE_IMS,           /* The previous ims option state */
4826         &tempcode,                    /* Where to put code (updated) */
4827         &ptr,                         /* Input pointer (updated) */
4828         errorcodeptr,                 /* Where to put an error message */
4829         (bravalue == OP_ASSERTBACK ||
4830          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4831         reset_bracount,               /* True if (?| group */
4832         skipbytes,                    /* Skip over bracket number */
4833         &subfirstbyte,                /* For possible first char */
4834         &subreqbyte,                  /* For possible last char */
4835         bcptr,                        /* Current branch chain */
4836         cd,                           /* Tables block */
4837         (lengthptr == NULL)? NULL :   /* Actual compile phase */
4838           &length_prevgroup           /* Pre-compile phase */
4839         ))
4840      goto FAILED;
4841
4842    /* At the end of compiling, code is still pointing to the start of the
4843    group, while tempcode has been updated to point past the end of the group
4844    and any option resetting that may follow it. The pattern pointer (ptr)
4845    is on the bracket. */
4846
4847    /* If this is a conditional bracket, check that there are no more than
4848    two branches in the group, or just one if it's a DEFINE group. We do this
4849    in the real compile phase, not in the pre-pass, where the whole group may
4850    not be available. */
4851
4852    if (bravalue == OP_COND && lengthptr == NULL)
4853      {
4854      uschar *tc = code;
4855      int condcount = 0;
4856
4857      do {
4858         condcount++;
4859         tc += GET(tc,1);
4860         }
4861      while (*tc != OP_KET);
4862
4863      /* A DEFINE group is never obeyed inline (the "condition" is always
4864      false). It must have only one branch. */
4865
4866      if (code[LINK_SIZE+1] == OP_DEF)
4867        {
4868        if (condcount > 1)
4869          {
4870          *errorcodeptr = ERR54;
4871          goto FAILED;
4872          }
4873        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4874        }
4875
4876      /* A "normal" conditional group. If there is just one branch, we must not
4877      make use of its firstbyte or reqbyte, because this is equivalent to an
4878      empty second branch. */
4879
4880      else
4881        {
4882        if (condcount > 2)
4883          {
4884          *errorcodeptr = ERR27;
4885          goto FAILED;
4886          }
4887        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4888        }
4889      }
4890
4891    /* Error if hit end of pattern */
4892
4893    if (*ptr != ')')
4894      {
4895      *errorcodeptr = ERR14;
4896      goto FAILED;
4897      }
4898
4899    /* In the pre-compile phase, update the length by the length of the group,
4900    less the brackets at either end. Then reduce the compiled code to just a
4901    set of non-capturing brackets so that it doesn't use much memory if it is
4902    duplicated by a quantifier.*/
4903
4904    if (lengthptr != NULL)
4905      {
4906      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4907        {
4908        *errorcodeptr = ERR20;
4909        goto FAILED;
4910        }
4911      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4912      *code++ = OP_BRA;
4913      PUTINC(code, 0, 1 + LINK_SIZE);
4914      *code++ = OP_KET;
4915      PUTINC(code, 0, 1 + LINK_SIZE);
4916      break;    /* No need to waste time with special character handling */
4917      }
4918
4919    /* Otherwise update the main code pointer to the end of the group. */
4920
4921    code = tempcode;
4922
4923    /* For a DEFINE group, required and first character settings are not
4924    relevant. */
4925
4926    if (bravalue == OP_DEF) break;
4927
4928    /* Handle updating of the required and first characters for other types of
4929    group. Update for normal brackets of all kinds, and conditions with two
4930    branches (see code above). If the bracket is followed by a quantifier with
4931    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4932    zerofirstbyte outside the main loop so that they can be accessed for the
4933    back off. */
4934
4935    zeroreqbyte = reqbyte;
4936    zerofirstbyte = firstbyte;
4937    groupsetfirstbyte = FALSE;
4938
4939    if (bravalue >= OP_ONCE)
4940      {
4941      /* If we have not yet set a firstbyte in this branch, take it from the
4942      subpattern, remembering that it was set here so that a repeat of more
4943      than one can replicate it as reqbyte if necessary. If the subpattern has
4944      no firstbyte, set "none" for the whole branch. In both cases, a zero
4945      repeat forces firstbyte to "none". */
4946
4947      if (firstbyte == REQ_UNSET)
4948        {
4949        if (subfirstbyte >= 0)
4950          {
4951          firstbyte = subfirstbyte;
4952          groupsetfirstbyte = TRUE;
4953          }
4954        else firstbyte = REQ_NONE;
4955        zerofirstbyte = REQ_NONE;
4956        }
4957
4958      /* If firstbyte was previously set, convert the subpattern's firstbyte
4959      into reqbyte if there wasn't one, using the vary flag that was in
4960      existence beforehand. */
4961
4962      else if (subfirstbyte >= 0 && subreqbyte < 0)
4963        subreqbyte = subfirstbyte | tempreqvary;
4964
4965      /* If the subpattern set a required byte (or set a first byte that isn't
4966      really the first byte - see above), set it. */
4967
4968      if (subreqbyte >= 0) reqbyte = subreqbyte;
4969      }
4970
4971    /* For a forward assertion, we take the reqbyte, if set. This can be
4972    helpful if the pattern that follows the assertion doesn't set a different
4973    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4974    for an assertion, however because it leads to incorrect effect for patterns
4975    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4976    of a firstbyte. This is overcome by a scan at the end if there's no
4977    firstbyte, looking for an asserted first char. */
4978
4979    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4980    break;     /* End of processing '(' */
4981
4982
4983    /* ===================================================================*/
4984    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4985    are arranged to be the negation of the corresponding OP_values. For the
4986    back references, the values are ESC_REF plus the reference number. Only
4987    back references and those types that consume a character may be repeated.
4988    We can test for values between ESC_b and ESC_Z for the latter; this may
4989    have to change if any new ones are ever created. */
4990
4991    case '\\':
4992    tempptr = ptr;
4993    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4994    if (*errorcodeptr != 0) goto FAILED;
4995
4996    if (c < 0)
4997      {
4998      if (-c == ESC_Q)            /* Handle start of quoted string */
4999        {
5000        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5001          else inescq = TRUE;
5002        continue;
5003        }
5004
5005      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5006
5007      /* For metasequences that actually match a character, we disable the
5008      setting of a first character if it hasn't already been set. */
5009
5010      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5011        firstbyte = REQ_NONE;
5012
5013      /* Set values to reset to if this is followed by a zero repeat. */
5014
5015      zerofirstbyte = firstbyte;
5016      zeroreqbyte = reqbyte;
5017
5018      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5019      We also support \k{name} (.NET syntax) */
5020
5021      if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5022        {
5023        is_recurse = FALSE;
5024        terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5025        goto NAMED_REF_OR_RECURSE;
5026        }
5027
5028      /* Back references are handled specially; must disable firstbyte if
5029      not set to cope with cases like (?=(\w+))\1: which would otherwise set
5030      ':' later. */
5031
5032      if (-c >= ESC_REF)
5033        {
5034        recno = -c - ESC_REF;
5035
5036        HANDLE_REFERENCE:    /* Come here from named backref handling */
5037        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5038        previous = code;
5039        *code++ = OP_REF;
5040        PUT2INC(code, 0, recno);
5041        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5042        if (recno > cd->top_backref) cd->top_backref = recno;
5043        }
5044
5045      /* So are Unicode property matches, if supported. */
5046
5047#ifdef SUPPORT_UCP
5048      else if (-c == ESC_P || -c == ESC_p)
5049        {
5050        BOOL negated;
5051        int pdata;
5052        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5053        if (ptype < 0) goto FAILED;
5054        previous = code;
5055        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5056        *code++ = ptype;
5057        *code++ = pdata;
5058        }
5059#else
5060
5061      /* If Unicode properties are not supported, \X, \P, and \p are not
5062      allowed. */
5063
5064      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5065        {
5066        *errorcodeptr = ERR45;
5067        goto FAILED;
5068        }
5069#endif
5070
5071      /* For the rest (including \X when Unicode properties are supported), we
5072      can obtain the OP value by negating the escape value. */
5073
5074      else
5075        {
5076        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5077        *code++ = -c;
5078        }
5079      continue;
5080      }
5081
5082    /* We have a data character whose value is in c. In UTF-8 mode it may have
5083    a value > 127. We set its representation in the length/buffer, and then
5084    handle it as a data character. */
5085
5086#ifdef SUPPORT_UTF8
5087    if (utf8 && c > 127)
5088      mclength = _pcre_ord2utf8(c, mcbuffer);
5089    else
5090#endif
5091
5092     {
5093     mcbuffer[0] = c;
5094     mclength = 1;
5095     }
5096    goto ONE_CHAR;
5097
5098
5099    /* ===================================================================*/
5100    /* Handle a literal character. It is guaranteed not to be whitespace or #
5101    when the extended flag is set. If we are in UTF-8 mode, it may be a
5102    multi-byte literal character. */
5103
5104    default:
5105    NORMAL_CHAR:
5106    mclength = 1;
5107    mcbuffer[0] = c;
5108
5109#ifdef SUPPORT_UTF8
5110    if (utf8 && c >= 0xc0)
5111      {
5112      while ((ptr[1] & 0xc0) == 0x80)
5113        mcbuffer[mclength++] = *(++ptr);
5114      }
5115#endif
5116
5117    /* At this point we have the character's bytes in mcbuffer, and the length
5118    in mclength. When not in UTF-8 mode, the length is always 1. */
5119
5120    ONE_CHAR:
5121    previous = code;
5122    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5123    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5124
5125    /* Remember if \r or \n were seen */
5126
5127    if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5128      cd->external_flags |= PCRE_HASCRORLF;
5129
5130    /* Set the first and required bytes appropriately. If no previous first
5131    byte, set it from this character, but revert to none on a zero repeat.
5132    Otherwise, leave the firstbyte value alone, and don't change it on a zero
5133    repeat. */
5134
5135    if (firstbyte == REQ_UNSET)
5136      {
5137      zerofirstbyte = REQ_NONE;
5138      zeroreqbyte = reqbyte;
5139
5140      /* If the character is more than one byte long, we can set firstbyte
5141      only if it is not to be matched caselessly. */
5142
5143      if (mclength == 1 || req_caseopt == 0)
5144        {
5145        firstbyte = mcbuffer[0] | req_caseopt;
5146        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5147        }
5148      else firstbyte = reqbyte = REQ_NONE;
5149      }
5150
5151    /* firstbyte was previously set; we can set reqbyte only if the length is
5152    1 or the matching is caseful. */
5153
5154    else
5155      {
5156      zerofirstbyte = firstbyte;
5157      zeroreqbyte = reqbyte;
5158      if (mclength == 1 || req_caseopt == 0)
5159        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5160      }
5161
5162    break;            /* End of literal character handling */
5163    }
5164  }                   /* end of big loop */
5165
5166
5167/* Control never reaches here by falling through, only by a goto for all the
5168error states. Pass back the position in the pattern so that it can be displayed
5169to the user for diagnosing the error. */
5170
5171FAILED:
5172*ptrptr = ptr;
5173return FALSE;
5174}
5175
5176
5177
5178
5179/*************************************************
5180*     Compile sequence of alternatives           *
5181*************************************************/
5182
5183/* On entry, ptr is pointing past the bracket character, but on return it
5184points to the closing bracket, or vertical bar, or end of string. The code
5185variable is pointing at the byte into which the BRA operator has been stored.
5186If the ims options are changed at the start (for a (?ims: group) or during any
5187branch, we need to insert an OP_OPT item at the start of every following branch
5188to ensure they get set correctly at run time, and also pass the new options
5189into every subsequent branch compile.
5190
5191This function is used during the pre-compile phase when we are trying to find
5192out the amount of memory needed, as well as during the real compile phase. The
5193value of lengthptr distinguishes the two phases.
5194
5195Arguments:
5196  options        option bits, including any changes for this subpattern
5197  oldims         previous settings of ims option bits
5198  codeptr        -> the address of the current code pointer
5199  ptrptr         -> the address of the current pattern pointer
5200  errorcodeptr   -> pointer to error code variable
5201  lookbehind     TRUE if this is a lookbehind assertion
5202  reset_bracount TRUE to reset the count for each branch
5203  skipbytes      skip this many bytes at start (for brackets and OP_COND)
5204  firstbyteptr   place to put the first required character, or a negative number
5205  reqbyteptr     place to put the last required character, or a negative number
5206  bcptr          pointer to the chain of currently open branches
5207  cd             points to the data block with tables pointers etc.
5208  lengthptr      NULL during the real compile phase
5209                 points to length accumulator during pre-compile phase
5210
5211Returns:         TRUE on success
5212*/
5213
5214static BOOL
5215compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5216  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5217  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5218  int *lengthptr)
5219{
5220const uschar *ptr = *ptrptr;
5221uschar *code = *codeptr;
5222uschar *last_branch = code;
5223uschar *start_bracket = code;
5224uschar *reverse_count = NULL;
5225int firstbyte, reqbyte;
5226int branchfirstbyte, branchreqbyte;
5227int length;
5228int orig_bracount;
5229int max_bracount;
5230branch_chain bc;
5231
5232bc.outer = bcptr;
5233bc.current = code;
5234
5235firstbyte = reqbyte = REQ_UNSET;
5236
5237/* Accumulate the length for use in the pre-compile phase. Start with the
5238length of the BRA and KET and any extra bytes that are required at the
5239beginning. We accumulate in a local variable to save frequent testing of
5240lenthptr for NULL. We cannot do this by looking at the value of code at the
5241start and end of each alternative, because compiled items are discarded during
5242the pre-compile phase so that the work space is not exceeded. */
5243
5244length = 2 + 2*LINK_SIZE + skipbytes;
5245
5246/* WARNING: If the above line is changed for any reason, you must also change
5247the code that abstracts option settings at the start of the pattern and makes
5248them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5249pre-compile phase to find out whether anything has yet been compiled or not. */
5250
5251/* Offset is set zero to mark that this bracket is still open */
5252
5253PUT(code, 1, 0);
5254code += 1 + LINK_SIZE + skipbytes;
5255
5256/* Loop for each alternative branch */
5257
5258orig_bracount = max_bracount = cd->bracount;
5259for (;;)
5260  {
5261  /* For a (?| group, reset the capturing bracket count so that each branch
5262  uses the same numbers. */
5263
5264  if (reset_bracount) cd->bracount = orig_bracount;
5265
5266  /* Handle a change of ims options at the start of the branch */
5267
5268  if ((options & PCRE_IMS) != oldims)
5269    {
5270    *code++ = OP_OPT;
5271    *code++ = options & PCRE_IMS;
5272    length += 2;
5273    }
5274
5275  /* Set up dummy OP_REVERSE if lookbehind assertion */
5276
5277  if (lookbehind)
5278    {
5279    *code++ = OP_REVERSE;
5280    reverse_count = code;
5281    PUTINC(code, 0, 0);
5282    length += 1 + LINK_SIZE;
5283    }
5284
5285  /* Now compile the branch; in the pre-compile phase its length gets added
5286  into the length. */
5287
5288  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5289        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5290    {
5291    *ptrptr = ptr;
5292    return FALSE;
5293    }
5294
5295  /* Keep the highest bracket count in case (?| was used and some branch
5296  has fewer than the rest. */
5297
5298  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5299
5300  /* In the real compile phase, there is some post-processing to be done. */
5301
5302  if (lengthptr == NULL)
5303    {
5304    /* If this is the first branch, the firstbyte and reqbyte values for the
5305    branch become the values for the regex. */
5306
5307    if (*last_branch != OP_ALT)
5308      {
5309      firstbyte = branchfirstbyte;
5310      reqbyte = branchreqbyte;
5311      }
5312
5313    /* If this is not the first branch, the first char and reqbyte have to
5314    match the values from all the previous branches, except that if the
5315    previous value for reqbyte didn't have REQ_VARY set, it can still match,
5316    and we set REQ_VARY for the regex. */
5317
5318    else
5319      {
5320      /* If we previously had a firstbyte, but it doesn't match the new branch,
5321      we have to abandon the firstbyte for the regex, but if there was
5322      previously no reqbyte, it takes on the value of the old firstbyte. */
5323
5324      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5325        {
5326        if (reqbyte < 0) reqbyte = firstbyte;
5327        firstbyte = REQ_NONE;
5328        }
5329
5330      /* If we (now or from before) have no firstbyte, a firstbyte from the
5331      branch becomes a reqbyte if there isn't a branch reqbyte. */
5332
5333      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5334          branchreqbyte = branchfirstbyte;
5335
5336      /* Now ensure that the reqbytes match */
5337
5338      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5339        reqbyte = REQ_NONE;
5340      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5341      }
5342
5343    /* If lookbehind, check that this branch matches a fixed-length string, and
5344    put the length into the OP_REVERSE item. Temporarily mark the end of the
5345    branch with OP_END. */
5346
5347    if (lookbehind)
5348      {
5349      int fixed_length;
5350      *code = OP_END;
5351      fixed_length = find_fixedlength(last_branch, options);
5352      DPRINTF(("fixed length = %d\n", fixed_length));
5353      if (fixed_length < 0)
5354        {
5355        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5356        *ptrptr = ptr;
5357        return FALSE;
5358        }
5359      PUT(reverse_count, 0, fixed_length);
5360      }
5361    }
5362
5363  /* Reached end of expression, either ')' or end of pattern. In the real
5364  compile phase, go back through the alternative branches and reverse the chain
5365  of offsets, with the field in the BRA item now becoming an offset to the
5366  first alternative. If there are no alternatives, it points to the end of the
5367  group. The length in the terminating ket is always the length of the whole
5368  bracketed item. If any of the ims options were changed inside the group,
5369  compile a resetting op-code following, except at the very end of the pattern.
5370  Return leaving the pointer at the terminating char. */
5371
5372  if (*ptr != '|')
5373    {
5374    if (lengthptr == NULL)
5375      {
5376      int branch_length = code - last_branch;
5377      do
5378        {
5379        int prev_length = GET(last_branch, 1);
5380        PUT(last_branch, 1, branch_length);
5381        branch_length = prev_length;
5382        last_branch -= branch_length;
5383        }
5384      while (branch_length > 0);
5385      }
5386
5387    /* Fill in the ket */
5388
5389    *code = OP_KET;
5390    PUT(code, 1, code - start_bracket);
5391    code += 1 + LINK_SIZE;
5392
5393    /* Resetting option if needed */
5394
5395    if ((options & PCRE_IMS) != oldims && *ptr == ')')
5396      {
5397      *code++ = OP_OPT;
5398      *code++ = oldims;
5399      length += 2;
5400      }
5401
5402    /* Retain the highest bracket number, in case resetting was used. */
5403
5404    cd->bracount = max_bracount;
5405
5406    /* Set values to pass back */
5407
5408    *codeptr = code;
5409    *ptrptr = ptr;
5410    *firstbyteptr = firstbyte;
5411    *reqbyteptr = reqbyte;
5412    if (lengthptr != NULL)
5413      {
5414      if (OFLOW_MAX - *lengthptr < length)
5415        {
5416        *errorcodeptr = ERR20;
5417        return FALSE;
5418        }
5419      *lengthptr += length;
5420      }
5421    return TRUE;
5422    }
5423
5424  /* Another branch follows. In the pre-compile phase, we can move the code
5425  pointer back to where it was for the start of the first branch. (That is,
5426  pretend that each branch is the only one.)
5427
5428  In the real compile phase, insert an ALT node. Its length field points back
5429  to the previous branch while the bracket remains open. At the end the chain
5430  is reversed. It's done like this so that the start of the bracket has a
5431  zero offset until it is closed, making it possible to detect recursion. */
5432
5433  if (lengthptr != NULL)
5434    {
5435    code = *codeptr + 1 + LINK_SIZE + skipbytes;
5436    length += 1 + LINK_SIZE;
5437    }
5438  else
5439    {
5440    *code = OP_ALT;
5441    PUT(code, 1, code - last_branch);
5442    bc.current = last_branch = code;
5443    code += 1 + LINK_SIZE;
5444    }
5445
5446  ptr++;
5447  }
5448/* Control never reaches here */
5449}
5450
5451
5452
5453
5454/*************************************************
5455*          Check for anchored expression         *
5456*************************************************/
5457
5458/* Try to find out if this is an anchored regular expression. Consider each
5459alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5460all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5461it's anchored. However, if this is a multiline pattern, then only OP_SOD
5462counts, since OP_CIRC can match in the middle.
5463
5464We can also consider a regex to be anchored if OP_SOM starts all its branches.
5465This is the code for \G, which means "match at start of match position, taking
5466into account the match offset".
5467
5468A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5469because that will try the rest of the pattern at all possible matching points,
5470so there is no point trying again.... er ....
5471
5472.... except when the .* appears inside capturing parentheses, and there is a
5473subsequent back reference to those parentheses. We haven't enough information
5474to catch that case precisely.
5475
5476At first, the best we could do was to detect when .* was in capturing brackets
5477and the highest back reference was greater than or equal to that level.
5478However, by keeping a bitmap of the first 31 back references, we can catch some
5479of the more common cases more precisely.
5480
5481Arguments:
5482  code           points to start of expression (the bracket)
5483  options        points to the options setting
5484  bracket_map    a bitmap of which brackets we are inside while testing; this
5485                  handles up to substring 31; after that we just have to take
5486                  the less precise approach
5487  backref_map    the back reference bitmap
5488
5489Returns:     TRUE or FALSE
5490*/
5491
5492static BOOL
5493is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5494  unsigned int backref_map)
5495{
5496do {
5497   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5498     options, PCRE_MULTILINE, FALSE);
5499   register int op = *scode;
5500
5501   /* Non-capturing brackets */
5502
5503   if (op == OP_BRA)
5504     {
5505     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5506     }
5507
5508   /* Capturing brackets */
5509
5510   else if (op == OP_CBRA)
5511     {
5512     int n = GET2(scode, 1+LINK_SIZE);
5513     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5514     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5515     }
5516
5517   /* Other brackets */
5518
5519   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5520     {
5521     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5522     }
5523
5524   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5525   are or may be referenced. */
5526
5527   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5528             op == OP_TYPEPOSSTAR) &&
5529            (*options & PCRE_DOTALL) != 0)
5530     {
5531     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5532     }
5533
5534   /* Check for explicit anchoring */
5535
5536   else if (op != OP_SOD && op != OP_SOM &&
5537           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5538     return FALSE;
5539   code += GET(code, 1);
5540   }
5541while (*code == OP_ALT);   /* Loop for each alternative */
5542return TRUE;
5543}
5544
5545
5546
5547/*************************************************
5548*         Check for starting with ^ or .*        *
5549*************************************************/
5550
5551/* This is called to find out if every branch starts with ^ or .* so that
5552"first char" processing can be done to speed things up in multiline
5553matching and for non-DOTALL patterns that start with .* (which must start at
5554the beginning or after \n). As in the case of is_anchored() (see above), we
5555have to take account of back references to capturing brackets that contain .*
5556because in that case we can't make the assumption.
5557
5558Arguments:
5559  code           points to start of expression (the bracket)
5560  bracket_map    a bitmap of which brackets we are inside while testing; this
5561                  handles up to substring 31; after that we just have to take
5562                  the less precise approach
5563  backref_map    the back reference bitmap
5564
5565Returns:         TRUE or FALSE
5566*/
5567
5568static BOOL
5569is_startline(const uschar *code, unsigned int bracket_map,
5570  unsigned int backref_map)
5571{
5572do {
5573   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5574     NULL, 0, FALSE);
5575   register int op = *scode;
5576
5577   /* Non-capturing brackets */
5578
5579   if (op == OP_BRA)
5580     {
5581     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5582     }
5583
5584   /* Capturing brackets */
5585
5586   else if (op == OP_CBRA)
5587     {
5588     int n = GET2(scode, 1+LINK_SIZE);
5589     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5590     if (!is_startline(scode, new_map, backref_map)) return FALSE;
5591     }
5592
5593   /* Other brackets */
5594
5595   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5596     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5597
5598   /* .* means "start at start or after \n" if it isn't in brackets that
5599   may be referenced. */
5600
5601   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5602     {
5603     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5604     }
5605
5606   /* Check for explicit circumflex */
5607
5608   else if (op != OP_CIRC) return FALSE;
5609
5610   /* Move on to the next alternative */
5611
5612   code += GET(code, 1);
5613   }
5614while (*code == OP_ALT);  /* Loop for each alternative */
5615return TRUE;
5616}
5617
5618
5619
5620/*************************************************
5621*       Check for asserted fixed first char      *
5622*************************************************/
5623
5624/* During compilation, the "first char" settings from forward assertions are
5625discarded, because they can cause conflicts with actual literals that follow.
5626However, if we end up without a first char setting for an unanchored pattern,
5627it is worth scanning the regex to see if there is an initial asserted first
5628char. If all branches start with the same asserted char, or with a bracket all
5629of whose alternatives start with the same asserted char (recurse ad lib), then
5630we return that char, otherwise -1.
5631
5632Arguments:
5633  code       points to start of expression (the bracket)
5634  options    pointer to the options (used to check casing changes)
5635  inassert   TRUE if in an assertion
5636
5637Returns:     -1 or the fixed first char
5638*/
5639
5640static int
5641find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5642{
5643register int c = -1;
5644do {
5645   int d;
5646   const uschar *scode =
5647     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5648   register int op = *scode;
5649
5650   switch(op)
5651     {
5652     default:
5653     return -1;
5654
5655     case OP_BRA:
5656     case OP_CBRA:
5657     case OP_ASSERT:
5658     case OP_ONCE:
5659     case OP_COND:
5660     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5661       return -1;
5662     if (c < 0) c = d; else if (c != d) return -1;
5663     break;
5664
5665     case OP_EXACT:       /* Fall through */
5666     scode += 2;
5667
5668     case OP_CHAR:
5669     case OP_CHARNC:
5670     case OP_PLUS:
5671     case OP_MINPLUS:
5672     case OP_POSPLUS:
5673     if (!inassert) return -1;
5674     if (c < 0)
5675       {
5676       c = scode[1];
5677       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5678       }
5679     else if (c != scode[1]) return -1;
5680     break;
5681     }
5682
5683   code += GET(code, 1);
5684   }
5685while (*code == OP_ALT);
5686return c;
5687}
5688
5689
5690
5691/*************************************************
5692*        Compile a Regular Expression            *
5693*************************************************/
5694
5695/* This function takes a string and returns a pointer to a block of store
5696holding a compiled version of the expression. The original API for this
5697function had no error code return variable; it is retained for backwards
5698compatibility. The new function is given a new name.
5699
5700Arguments:
5701  pattern       the regular expression
5702  options       various option bits
5703  errorcodeptr  pointer to error code variable (pcre_compile2() only)
5704                  can be NULL if you don't want a code value
5705  errorptr      pointer to pointer to error text
5706  erroroffset   ptr offset in pattern where error was detected
5707  tables        pointer to character tables or NULL
5708
5709Returns:        pointer to compiled data block, or NULL on error,
5710                with errorptr and erroroffset set
5711*/
5712
5713PCRE_EXP_DEFN pcre *
5714pcre_compile(const char *pattern, int options, const char **errorptr,
5715  int *erroroffset, const unsigned char *tables)
5716{
5717return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5718}
5719
5720
5721PCRE_EXP_DEFN pcre *
5722pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5723  const char **errorptr, int *erroroffset, const unsigned char *tables)
5724{
5725real_pcre *re;
5726int length = 1;  /* For final END opcode */
5727int firstbyte, reqbyte, newline;
5728int errorcode = 0;
5729int skipatstart = 0;
5730#ifdef SUPPORT_UTF8
5731BOOL utf8;
5732#endif
5733size_t size;
5734uschar *code;
5735const uschar *codestart;
5736const uschar *ptr;
5737compile_data compile_block;
5738compile_data *cd = &compile_block;
5739
5740/* This space is used for "compiling" into during the first phase, when we are
5741computing the amount of memory that is needed. Compiled items are thrown away
5742as soon as possible, so that a fairly large buffer should be sufficient for
5743this purpose. The same space is used in the second phase for remembering where
5744to fill in forward references to subpatterns. */
5745
5746uschar cworkspace[COMPILE_WORK_SIZE];
5747
5748
5749/* Set this early so that early errors get offset 0. */
5750
5751ptr = (const uschar *)pattern;
5752
5753/* We can't pass back an error message if errorptr is NULL; I guess the best we
5754can do is just return NULL, but we can set a code value if there is a code
5755pointer. */
5756
5757if (errorptr == NULL)
5758  {
5759  if (errorcodeptr != NULL) *errorcodeptr = 99;
5760  return NULL;
5761  }
5762
5763*errorptr = NULL;
5764if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5765
5766/* However, we can give a message for this error */
5767
5768if (erroroffset == NULL)
5769  {
5770  errorcode = ERR16;
5771  goto PCRE_EARLY_ERROR_RETURN2;
5772  }
5773
5774*erroroffset = 0;
5775
5776/* Can't support UTF8 unless PCRE has been compiled to include the code. */
5777
5778#ifdef SUPPORT_UTF8
5779utf8 = (options & PCRE_UTF8) != 0;
5780if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5781     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5782  {
5783  errorcode = ERR44;
5784  goto PCRE_EARLY_ERROR_RETURN2;
5785  }
5786#else
5787if ((options & PCRE_UTF8) != 0)
5788  {
5789  errorcode = ERR32;
5790  goto PCRE_EARLY_ERROR_RETURN;
5791  }
5792#endif
5793
5794if ((options & ~PUBLIC_OPTIONS) != 0)
5795  {
5796  errorcode = ERR17;
5797  goto PCRE_EARLY_ERROR_RETURN;
5798  }
5799
5800/* Set up pointers to the individual character tables */
5801
5802if (tables == NULL) tables = _pcre_default_tables;
5803cd->lcc = tables + lcc_offset;
5804cd->fcc = tables + fcc_offset;
5805cd->cbits = tables + cbits_offset;
5806cd->ctypes = tables + ctypes_offset;
5807
5808/* Check for global one-time settings at the start of the pattern, and remember
5809the offset for later. */
5810
5811while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5812  {
5813  int newnl = 0;
5814  int newbsr = 0;
5815
5816  if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5817    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5818  else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
5819    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5820  else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
5821    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5822  else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5823    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5824  else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
5825    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5826
5827  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5828    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5829  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5830    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5831
5832  if (newnl != 0)
5833    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5834  else if (newbsr != 0)
5835    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5836  else break;
5837  }
5838
5839/* Check validity of \R options. */
5840
5841switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5842  {
5843  case 0:
5844  case PCRE_BSR_ANYCRLF:
5845  case PCRE_BSR_UNICODE:
5846  break;
5847  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5848  }
5849
5850/* Handle different types of newline. The three bits give seven cases. The
5851current code allows for fixed one- or two-byte sequences, plus "any" and
5852"anycrlf". */
5853
5854switch (options & PCRE_NEWLINE_BITS)
5855  {
5856  case 0: newline = NEWLINE; break;   /* Build-time default */
5857  case PCRE_NEWLINE_CR: newline = '\r'; break;
5858  case PCRE_NEWLINE_LF: newline = '\n'; break;
5859  case PCRE_NEWLINE_CR+
5860       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5861  case PCRE_NEWLINE_ANY: newline = -1; break;
5862  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5863  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5864  }
5865
5866if (newline == -2)
5867  {
5868  cd->nltype = NLTYPE_ANYCRLF;
5869  }
5870else if (newline < 0)
5871  {
5872  cd->nltype = NLTYPE_ANY;
5873  }
5874else
5875  {
5876  cd->nltype = NLTYPE_FIXED;
5877  if (newline > 255)
5878    {
5879    cd->nllen = 2;
5880    cd->nl[0] = (newline >> 8) & 255;
5881    cd->nl[1] = newline & 255;
5882    }
5883  else
5884    {
5885    cd->nllen = 1;
5886    cd->nl[0] = newline;
5887    }
5888  }
5889
5890/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5891references to help in deciding whether (.*) can be treated as anchored or not.
5892*/
5893
5894cd->top_backref = 0;
5895cd->backref_map = 0;
5896
5897/* Reflect pattern for debugging output */
5898
5899DPRINTF(("------------------------------------------------------------------\n"));
5900DPRINTF(("%s\n", pattern));
5901
5902/* Pretend to compile the pattern while actually just accumulating the length
5903of memory required. This behaviour is triggered by passing a non-NULL final
5904argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5905to compile parts of the pattern into; the compiled code is discarded when it is
5906no longer needed, so hopefully this workspace will never overflow, though there
5907is a test for its doing so. */
5908
5909cd->bracount = 0;
5910cd->names_found = 0;
5911cd->name_entry_size = 0;
5912cd->name_table = NULL;
5913cd->start_workspace = cworkspace;
5914cd->start_code = cworkspace;
5915cd->hwm = cworkspace;
5916cd->start_pattern = (const uschar *)pattern;
5917cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5918cd->req_varyopt = 0;
5919cd->external_options = options;
5920cd->external_flags = 0;
5921
5922/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5923don't need to look at the result of the function here. The initial options have
5924been put into the cd block so that they can be changed if an option setting is
5925found within the regex right at the beginning. Bringing initial option settings
5926outside can help speed up starting point checks. */
5927
5928ptr += skipatstart;
5929code = cworkspace;
5930*code = OP_BRA;
5931(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5932  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5933  &length);
5934if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5935
5936DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5937  cd->hwm - cworkspace));
5938
5939if (length > MAX_PATTERN_SIZE)
5940  {
5941  errorcode = ERR20;
5942  goto PCRE_EARLY_ERROR_RETURN;
5943  }
5944
5945/* Compute the size of data block needed and get it, either from malloc or
5946externally provided function. Integer overflow should no longer be possible
5947because nowadays we limit the maximum value of cd->names_found and
5948cd->name_entry_size. */
5949
5950size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5951re = (real_pcre *)(pcre_malloc)(size);
5952
5953if (re == NULL)
5954  {
5955  errorcode = ERR21;
5956  goto PCRE_EARLY_ERROR_RETURN;
5957  }
5958
5959/* Put in the magic number, and save the sizes, initial options, internal
5960flags, and character table pointer. NULL is used for the default character
5961tables. The nullpad field is at the end; it's there to help in the case when a
5962regex compiled on a system with 4-byte pointers is run on another with 8-byte
5963pointers. */
5964
5965re->magic_number = MAGIC_NUMBER;
5966re->size = size;
5967re->options = cd->external_options;
5968re->flags = cd->external_flags;
5969re->dummy1 = 0;
5970re->first_byte = 0;
5971re->req_byte = 0;
5972re->name_table_offset = sizeof(real_pcre);
5973re->name_entry_size = cd->name_entry_size;
5974re->name_count = cd->names_found;
5975re->ref_count = 0;
5976re->tables = (tables == _pcre_default_tables)? NULL : tables;
5977re->nullpad = NULL;
5978
5979/* The starting points of the name/number translation table and of the code are
5980passed around in the compile data block. The start/end pattern and initial
5981options are already set from the pre-compile phase, as is the name_entry_size
5982field. Reset the bracket count and the names_found field. Also reset the hwm
5983field; this time it's used for remembering forward references to subpatterns.
5984*/
5985
5986cd->bracount = 0;
5987cd->names_found = 0;
5988cd->name_table = (uschar *)re + re->name_table_offset;
5989codestart = cd->name_table + re->name_entry_size * re->name_count;
5990cd->start_code = codestart;
5991cd->hwm = cworkspace;
5992cd->req_varyopt = 0;
5993cd->had_accept = FALSE;
5994
5995/* Set up a starting, non-extracting bracket, then compile the expression. On
5996error, errorcode will be set non-zero, so we don't need to look at the result
5997of the function here. */
5998
5999ptr = (const uschar *)pattern + skipatstart;
6000code = (uschar *)codestart;
6001*code = OP_BRA;
6002(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6003  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6004re->top_bracket = cd->bracount;
6005re->top_backref = cd->top_backref;
6006re->flags = cd->external_flags;
6007
6008if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6009
6010/* If not reached end of pattern on success, there's an excess bracket. */
6011
6012if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6013
6014/* Fill in the terminating state and check for disastrous overflow, but
6015if debugging, leave the test till after things are printed out. */
6016
6017*code++ = OP_END;
6018
6019#ifndef DEBUG
6020if (code - codestart > length) errorcode = ERR23;
6021#endif
6022
6023/* Fill in any forward references that are required. */
6024
6025while (errorcode == 0 && cd->hwm > cworkspace)
6026  {
6027  int offset, recno;
6028  const uschar *groupptr;
6029  cd->hwm -= LINK_SIZE;
6030  offset = GET(cd->hwm, 0);
6031  recno = GET(codestart, offset);
6032  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6033  if (groupptr == NULL) errorcode = ERR53;
6034    else PUT(((uschar *)codestart), offset, groupptr - codestart);
6035  }
6036
6037/* Give an error if there's back reference to a non-existent capturing
6038subpattern. */
6039
6040if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6041
6042/* Failed to compile, or error while post-processing */
6043
6044if (errorcode != 0)
6045  {
6046  (pcre_free)(re);
6047  PCRE_EARLY_ERROR_RETURN:
6048  *erroroffset = ptr - (const uschar *)pattern;
6049  PCRE_EARLY_ERROR_RETURN2:
6050  *errorptr = find_error_text(errorcode);
6051  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6052  return NULL;
6053  }
6054
6055/* If the anchored option was not passed, set the flag if we can determine that
6056the pattern is anchored by virtue of ^ characters or \A or anything else (such
6057as starting with .* when DOTALL is set).
6058
6059Otherwise, if we know what the first byte has to be, save it, because that
6060speeds up unanchored matches no end. If not, see if we can set the
6061PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6062start with ^. and also when all branches start with .* for non-DOTALL matches.
6063*/
6064
6065if ((re->options & PCRE_ANCHORED) == 0)
6066  {
6067  int temp_options = re->options;   /* May get changed during these scans */
6068  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6069    re->options |= PCRE_ANCHORED;
6070  else
6071    {
6072    if (firstbyte < 0)
6073      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6074    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6075      {
6076      int ch = firstbyte & 255;
6077      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6078         cd->fcc[ch] == ch)? ch : firstbyte;
6079      re->flags |= PCRE_FIRSTSET;
6080      }
6081    else if (is_startline(codestart, 0, cd->backref_map))
6082      re->flags |= PCRE_STARTLINE;
6083    }
6084  }
6085
6086/* For an anchored pattern, we use the "required byte" only if it follows a
6087variable length item in the regex. Remove the caseless flag for non-caseable
6088bytes. */
6089
6090if (reqbyte >= 0 &&
6091     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6092  {
6093  int ch = reqbyte & 255;
6094  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6095    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6096  re->flags |= PCRE_REQCHSET;
6097  }
6098
6099/* Print out the compiled data if debugging is enabled. This is never the
6100case when building a production library. */
6101
6102#ifdef DEBUG
6103
6104printf("Length = %d top_bracket = %d top_backref = %d\n",
6105  length, re->top_bracket, re->top_backref);
6106
6107printf("Options=%08x\n", re->options);
6108
6109if ((re->flags & PCRE_FIRSTSET) != 0)
6110  {
6111  int ch = re->first_byte & 255;
6112  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6113    "" : " (caseless)";
6114  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6115    else printf("First char = \\x%02x%s\n", ch, caseless);
6116  }
6117
6118if ((re->flags & PCRE_REQCHSET) != 0)
6119  {
6120  int ch = re->req_byte & 255;
6121  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6122    "" : " (caseless)";
6123  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6124    else printf("Req char = \\x%02x%s\n", ch, caseless);
6125  }
6126
6127pcre_printint(re, stdout, TRUE);
6128
6129/* This check is done here in the debugging case so that the code that
6130was compiled can be seen. */
6131
6132if (code - codestart > length)
6133  {
6134  (pcre_free)(re);
6135  *errorptr = find_error_text(ERR23);
6136  *erroroffset = ptr - (uschar *)pattern;
6137  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6138  return NULL;
6139  }
6140#endif   /* DEBUG */
6141
6142return (pcre *)re;
6143}
6144
6145/* End of pcre_compile.c */
Note: See TracBrowser for help on using the repository browser.