source: project/chicken/branches/prerelease/pcre/pcre_compile.c @ 11958

Last change on this file since 11958 was 11958, checked in by Ivan Raikov, 11 years ago

Merged trunk and prerelease.

File size: 197.9 KB
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK cd             /* Block containing newline information */
50#define PSSTART start_pattern  /* Field containing processed string start */
51#define PSEND   end_pattern    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When DEBUG is defined, we need the pcre_printint() function, which is also
57used by pcretest. DEBUG is not defined when building a production library. */
58
59#ifdef DEBUG
60#include "pcre_printint.src"
61#endif
62
63
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (byte b/8, bit b%8). Both arguments and the whole expansion are
parenthesized so that expression arguments such as SETBIT(map, c + 1) select
the intended byte and bit. Each argument is evaluated more than once, so do
not pass expressions with side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68/* Maximum length value to check against when making sure that the integer that
69holds the compiled pattern length does not overflow. We make it a bit less than
70INT_MAX to allow for adding in group terminating bytes, so that we don't have
71to check them every time. */
72
73#define OFLOW_MAX (INT_MAX - 20)
74
75
76/*************************************************
77*      Code parameters and static tables         *
78*************************************************/
79
80/* This value specifies the size of stack workspace that is used during the
81first pre-compile phase that determines how much memory is required. The regex
82is partly compiled into this space, but the compiled parts are discarded as
83soon as they can be, so that hopefully there will never be an overrun. The code
84does, however, check for an overrun. The largest amount I've seen used is 218,
85so this number is very generous.
86
87The same workspace is used during the second, actual compile phase for
88remembering forward references to groups so that they can be filled in at the
89end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90is 4 there is plenty of room. */
91
92#define COMPILE_WORK_SIZE (4096)
93
94
95/* Table for handling escaped characters in the range '0'-'z'. Positive returns
96are simple data values; negative values are for special things like \d and so
97on. Zero means further processing is needed (for things like \x), or the escape
98is invalid. */
99
#ifndef EBCDIC  /* This is the "normal" table for ASCII systems */

/* check_escape() indexes this table with (c - '0') after checking that c lies
in the range '0' to 'z', so the first entry is for '0' and the last for 'z'.
Each row below covers eight consecutive character codes; the trailing comment
names the first and last character of the row. */

static const short int escapes[] = {
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
-ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
-ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
};

#else           /* This is the "abnormal" table for EBCDIC systems */

/* check_escape() indexes this table with (c - 0x48), so the first entry is for
code 0x48. The comment at the start of each row gives the hexadecimal code of
the row's first entry; each row covers eight consecutive codes. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
#endif
141
142
143/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144searched linearly. Put all the names into a single string, in order to reduce
145the number of relocations when a shared library is dynamically linked. */
146
147typedef struct verbitem {
148  int   len;
149  int   op;
150} verbitem;
151
152static const char verbnames[] =
153  "ACCEPT\0"
154  "COMMIT\0"
155  "F\0"
156  "FAIL\0"
157  "PRUNE\0"
158  "SKIP\0"
159  "THEN";
160
161static const verbitem verbs[] = {
162  { 6, OP_ACCEPT },
163  { 6, OP_COMMIT },
164  { 1, OP_FAIL },
165  { 4, OP_FAIL },
166  { 5, OP_PRUNE },
167  { 4, OP_SKIP  },
168  { 4, OP_THEN  }
169};
170
171static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174/* Tables of names of POSIX character classes and their lengths. The names are
175now all in a single string, to reduce the number of relocations when a shared
176library is dynamically loaded. The list of lengths is terminated by a zero
177length entry. The first three must be alpha, lower, upper, as this is assumed
178for handling case independence. */
179
/* One name per line; every name except the last carries an explicit zero
terminator, and the last is terminated by the end of the string literal. */

static const char posix_names[] =
  "alpha\0"
  "lower\0"
  "upper\0"
  "alnum\0"
  "ascii\0"
  "blank\0"
  "cntrl\0"
  "digit\0"
  "graph\0"
  "print\0"
  "punct\0"
  "space\0"
  "word\0"
  "xdigit";
184
185static const uschar posix_name_lengths[] = {
186  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188/* Table of class bit maps for each POSIX class. Each class is formed from a
189base map, with an optional addition or removal of another map. Then, for some
190classes, there is some additional tweaking: for [:blank:] the vertical space
191characters are removed, and for [:alpha:] and [:alnum:] the underscore
192character is removed. The triples in the table consist of the base map offset,
193second map offset or -1 if no second map, and a non-negative value for map
194addition or a negative value for map subtraction (if there are two maps). The
195absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196remove vertical space characters, 2 => remove underscore. */
197
/* Three ints per POSIX class, in the same order as posix_names:
     base map offset, second map offset (or -1), tweak/sign value */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
214
215
/* Two-level stringification: STRING(a) turns its argument text into a string
literal as written, while XSTRING(s) macro-expands s first and then
stringifies the result. XSTRING is used below to embed the values of
MAX_NAME_SIZE and MAX_NAME_COUNT in error messages. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
218
219/* The texts of compile-time error messages. These are "char *" because they
220are passed to the outside world. Do not ever re-use any error number, because
221they are documented. Always add a new error instead. Messages marked DEAD below
222are no longer used. This used to be a table of strings, but in order to reduce
223the number of relocations needed when a shared library is loaded dynamically,
224it is now one long string. We cannot use a table of offsets, because the
225lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226simply count through to the one we want - this isn't a performance issue
227because these strings are used only when there is a compilation error. */
228
/* Message n is found by skipping n zero terminators from the start (see
find_error_text()). The number in each comment is the error number of the
message that follows it. The final message has no explicit \0; it is
terminated by the string literal's own terminator. */

static const char error_texts[] =
  /* 0 */
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "(*VERB) with an argument is not supported\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode";
307
308
309/* Table to identify digits and hex digits. This is used when compiling
310patterns. Note that the tables in chartables are dependent on the locale, and
311may mark arbitrary characters as digits - but the PCRE compiling code expects
312to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
313a private table here. It costs 256 bytes, but it is a lot faster than doing
314character value tests (at least in some simple cases I timed), and in some
315applications one wants PCRE to compile efficiently as well as match
316efficiently.
317
318For convenience, we use the same bit definitions as in chartables:
319
320  0x04   decimal digit
321  0x08   hexadecimal digit
322
323Then we can use ctype_digit and ctype_xdigit in the code. */
324
/* Each table has 256 entries, laid out 16 per row; the comment at the start of
a row gives the code of its first entry. 0x0c marks a decimal digit (which is
also a hex digit) and 0x08 marks a hex-only digit. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30: '0'-'9' come first */
             0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40: '@', then 'A'-'F' */
             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60: '`', then 'a'-'f' */
             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xa0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xb0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xc0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xd0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xe0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xf0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80: 'a'-'f' at 0x81-0x86 */
             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xa0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xb0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xc0: 'A'-'F' at 0xc1-0xc6 */
             0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xd0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xe0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xf0: '0'-'9' come first */
             0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00
  };

/* Partial duplicate of the character-type table, for EBCDIC only.
check_escape() tests (ebcdic_chartab[c] & 0x0E) to decide whether the character
after a backslash is alphanumeric. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xa0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xb0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xc0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xd0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xe0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xf0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
#endif
431
432
/* Forward declaration of compile_regex(), needed to allow mutual recursion:
it lets functions that appear earlier in this file call compile_regex() before
its definition has been seen. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
    int *, int *, branch_chain *, compile_data *, int *);
438
439
440
441/*************************************************
442*            Find an error text                  *
443*************************************************/
444
445/* The error texts are now all in one long string, to save on relocations. As
446some of the text is of unknown length, we can't use a table of offsets.
447Instead, just count through the strings. This is not a performance issue
448because it happens only when there has been a compilation error.
449
450Argument:   the error number
451Returns:    pointer to the error string
452*/
453
454static const char *
455find_error_text(int n)
456{
457const char *s = error_texts;
458for (; n > 0; n--) while (*s++ != 0);
459return s;
460}
461
462
463/*************************************************
464*            Handle escapes                      *
465*************************************************/
466
467/* This function is called when a \ has been encountered. It either returns a
468positive value for a simple escape such as \n, or a negative value which
469encodes one of the more complicated things such as \d. A backreference to group
470n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472ptr is pointing at the \. On exit, it is on the final character of the escape
473sequence.
474
475Arguments:
476  ptrptr         points to the pattern position pointer
477  errorcodeptr   points to the errorcode variable
478  bracount       number of previous extracting brackets
479  options        the options bits
480  isclass        TRUE if inside a character class
481
482Returns:         zero or positive => a data character
483                 negative => a special escape sequence
484                 on error, errorcodeptr is set
485*/
486
487static int
488check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489  int options, BOOL isclass)
490{
491BOOL utf8 = (options & PCRE_UTF8) != 0;
492const uschar *ptr = *ptrptr + 1;
493int c, i;
494
495GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
496ptr--;                            /* Set pointer back to the last byte */
497
498/* If backslash is at the end of the pattern, it's an error. */
499
500if (c == 0) *errorcodeptr = ERR1;
501
502/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503in a table. A non-zero result is something that can be returned immediately.
504Otherwise further processing may be required. */
505
506#ifndef EBCDIC  /* ASCII coding */
507else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
508else if ((i = escapes[c - '0']) != 0) c = i;
509
510#else           /* EBCDIC coding */
511else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
512else if ((i = escapes[c - 0x48]) != 0)  c = i;
513#endif
514
515/* Escapes that need further processing, or are illegal. */
516
517else
518  {
519  const uschar *oldptr;
520  BOOL braced, negated;
521
522  switch (c)
523    {
524    /* A number of Perl escapes are not handled by PCRE. We give an explicit
525    error. */
526
527    case 'l':
528    case 'L':
529    case 'N':
530    case 'u':
531    case 'U':
532    *errorcodeptr = ERR37;
533    break;
534
535    /* \g must be followed by one of a number of specific things:
536
537    (1) A number, either plain or braced. If positive, it is an absolute
538    backreference. If negative, it is a relative backreference. This is a Perl
539    5.10 feature.
540
541    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542    is part of Perl's movement towards a unified syntax for back references. As
543    this is synonymous with \k{name}, we fudge it up by pretending it really
544    was \k.
545
546    (3) For Oniguruma compatibility we also support \g followed by a name or a
547    number either in angle brackets or in single quotes. However, these are
548    (possibly recursive) subroutine calls, _not_ backreferences. Just return
549    the -ESC_g code (cf \k). */
550
551    case 'g':
552    if (ptr[1] == '<' || ptr[1] == '\'')
553      {
554      c = -ESC_g;
555      break;
556      }
557
558    /* Handle the Perl-compatible cases */
559
560    if (ptr[1] == '{')
561      {
562      const uschar *p;
563      for (p = ptr+2; *p != 0 && *p != '}'; p++)
564        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565      if (*p != 0 && *p != '}')
566        {
567        c = -ESC_k;
568        break;
569        }
570      braced = TRUE;
571      ptr++;
572      }
573    else braced = FALSE;
574
575    if (ptr[1] == '-')
576      {
577      negated = TRUE;
578      ptr++;
579      }
580    else negated = FALSE;
581
582    c = 0;
583    while ((digitab[ptr[1]] & ctype_digit) != 0)
584      c = c * 10 + *(++ptr) - '0';
585
586    if (c < 0)   /* Integer overflow */
587      {
588      *errorcodeptr = ERR61;
589      break;
590      }
591
592    if (braced && *(++ptr) != '}')
593      {
594      *errorcodeptr = ERR57;
595      break;
596      }
597
598    if (c == 0)
599      {
600      *errorcodeptr = ERR58;
601      break;
602      }
603
604    if (negated)
605      {
606      if (c > bracount)
607        {
608        *errorcodeptr = ERR15;
609        break;
610        }
611      c = bracount - (c - 1);
612      }
613
614    c = -(ESC_REF + c);
615    break;
616
617    /* The handling of escape sequences consisting of a string of digits
618    starting with one that is not zero is not straightforward. By experiment,
619    the way Perl works seems to be as follows:
620
621    Outside a character class, the digits are read as a decimal number. If the
622    number is less than 10, or if there are that many previous extracting
623    left brackets, then it is a back reference. Otherwise, up to three octal
624    digits are read to form an escaped byte. Thus \123 is likely to be octal
625    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626    value is greater than 377, the least significant 8 bits are taken. Inside a
627    character class, \ followed by a digit is always an octal number. */
628
629    case '1': case '2': case '3': case '4': case '5':
630    case '6': case '7': case '8': case '9':
631
632    if (!isclass)
633      {
634      oldptr = ptr;
635      c -= '0';
636      while ((digitab[ptr[1]] & ctype_digit) != 0)
637        c = c * 10 + *(++ptr) - '0';
638      if (c < 0)    /* Integer overflow */
639        {
640        *errorcodeptr = ERR61;
641        break;
642        }
643      if (c < 10 || c <= bracount)
644        {
645        c = -(ESC_REF + c);
646        break;
647        }
648      ptr = oldptr;      /* Put the pointer back and fall through */
649      }
650
651    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652    generates a binary zero byte and treats the digit as a following literal.
653    Thus we have to pull back the pointer by one. */
654
655    if ((c = *ptr) >= '8')
656      {
657      ptr--;
658      c = 0;
659      break;
660      }
661
662    /* \0 always starts an octal number, but we may drop through to here with a
663    larger first octal digit. The original code used just to take the least
664    significant 8 bits of octal numbers (I think this is what early Perls used
665    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666    than 3 octal digits. */
667
668    case '0':
669    c -= '0';
670    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671        c = c * 8 + *(++ptr) - '0';
672    if (!utf8 && c > 255) *errorcodeptr = ERR51;
673    break;
674
675    /* \x is complicated. \x{ddd} is a character number which can be greater
676    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677    treated as a data character. */
678
679    case 'x':
680    if (ptr[1] == '{')
681      {
682      const uschar *pt = ptr + 2;
683      int count = 0;
684
685      c = 0;
686      while ((digitab[*pt] & ctype_xdigit) != 0)
687        {
688        register int cc = *pt++;
689        if (c == 0 && cc == '0') continue;     /* Leading zeroes */
690        count++;
691
692#ifndef EBCDIC  /* ASCII coding */
693        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
694        c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695#else           /* EBCDIC coding */
696        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
697        c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698#endif
699        }
700
701      if (*pt == '}')
702        {
703        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704        ptr = pt;
705        break;
706        }
707
708      /* If the sequence of hex digits does not end with '}', then we don't
709      recognize this construct; fall through to the normal \x handling. */
710      }
711
712    /* Read just a single-byte hex-defined char */
713
714    c = 0;
715    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716      {
717      int cc;                               /* Some compilers don't like ++ */
718      cc = *(++ptr);                        /* in initializers */
719#ifndef EBCDIC  /* ASCII coding */
720      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
721      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722#else           /* EBCDIC coding */
723      if (cc <= 'z') cc += 64;              /* Convert to upper case */
724      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725#endif
726      }
727    break;
728
729    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730    This coding is ASCII-specific, but then the whole concept of \cx is
731    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732
733    case 'c':
734    c = *(++ptr);
735    if (c == 0)
736      {
737      *errorcodeptr = ERR2;
738      break;
739      }
740
741#ifndef EBCDIC  /* ASCII coding */
742    if (c >= 'a' && c <= 'z') c -= 32;
743    c ^= 0x40;
744#else           /* EBCDIC coding */
745    if (c >= 'a' && c <= 'z') c += 64;
746    c ^= 0xC0;
747#endif
748    break;
749
750    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751    other alphanumeric following \ is an error if PCRE_EXTRA was set;
752    otherwise, for Perl compatibility, it is a literal. This code looks a bit
753    odd, but there used to be some cases other than the default, and there may
754    be again in future, so I haven't "optimized" it. */
755
756    default:
757    if ((options & PCRE_EXTRA) != 0) switch(c)
758      {
759      default:
760      *errorcodeptr = ERR3;
761      break;
762      }
763    break;
764    }
765  }
766
767*ptrptr = ptr;
768return c;
769}
770
771
772
773#ifdef SUPPORT_UCP
774/*************************************************
775*               Handle \P and \p                 *
776*************************************************/
777
778/* This function is called after \P or \p has been encountered, provided that
779PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780pointing at the P or p. On exit, it is pointing at the final character of the
781escape sequence.
782
783Argument:
784  ptrptr         points to the pattern position pointer
785  negptr         points to a boolean that is set TRUE for negation else FALSE
786  dptr           points to an int that is set to the detailed property value
787  errorcodeptr   points to the error code variable
788
789Returns:         type value from ucp_type_table, or -1 for an invalid type
790*/
791
792static int
793get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794{
795int c, i, bot, top;
796const uschar *ptr = *ptrptr;
797char name[32];
798
799c = *(++ptr);
800if (c == 0) goto ERROR_RETURN;
801
802*negptr = FALSE;
803
804/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805negation. */
806
807if (c == '{')
808  {
809  if (ptr[1] == '^')
810    {
811    *negptr = TRUE;
812    ptr++;
813    }
814  for (i = 0; i < (int)sizeof(name) - 1; i++)
815    {
816    c = *(++ptr);
817    if (c == 0) goto ERROR_RETURN;
818    if (c == '}') break;
819    name[i] = c;
820    }
821  if (c !='}') goto ERROR_RETURN;
822  name[i] = 0;
823  }
824
825/* Otherwise there is just one following character */
826
827else
828  {
829  name[0] = c;
830  name[1] = 0;
831  }
832
833*ptrptr = ptr;
834
835/* Search for a recognized property name using binary chop */
836
837bot = 0;
838top = _pcre_utt_size;
839
840while (bot < top)
841  {
842  i = (bot + top) >> 1;
843  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844  if (c == 0)
845    {
846    *dptr = _pcre_utt[i].value;
847    return _pcre_utt[i].type;
848    }
849  if (c > 0) bot = i + 1; else top = i;
850  }
851
852*errorcodeptr = ERR47;
853*ptrptr = ptr;
854return -1;
855
856ERROR_RETURN:
857*errorcodeptr = ERR46;
858*ptrptr = ptr;
859return -1;
860}
861#endif
862
863
864
865
866/*************************************************
867*            Check for counted repeat            *
868*************************************************/
869
870/* This function is called when a '{' is encountered in a place where it might
871start a quantifier. It looks ahead to see if it really is a quantifier or not.
872It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873where the ddds are digits.
874
875Arguments:
876  p         pointer to the first char after '{'
877
878Returns:    TRUE or FALSE
879*/
880
881static BOOL
882is_counted_repeat(const uschar *p)
883{
884if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885while ((digitab[*p] & ctype_digit) != 0) p++;
886if (*p == '}') return TRUE;
887
888if (*p++ != ',') return FALSE;
889if (*p == '}') return TRUE;
890
891if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892while ((digitab[*p] & ctype_digit) != 0) p++;
893
894return (*p == '}');
895}
896
897
898
899/*************************************************
900*         Read repeat counts                     *
901*************************************************/
902
903/* Read an item of the form {n,m} and return the values. This is called only
904after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905so the syntax is guaranteed to be correct, but we need to check the values.
906
907Arguments:
908  p              pointer to first char after '{'
909  minp           pointer to int for min
910  maxp           pointer to int for max
911                 returned as -1 if no max
912  errorcodeptr   points to error code variable
913
914Returns:         pointer to '}' on success;
915                 current ptr on error, with errorcodeptr set non-zero
916*/
917
918static const uschar *
919read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920{
921int min = 0;
922int max = -1;
923
924/* Read the minimum value and do a paranoid check: a negative value indicates
925an integer overflow. */
926
927while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928if (min < 0 || min > 65535)
929  {
930  *errorcodeptr = ERR5;
931  return p;
932  }
933
934/* Read the maximum value if there is one, and again do a paranoid on its size.
935Also, max must not be less than min. */
936
937if (*p == '}') max = min; else
938  {
939  if (*(++p) != '}')
940    {
941    max = 0;
942    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943    if (max < 0 || max > 65535)
944      {
945      *errorcodeptr = ERR5;
946      return p;
947      }
948    if (max < min)
949      {
950      *errorcodeptr = ERR4;
951      return p;
952      }
953    }
954  }
955
956/* Fill in the required variables, and pass back the pointer to the terminating
957'}'. */
958
959*minp = min;
960*maxp = max;
961return p;
962}
963
964
965
966/*************************************************
967*       Find forward referenced subpattern       *
968*************************************************/
969
970/* This function scans along a pattern's text looking for capturing
971subpatterns, and counting them. If it finds a named pattern that matches the
972name it is given, it returns its number. Alternatively, if the name is NULL, it
973returns when it reaches a given numbered subpattern. This is used for forward
974references to subpatterns. We know that if (?P< is encountered, the name will
975be terminated by '>' because that is checked in the first pass.
976
977Arguments:
978  ptr          current position in the pattern
979  cd           compile background data
980  name         name to seek, or NULL if seeking a numbered subpattern
981  lorn         name length, or subpattern number if name is NULL
982  xmode        TRUE if we are in /x mode
983
984Returns:       the number of the named subpattern, or -1 if not found
985*/
986
987static int
988find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989  BOOL xmode)
990{
991const uschar *thisname;
992int count = cd->bracount;
993
994for (; *ptr != 0; ptr++)
995  {
996  int term;
997
998  /* Skip over backslashed characters and also entire \Q...\E */
999
1000  if (*ptr == '\\')
1001    {
1002    if (*(++ptr) == 0) return -1;
1003    if (*ptr == 'Q') for (;;)
1004      {
1005      while (*(++ptr) != 0 && *ptr != '\\');
1006      if (*ptr == 0) return -1;
1007      if (*(++ptr) == 'E') break;
1008      }
1009    continue;
1010    }
1011
1012  /* Skip over character classes; this logic must be similar to the way they
1013  are handled for real. If the first character is '^', skip it. Also, if the
1014  first few characters (either before or after ^) are \Q\E or \E we skip them
1015  too. This makes for compatibility with Perl. */
1016
1017  if (*ptr == '[')
1018    {
1019    BOOL negate_class = FALSE;
1020    for (;;)
1021      {
1022      int c = *(++ptr);
1023      if (c == '\\')
1024        {
1025        if (ptr[1] == 'E') ptr++;
1026          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027            else break;
1028        }
1029      else if (!negate_class && c == '^')
1030        negate_class = TRUE;
1031      else break;
1032      }
1033
1034    /* If the next character is ']', it is a data character that must be
1035    skipped, except in JavaScript compatibility mode. */
1036
1037    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038      ptr++;
1039
1040    while (*(++ptr) != ']')
1041      {
1042      if (*ptr == 0) return -1;
1043      if (*ptr == '\\')
1044        {
1045        if (*(++ptr) == 0) return -1;
1046        if (*ptr == 'Q') for (;;)
1047          {
1048          while (*(++ptr) != 0 && *ptr != '\\');
1049          if (*ptr == 0) return -1;
1050          if (*(++ptr) == 'E') break;
1051          }
1052        continue;
1053        }
1054      }
1055    continue;
1056    }
1057
1058  /* Skip comments in /x mode */
1059
1060  if (xmode && *ptr == '#')
1061    {
1062    while (*(++ptr) != 0 && *ptr != '\n');
1063    if (*ptr == 0) return -1;
1064    continue;
1065    }
1066
1067  /* An opening parens must now be a real metacharacter */
1068
1069  if (*ptr != '(') continue;
1070  if (ptr[1] != '?' && ptr[1] != '*')
1071    {
1072    count++;
1073    if (name == NULL && count == lorn) return count;
1074    continue;
1075    }
1076
1077  ptr += 2;
1078  if (*ptr == 'P') ptr++;                      /* Allow optional P */
1079
1080  /* We have to disambiguate (?<! and (?<= from (?<name> */
1081
1082  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083       *ptr != '\'')
1084    continue;
1085
1086  count++;
1087
1088  if (name == NULL && count == lorn) return count;
1089  term = *ptr++;
1090  if (term == '<') term = '>';
1091  thisname = ptr;
1092  while (*ptr != term) ptr++;
1093  if (name != NULL && lorn == ptr - thisname &&
1094      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095    return count;
1096  }
1097
1098return -1;
1099}
1100
1101
1102
1103/*************************************************
1104*      Find first significant op code            *
1105*************************************************/
1106
1107/* This is called by several functions that scan a compiled expression looking
1108for a fixed first character, or an anchoring op code etc. It skips over things
1109that do not influence this. For some calls, a change of option is important.
1110For some calls, it makes sense to skip negative forward and all backward
1111assertions, and also the \b assertion; for others it does not.
1112
1113Arguments:
1114  code         pointer to the start of the group
1115  options      pointer to external options
1116  optbit       the option bit whose changing is significant, or
1117                 zero if none are
1118  skipassert   TRUE if certain assertions are to be skipped
1119
1120Returns:       pointer to the first significant opcode
1121*/
1122
1123static const uschar*
1124first_significant_code(const uschar *code, int *options, int optbit,
1125  BOOL skipassert)
1126{
1127for (;;)
1128  {
1129  switch ((int)*code)
1130    {
1131    case OP_OPT:
1132    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1133      *options = (int)code[1];
1134    code += 2;
1135    break;
1136
1137    case OP_ASSERT_NOT:
1138    case OP_ASSERTBACK:
1139    case OP_ASSERTBACK_NOT:
1140    if (!skipassert) return code;
1141    do code += GET(code, 1); while (*code == OP_ALT);
1142    code += _pcre_OP_lengths[*code];
1143    break;
1144
1145    case OP_WORD_BOUNDARY:
1146    case OP_NOT_WORD_BOUNDARY:
1147    if (!skipassert) return code;
1148    /* Fall through */
1149
1150    case OP_CALLOUT:
1151    case OP_CREF:
1152    case OP_RREF:
1153    case OP_DEF:
1154    code += _pcre_OP_lengths[*code];
1155    break;
1156
1157    default:
1158    return code;
1159    }
1160  }
1161/* Control never reaches here */
1162}
1163
1164
1165
1166
1167/*************************************************
1168*        Find the fixed length of a pattern      *
1169*************************************************/
1170
1171/* Scan a pattern and compute the fixed length of subject that will match it,
1172if the length is fixed. This is needed for dealing with backward assertions.
1173In UTF8 mode, the result is in characters rather than bytes.
1174
1175Arguments:
1176  code     points to the start of the pattern (the bracket)
1177  options  the compiling options
1178
1179Returns:   the fixed length, or -1 if there is no fixed length,
1180             or -2 if \C was encountered
1181*/
1182
1183static int
1184find_fixedlength(uschar *code, int options)
1185{
1186int length = -1;
1187
1188register int branchlength = 0;
1189register uschar *cc = code + 1 + LINK_SIZE;
1190
1191/* Scan along the opcodes for this branch. If we get to the end of the
1192branch, check the length against that of the other branches. */
1193
1194for (;;)
1195  {
1196  int d;
1197  register int op = *cc;
1198  switch (op)
1199    {
1200    case OP_CBRA:
1201    case OP_BRA:
1202    case OP_ONCE:
1203    case OP_COND:
1204    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205    if (d < 0) return d;
1206    branchlength += d;
1207    do cc += GET(cc, 1); while (*cc == OP_ALT);
1208    cc += 1 + LINK_SIZE;
1209    break;
1210
1211    /* Reached end of a branch; if it's a ket it is the end of a nested
1212    call. If it's ALT it is an alternation in a nested call. If it is
1213    END it's the end of the outer call. All can be handled by the same code. */
1214
1215    case OP_ALT:
1216    case OP_KET:
1217    case OP_KETRMAX:
1218    case OP_KETRMIN:
1219    case OP_END:
1220    if (length < 0) length = branchlength;
1221      else if (length != branchlength) return -1;
1222    if (*cc != OP_ALT) return length;
1223    cc += 1 + LINK_SIZE;
1224    branchlength = 0;
1225    break;
1226
1227    /* Skip over assertive subpatterns */
1228
1229    case OP_ASSERT:
1230    case OP_ASSERT_NOT:
1231    case OP_ASSERTBACK:
1232    case OP_ASSERTBACK_NOT:
1233    do cc += GET(cc, 1); while (*cc == OP_ALT);
1234    /* Fall through */
1235
1236    /* Skip over things that don't match chars */
1237
1238    case OP_REVERSE:
1239    case OP_CREF:
1240    case OP_RREF:
1241    case OP_DEF:
1242    case OP_OPT:
1243    case OP_CALLOUT:
1244    case OP_SOD:
1245    case OP_SOM:
1246    case OP_EOD:
1247    case OP_EODN:
1248    case OP_CIRC:
1249    case OP_DOLL:
1250    case OP_NOT_WORD_BOUNDARY:
1251    case OP_WORD_BOUNDARY:
1252    cc += _pcre_OP_lengths[*cc];
1253    break;
1254
1255    /* Handle literal characters */
1256
1257    case OP_CHAR:
1258    case OP_CHARNC:
1259    case OP_NOT:
1260    branchlength++;
1261    cc += 2;
1262#ifdef SUPPORT_UTF8
1263    if ((options & PCRE_UTF8) != 0)
1264      {
1265      while ((*cc & 0xc0) == 0x80) cc++;
1266      }
1267#endif
1268    break;
1269
1270    /* Handle exact repetitions. The count is already in characters, but we
1271    need to skip over a multibyte character in UTF8 mode.  */
1272
1273    case OP_EXACT:
1274    branchlength += GET2(cc,1);
1275    cc += 4;
1276#ifdef SUPPORT_UTF8
1277    if ((options & PCRE_UTF8) != 0)
1278      {
1279      while((*cc & 0x80) == 0x80) cc++;
1280      }
1281#endif
1282    break;
1283
1284    case OP_TYPEEXACT:
1285    branchlength += GET2(cc,1);
1286    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287    cc += 4;
1288    break;
1289
1290    /* Handle single-char matchers */
1291
1292    case OP_PROP:
1293    case OP_NOTPROP:
1294    cc += 2;
1295    /* Fall through */
1296
1297    case OP_NOT_DIGIT:
1298    case OP_DIGIT:
1299    case OP_NOT_WHITESPACE:
1300    case OP_WHITESPACE:
1301    case OP_NOT_WORDCHAR:
1302    case OP_WORDCHAR:
1303    case OP_ANY:
1304    case OP_ALLANY:
1305    branchlength++;
1306    cc++;
1307    break;
1308
1309    /* The single-byte matcher isn't allowed */
1310
1311    case OP_ANYBYTE:
1312    return -2;
1313
1314    /* Check a class for variable quantification */
1315
1316#ifdef SUPPORT_UTF8
1317    case OP_XCLASS:
1318    cc += GET(cc, 1) - 33;
1319    /* Fall through */
1320#endif
1321
1322    case OP_CLASS:
1323    case OP_NCLASS:
1324    cc += 33;
1325
1326    switch (*cc)
1327      {
1328      case OP_CRSTAR:
1329      case OP_CRMINSTAR:
1330      case OP_CRQUERY:
1331      case OP_CRMINQUERY:
1332      return -1;
1333
1334      case OP_CRRANGE:
1335      case OP_CRMINRANGE:
1336      if (GET2(cc,1) != GET2(cc,3)) return -1;
1337      branchlength += GET2(cc,1);
1338      cc += 5;
1339      break;
1340
1341      default:
1342      branchlength++;
1343      }
1344    break;
1345
1346    /* Anything else is variable length */
1347
1348    default:
1349    return -1;
1350    }
1351  }
1352/* Control never gets here */
1353}
1354
1355
1356
1357
1358/*************************************************
1359*    Scan compiled regex for numbered bracket    *
1360*************************************************/
1361
1362/* This little function scans through a compiled pattern until it finds a
1363capturing bracket with the given number.
1364
1365Arguments:
1366  code        points to start of expression
1367  utf8        TRUE in UTF-8 mode
1368  number      the required bracket number
1369
1370Returns:      pointer to the opcode for the bracket, or NULL if not found
1371*/
1372
1373static const uschar *
1374find_bracket(const uschar *code, BOOL utf8, int number)
1375{
1376for (;;)
1377  {
1378  register int c = *code;
1379  if (c == OP_END) return NULL;
1380
1381  /* XCLASS is used for classes that cannot be represented just by a bit
1382  map. This includes negated single high-valued characters. The length in
1383  the table is zero; the actual length is stored in the compiled code. */
1384
1385  if (c == OP_XCLASS) code += GET(code, 1);
1386
1387  /* Handle capturing bracket */
1388
1389  else if (c == OP_CBRA)
1390    {
1391    int n = GET2(code, 1+LINK_SIZE);
1392    if (n == number) return (uschar *)code;
1393    code += _pcre_OP_lengths[c];
1394    }
1395
1396  /* Otherwise, we can get the item's length from the table, except that for
1397  repeated character types, we have to test for \p and \P, which have an extra
1398  two bytes of parameters. */
1399
1400  else
1401    {
1402    switch(c)
1403      {
1404      case OP_TYPESTAR:
1405      case OP_TYPEMINSTAR:
1406      case OP_TYPEPLUS:
1407      case OP_TYPEMINPLUS:
1408      case OP_TYPEQUERY:
1409      case OP_TYPEMINQUERY:
1410      case OP_TYPEPOSSTAR:
1411      case OP_TYPEPOSPLUS:
1412      case OP_TYPEPOSQUERY:
1413      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1414      break;
1415
1416      case OP_TYPEUPTO:
1417      case OP_TYPEMINUPTO:
1418      case OP_TYPEEXACT:
1419      case OP_TYPEPOSUPTO:
1420      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1421      break;
1422      }
1423
1424    /* Add in the fixed length from the table */
1425
1426    code += _pcre_OP_lengths[c];
1427
1428  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1429  a multi-byte character. The length in the table is a minimum, so we have to
1430  arrange to skip the extra bytes. */
1431
1432#ifdef SUPPORT_UTF8
1433    if (utf8) switch(c)
1434      {
1435      case OP_CHAR:
1436      case OP_CHARNC:
1437      case OP_EXACT:
1438      case OP_UPTO:
1439      case OP_MINUPTO:
1440      case OP_POSUPTO:
1441      case OP_STAR:
1442      case OP_MINSTAR:
1443      case OP_POSSTAR:
1444      case OP_PLUS:
1445      case OP_MINPLUS:
1446      case OP_POSPLUS:
1447      case OP_QUERY:
1448      case OP_MINQUERY:
1449      case OP_POSQUERY:
1450      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1451      break;
1452      }
1453#endif
1454    }
1455  }
1456}
1457
1458
1459
1460/*************************************************
1461*   Scan compiled regex for recursion reference  *
1462*************************************************/
1463
1464/* This little function scans through a compiled pattern until it finds an
1465instance of OP_RECURSE.
1466
1467Arguments:
1468  code        points to start of expression
1469  utf8        TRUE in UTF-8 mode
1470
1471Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1472*/
1473
1474static const uschar *
1475find_recurse(const uschar *code, BOOL utf8)
1476{
1477for (;;)
1478  {
1479  register int c = *code;
1480  if (c == OP_END) return NULL;
1481  if (c == OP_RECURSE) return code;
1482
1483  /* XCLASS is used for classes that cannot be represented just by a bit
1484  map. This includes negated single high-valued characters. The length in
1485  the table is zero; the actual length is stored in the compiled code. */
1486
1487  if (c == OP_XCLASS) code += GET(code, 1);
1488
1489  /* Otherwise, we can get the item's length from the table, except that for
1490  repeated character types, we have to test for \p and \P, which have an extra
1491  two bytes of parameters. */
1492
1493  else
1494    {
1495    switch(c)
1496      {
1497      case OP_TYPESTAR:
1498      case OP_TYPEMINSTAR:
1499      case OP_TYPEPLUS:
1500      case OP_TYPEMINPLUS:
1501      case OP_TYPEQUERY:
1502      case OP_TYPEMINQUERY:
1503      case OP_TYPEPOSSTAR:
1504      case OP_TYPEPOSPLUS:
1505      case OP_TYPEPOSQUERY:
1506      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1507      break;
1508
1509      case OP_TYPEPOSUPTO:
1510      case OP_TYPEUPTO:
1511      case OP_TYPEMINUPTO:
1512      case OP_TYPEEXACT:
1513      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1514      break;
1515      }
1516
1517    /* Add in the fixed length from the table */
1518
1519    code += _pcre_OP_lengths[c];
1520
1521    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1522    by a multi-byte character. The length in the table is a minimum, so we have
1523    to arrange to skip the extra bytes. */
1524
1525#ifdef SUPPORT_UTF8
1526    if (utf8) switch(c)
1527      {
1528      case OP_CHAR:
1529      case OP_CHARNC:
1530      case OP_EXACT:
1531      case OP_UPTO:
1532      case OP_MINUPTO:
1533      case OP_POSUPTO:
1534      case OP_STAR:
1535      case OP_MINSTAR:
1536      case OP_POSSTAR:
1537      case OP_PLUS:
1538      case OP_MINPLUS:
1539      case OP_POSPLUS:
1540      case OP_QUERY:
1541      case OP_MINQUERY:
1542      case OP_POSQUERY:
1543      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1544      break;
1545      }
1546#endif
1547    }
1548  }
1549}
1550
1551
1552
1553/*************************************************
1554*    Scan compiled branch for non-emptiness      *
1555*************************************************/
1556
1557/* This function scans through a branch of a compiled pattern to see whether it
1558can match the empty string or not. It is called from could_be_empty()
1559below and from compile_branch() when checking for an unlimited repeat of a
1560group that can match nothing. Note that first_significant_code() skips over
1561backward and negative forward assertions when its final argument is TRUE. If we
1562hit an unclosed bracket, we return "empty" - this means we've struck an inner
1563bracket whose current branch will already have been scanned.
1564
1565Arguments:
1566  code        points to start of search
1567  endcode     points to where to stop
1568  utf8        TRUE if in UTF8 mode
1569
1570Returns:      TRUE if what is matched could be empty
1571*/
1572
1573static BOOL
1574could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1575{
1576register int c;
1577for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1578     code < endcode;
1579     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1580  {
1581  const uschar *ccode;
1582
1583  c = *code;
1584
1585  /* Skip over forward assertions; the other assertions are skipped by
1586  first_significant_code() with a TRUE final argument. */
1587
1588  if (c == OP_ASSERT)
1589    {
1590    do code += GET(code, 1); while (*code == OP_ALT);
1591    c = *code;
1592    continue;
1593    }
1594
1595  /* Groups with zero repeats can of course be empty; skip them. */
1596
1597  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1598    {
1599    code += _pcre_OP_lengths[c];
1600    do code += GET(code, 1); while (*code == OP_ALT);
1601    c = *code;
1602    continue;
1603    }
1604
1605  /* For other groups, scan the branches. */
1606
1607  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1608    {
1609    BOOL empty_branch;
1610    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1611
1612    /* Scan a closed bracket */
1613
1614    empty_branch = FALSE;
1615    do
1616      {
1617      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1618        empty_branch = TRUE;
1619      code += GET(code, 1);
1620      }
1621    while (*code == OP_ALT);
1622    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1623    c = *code;
1624    continue;
1625    }
1626
1627  /* Handle the other opcodes */
1628
1629  switch (c)
1630    {
1631    /* Check for quantifiers after a class. XCLASS is used for classes that
1632    cannot be represented just by a bit map. This includes negated single
1633    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1634    actual length is stored in the compiled code, so we must update "code"
1635    here. */
1636
1637#ifdef SUPPORT_UTF8
1638    case OP_XCLASS:
1639    ccode = code += GET(code, 1);
1640    goto CHECK_CLASS_REPEAT;
1641#endif
1642
1643    case OP_CLASS:
1644    case OP_NCLASS:
1645    ccode = code + 33;
1646
1647#ifdef SUPPORT_UTF8
1648    CHECK_CLASS_REPEAT:
1649#endif
1650
1651    switch (*ccode)
1652      {
1653      case OP_CRSTAR:            /* These could be empty; continue */
1654      case OP_CRMINSTAR:
1655      case OP_CRQUERY:
1656      case OP_CRMINQUERY:
1657      break;
1658
1659      default:                   /* Non-repeat => class must match */
1660      case OP_CRPLUS:            /* These repeats aren't empty */
1661      case OP_CRMINPLUS:
1662      return FALSE;
1663
1664      case OP_CRRANGE:
1665      case OP_CRMINRANGE:
1666      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1667      break;
1668      }
1669    break;
1670
1671    /* Opcodes that must match a character */
1672
1673    case OP_PROP:
1674    case OP_NOTPROP:
1675    case OP_EXTUNI:
1676    case OP_NOT_DIGIT:
1677    case OP_DIGIT:
1678    case OP_NOT_WHITESPACE:
1679    case OP_WHITESPACE:
1680    case OP_NOT_WORDCHAR:
1681    case OP_WORDCHAR:
1682    case OP_ANY:
1683    case OP_ALLANY:
1684    case OP_ANYBYTE:
1685    case OP_CHAR:
1686    case OP_CHARNC:
1687    case OP_NOT:
1688    case OP_PLUS:
1689    case OP_MINPLUS:
1690    case OP_POSPLUS:
1691    case OP_EXACT:
1692    case OP_NOTPLUS:
1693    case OP_NOTMINPLUS:
1694    case OP_NOTPOSPLUS:
1695    case OP_NOTEXACT:
1696    case OP_TYPEPLUS:
1697    case OP_TYPEMINPLUS:
1698    case OP_TYPEPOSPLUS:
1699    case OP_TYPEEXACT:
1700    return FALSE;
1701
1702    /* These are going to continue, as they may be empty, but we have to
1703    fudge the length for the \p and \P cases. */
1704
1705    case OP_TYPESTAR:
1706    case OP_TYPEMINSTAR:
1707    case OP_TYPEPOSSTAR:
1708    case OP_TYPEQUERY:
1709    case OP_TYPEMINQUERY:
1710    case OP_TYPEPOSQUERY:
1711    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1712    break;
1713
1714    /* Same for these */
1715
1716    case OP_TYPEUPTO:
1717    case OP_TYPEMINUPTO:
1718    case OP_TYPEPOSUPTO:
1719    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1720    break;
1721
1722    /* End of branch */
1723
1724    case OP_KET:
1725    case OP_KETRMAX:
1726    case OP_KETRMIN:
1727    case OP_ALT:
1728    return TRUE;
1729
1730    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1731    MINUPTO, and POSUPTO may be followed by a multibyte character */
1732
1733#ifdef SUPPORT_UTF8
1734    case OP_STAR:
1735    case OP_MINSTAR:
1736    case OP_POSSTAR:
1737    case OP_QUERY:
1738    case OP_MINQUERY:
1739    case OP_POSQUERY:
1740    case OP_UPTO:
1741    case OP_MINUPTO:
1742    case OP_POSUPTO:
1743    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1744    break;
1745#endif
1746    }
1747  }
1748
1749return TRUE;
1750}
1751
1752
1753
1754/*************************************************
1755*    Scan compiled regex for non-emptiness       *
1756*************************************************/
1757
1758/* This function is called to check for left recursive calls. We want to check
1759the current branch of the current pattern to see if it could match the empty
1760string. If it could, we must look outwards for branches at other levels,
1761stopping when we pass beyond the bracket which is the subject of the recursion.
1762
1763Arguments:
1764  code        points to start of the recursion
1765  endcode     points to where to stop (current RECURSE item)
1766  bcptr       points to the chain of current (unclosed) branch starts
1767  utf8        TRUE if in UTF-8 mode
1768
1769Returns:      TRUE if what is matched could be empty
1770*/
1771
1772static BOOL
1773could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1774  BOOL utf8)
1775{
1776while (bcptr != NULL && bcptr->current >= code)
1777  {
1778  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1779  bcptr = bcptr->outer;
1780  }
1781return TRUE;
1782}
1783
1784
1785
1786/*************************************************
1787*           Check for POSIX class syntax         *
1788*************************************************/
1789
1790/* This function is called when the sequence "[:" or "[." or "[=" is
1791encountered in a character class. It checks whether this is followed by a
1792sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1793reach an unescaped ']' without the special preceding character, return FALSE.
1794
1795Originally, this function only recognized a sequence of letters between the
1796terminators, but it seems that Perl recognizes any sequence of characters,
1797though of course unknown POSIX names are subsequently rejected. Perl gives an
1798"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1799didn't consider this to be a POSIX class. Likewise for [:1234:].
1800
1801The problem in trying to be exactly like Perl is in the handling of escapes. We
1802have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1803class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1804below handles the special case of \], but does not try to do any other escape
1805processing. This makes it different from Perl for cases such as [:l\ower:]
1806where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1807"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1808I think.
1809
1810Arguments:
1811  ptr      pointer to the initial [
1812  endptr   where to return the end pointer
1813
1814Returns:   TRUE or FALSE
1815*/
1816
1817static BOOL
1818check_posix_syntax(const uschar *ptr, const uschar **endptr)
1819{
1820int terminator;          /* Don't combine these lines; the Solaris cc */
1821terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1822for (++ptr; *ptr != 0; ptr++)
1823  {
1824  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1825    {
1826    if (*ptr == ']') return FALSE;
1827    if (*ptr == terminator && ptr[1] == ']')
1828      {
1829      *endptr = ptr;
1830      return TRUE;
1831      }
1832    }
1833  }
1834return FALSE;
1835}
1836
1837
1838
1839
1840/*************************************************
1841*          Check POSIX class name                *
1842*************************************************/
1843
1844/* This function is called to check the name given in a POSIX-style class entry
1845such as [:alnum:].
1846
1847Arguments:
1848  ptr        points to the first letter
1849  len        the length of the name
1850
1851Returns:     a value representing the name, or -1 if unknown
1852*/
1853
1854static int
1855check_posix_name(const uschar *ptr, int len)
1856{
1857const char *pn = posix_names;
1858register int yield = 0;
1859while (posix_name_lengths[yield] != 0)
1860  {
1861  if (len == posix_name_lengths[yield] &&
1862    strncmp((const char *)ptr, pn, len) == 0) return yield;
1863  pn += posix_name_lengths[yield] + 1;
1864  yield++;
1865  }
1866return -1;
1867}
1868
1869
1870/*************************************************
1871*    Adjust OP_RECURSE items in repeated group   *
1872*************************************************/
1873
1874/* OP_RECURSE items contain an offset from the start of the regex to the group
1875that is referenced. This means that groups can be replicated for fixed
1876repetition simply by copying (because the recursion is allowed to refer to
1877earlier groups that are outside the current group). However, when a group is
1878optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1879inserted before it, after it has been compiled. This means that any OP_RECURSE
1880items within it that refer to the group itself or any contained groups have to
1881have their offsets adjusted. That is one of the jobs of this function. Before it
1882is called, the partially compiled regex must be temporarily terminated with
1883OP_END.
1884
1885This function has been extended with the possibility of forward references for
1886recursions and subroutine calls. It must also check the list of such references
1887for the group we are dealing with. If it finds that one of the recursions in
1888the current group is on this list, it adjusts the offset in the list, not the
1889value in the reference (which is a group number).
1890
1891Arguments:
1892  group      points to the start of the group
1893  adjust     the amount by which the group is to be moved
1894  utf8       TRUE in UTF-8 mode
1895  cd         contains pointers to tables etc.
1896  save_hwm   the hwm forward reference pointer at the start of the group
1897
1898Returns:     nothing
1899*/
1900
1901static void
1902adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1903  uschar *save_hwm)
1904{
1905uschar *ptr = group;
1906
1907while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1908  {
1909  int offset;
1910  uschar *hc;
1911
1912  /* See if this recursion is on the forward reference list. If so, adjust the
1913  reference. */
1914
1915  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1916    {
1917    offset = GET(hc, 0);
1918    if (cd->start_code + offset == ptr + 1)
1919      {
1920      PUT(hc, 0, offset + adjust);
1921      break;
1922      }
1923    }
1924
1925  /* Otherwise, adjust the recursion offset if it's after the start of this
1926  group. */
1927
1928  if (hc >= cd->hwm)
1929    {
1930    offset = GET(ptr, 1);
1931    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1932    }
1933
1934  ptr += 1 + LINK_SIZE;
1935  }
1936}
1937
1938
1939
1940/*************************************************
1941*        Insert an automatic callout point       *
1942*************************************************/
1943
1944/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1945callout points before each pattern item.
1946
1947Arguments:
1948  code           current code pointer
1949  ptr            current pattern pointer
1950  cd             pointers to tables etc
1951
1952Returns:         new code pointer
1953*/
1954
1955static uschar *
1956auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1957{
1958*code++ = OP_CALLOUT;
1959*code++ = 255;
1960PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1961PUT(code, LINK_SIZE, 0);                /* Default length */
1962return code + 2*LINK_SIZE;
1963}
1964
1965
1966
1967/*************************************************
1968*         Complete a callout item                *
1969*************************************************/
1970
1971/* A callout item contains the length of the next item in the pattern, which
1972we can't fill in till after we have reached the relevant point. This is used
1973for both automatic and manual callouts.
1974
1975Arguments:
1976  previous_callout   points to previous callout item
1977  ptr                current pattern pointer
1978  cd                 pointers to tables etc
1979
1980Returns:             nothing
1981*/
1982
1983static void
1984complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1985{
1986int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1987PUT(previous_callout, 2 + LINK_SIZE, length);
1988}
1989
1990
1991
1992#ifdef SUPPORT_UCP
1993/*************************************************
1994*           Get othercase range                  *
1995*************************************************/
1996
1997/* This function is passed the start and end of a class range, in UTF-8 mode
1998with UCP support. It searches up the characters, looking for internal ranges of
1999characters in the "other" case. Each call returns the next one, updating the
2000start address.
2001
2002Arguments:
2003  cptr        points to starting character value; updated
2004  d           end value
2005  ocptr       where to put start of othercase range
2006  odptr       where to put end of othercase range
2007
2008Yield:        TRUE when range returned; FALSE when no more
2009*/
2010
2011static BOOL
2012get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2013  unsigned int *odptr)
2014{
2015unsigned int c, othercase, next;
2016
2017for (c = *cptr; c <= d; c++)
2018  { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
2019
2020if (c > d) return FALSE;
2021
2022*ocptr = othercase;
2023next = othercase + 1;
2024
2025for (++c; c <= d; c++)
2026  {
2027  if (_pcre_ucp_othercase(c) != next) break;
2028  next++;
2029  }
2030
2031*odptr = next - 1;
2032*cptr = c;
2033
2034return TRUE;
2035}
2036#endif  /* SUPPORT_UCP */
2037
2038
2039
2040/*************************************************
2041*     Check if auto-possessifying is possible    *
2042*************************************************/
2043
2044/* This function is called for unlimited repeats of certain items, to see
2045whether the next thing could possibly match the repeated item. If not, it makes
2046sense to automatically possessify the repeated item.
2047
2048Arguments:
2049  op_code       the repeated op code
2050  item          data for this item, depends on the opcode
2051  utf8          TRUE in UTF-8 mode
2052  utf8_char     used for utf8 character bytes, NULL if not relevant
2053  ptr           next character in pattern
2054  options       options bits
2055  cd            contains pointers to tables etc.
2056
2057Returns:        TRUE if possessifying is wanted
2058*/
2059
2060static BOOL
2061check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2062  const uschar *ptr, int options, compile_data *cd)
2063{
2064int next;
2065
2066/* Skip whitespace and comments in extended mode */
2067
2068if ((options & PCRE_EXTENDED) != 0)
2069  {
2070  for (;;)
2071    {
2072    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2073    if (*ptr == '#')
2074      {
2075      while (*(++ptr) != 0)
2076        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2077      }
2078    else break;
2079    }
2080  }
2081
2082/* If the next item is one that we can handle, get its value. A non-negative
2083value is a character, a negative value is an escape value. */
2084
2085if (*ptr == '\\')
2086  {
2087  int temperrorcode = 0;
2088  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2089  if (temperrorcode != 0) return FALSE;
2090  ptr++;    /* Point after the escape sequence */
2091  }
2092
2093else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2094  {
2095#ifdef SUPPORT_UTF8
2096  if (utf8) { GETCHARINC(next, ptr); } else
2097#endif
2098  next = *ptr++;
2099  }
2100
2101else return FALSE;
2102
2103/* Skip whitespace and comments in extended mode */
2104
2105if ((options & PCRE_EXTENDED) != 0)
2106  {
2107  for (;;)
2108    {
2109    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2110    if (*ptr == '#')
2111      {
2112      while (*(++ptr) != 0)
2113        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2114      }
2115    else break;
2116    }
2117  }
2118
2119/* If the next thing is itself optional, we have to give up. */
2120
2121if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2122  return FALSE;
2123
2124/* Now compare the next item with the previous opcode. If the previous is a
2125positive single character match, "item" either contains the character or, if
2126"item" is greater than 127 in utf8 mode, the character's bytes are in
2127utf8_char. */
2128
2129
2130/* Handle cases when the next item is a character. */
2131
2132if (next >= 0) switch(op_code)
2133  {
2134  case OP_CHAR:
2135#ifdef SUPPORT_UTF8
2136  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2137#endif
2138  return item != next;
2139
2140  /* For CHARNC (caseless character) we must check the other case. If we have
2141  Unicode property support, we can use it to test the other case of
2142  high-valued characters. */
2143
2144  case OP_CHARNC:
2145#ifdef SUPPORT_UTF8
2146  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2147#endif
2148  if (item == next) return FALSE;
2149#ifdef SUPPORT_UTF8
2150  if (utf8)
2151    {
2152    unsigned int othercase;
2153    if (next < 128) othercase = cd->fcc[next]; else
2154#ifdef SUPPORT_UCP
2155    othercase = _pcre_ucp_othercase((unsigned int)next);
2156#else
2157    othercase = NOTACHAR;
2158#endif
2159    return (unsigned int)item != othercase;
2160    }
2161  else
2162#endif  /* SUPPORT_UTF8 */
2163  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2164
2165  /* For OP_NOT, "item" must be a single-byte character. */
2166
2167  case OP_NOT:
2168  if (item == next) return TRUE;
2169  if ((options & PCRE_CASELESS) == 0) return FALSE;
2170#ifdef SUPPORT_UTF8
2171  if (utf8)
2172    {
2173    unsigned int othercase;
2174    if (next < 128) othercase = cd->fcc[next]; else
2175#ifdef SUPPORT_UCP
2176    othercase = _pcre_ucp_othercase(next);
2177#else
2178    othercase = NOTACHAR;
2179#endif
2180    return (unsigned int)item == othercase;
2181    }
2182  else
2183#endif  /* SUPPORT_UTF8 */
2184  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2185
2186  case OP_DIGIT:
2187  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2188
2189  case OP_NOT_DIGIT:
2190  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2191
2192  case OP_WHITESPACE:
2193  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2194
2195  case OP_NOT_WHITESPACE:
2196  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2197
2198  case OP_WORDCHAR:
2199  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2200
2201  case OP_NOT_WORDCHAR:
2202  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2203
2204  case OP_HSPACE:
2205  case OP_NOT_HSPACE:
2206  switch(next)
2207    {
2208    case 0x09:
2209    case 0x20:
2210    case 0xa0:
2211    case 0x1680:
2212    case 0x180e:
2213    case 0x2000:
2214    case 0x2001:
2215    case 0x2002:
2216    case 0x2003:
2217    case 0x2004:
2218    case 0x2005:
2219    case 0x2006:
2220    case 0x2007:
2221    case 0x2008:
2222    case 0x2009:
2223    case 0x200A:
2224    case 0x202f:
2225    case 0x205f:
2226    case 0x3000:
2227    return op_code != OP_HSPACE;
2228    default:
2229    return op_code == OP_HSPACE;
2230    }
2231
2232  case OP_VSPACE:
2233  case OP_NOT_VSPACE:
2234  switch(next)
2235    {
2236    case 0x0a:
2237    case 0x0b:
2238    case 0x0c:
2239    case 0x0d:
2240    case 0x85:
2241    case 0x2028:
2242    case 0x2029:
2243    return op_code != OP_VSPACE;
2244    default:
2245    return op_code == OP_VSPACE;
2246    }
2247
2248  default:
2249  return FALSE;
2250  }
2251
2252
2253/* Handle the case when the next item is \d, \s, etc. */
2254
2255switch(op_code)
2256  {
2257  case OP_CHAR:
2258  case OP_CHARNC:
2259#ifdef SUPPORT_UTF8
2260  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2261#endif
2262  switch(-next)
2263    {
2264    case ESC_d:
2265    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2266
2267    case ESC_D:
2268    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2269
2270    case ESC_s:
2271    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2272
2273    case ESC_S:
2274    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2275
2276    case ESC_w:
2277    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2278
2279    case ESC_W:
2280    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2281
2282    case ESC_h:
2283    case ESC_H:
2284    switch(item)
2285      {
2286      case 0x09:
2287      case 0x20:
2288      case 0xa0:
2289      case 0x1680:
2290      case 0x180e:
2291      case 0x2000:
2292      case 0x2001:
2293      case 0x2002:
2294      case 0x2003:
2295      case 0x2004:
2296      case 0x2005:
2297      case 0x2006:
2298      case 0x2007:
2299      case 0x2008:
2300      case 0x2009:
2301      case 0x200A:
2302      case 0x202f:
2303      case 0x205f:
2304      case 0x3000:
2305      return -next != ESC_h;
2306      default:
2307      return -next == ESC_h;
2308      }
2309
2310    case ESC_v:
2311    case ESC_V:
2312    switch(item)
2313      {
2314      case 0x0a:
2315      case 0x0b:
2316      case 0x0c:
2317      case 0x0d:
2318      case 0x85:
2319      case 0x2028:
2320      case 0x2029:
2321      return -next != ESC_v;
2322      default:
2323      return -next == ESC_v;
2324      }
2325
2326    default:
2327    return FALSE;
2328    }
2329
2330  case OP_DIGIT:
2331  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2332         next == -ESC_h || next == -ESC_v;
2333
2334  case OP_NOT_DIGIT:
2335  return next == -ESC_d;
2336
2337  case OP_WHITESPACE:
2338  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2339
2340  case OP_NOT_WHITESPACE:
2341  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2342
2343  case OP_HSPACE:
2344  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2345
2346  case OP_NOT_HSPACE:
2347  return next == -ESC_h;
2348
2349  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2350  case OP_VSPACE:
2351  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2352
2353  case OP_NOT_VSPACE:
2354  return next == -ESC_v;
2355
2356  case OP_WORDCHAR:
2357  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2358
2359  case OP_NOT_WORDCHAR:
2360  return next == -ESC_w || next == -ESC_d;
2361
2362  default:
2363  return FALSE;
2364  }
2365
2366/* Control does not reach here */
2367}
2368
2369
2370
2371/*************************************************
2372*           Compile one branch                   *
2373*************************************************/
2374
2375/* Scan the pattern, compiling it into a vector. If the options are
2376changed during the branch, the pointer is used to change the external options
2377bits. This function is used during the pre-compile phase when we are trying
2378to find out the amount of memory needed, as well as during the real compile
2379phase. The value of lengthptr distinguishes the two phases.
2380
2381Arguments:
2382  optionsptr     pointer to the option bits
2383  codeptr        points to the pointer to the current code point
2384  ptrptr         points to the current pattern pointer
2385  errorcodeptr   points to error code variable
2386  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2387  reqbyteptr     set to the last literal character required, else < 0
2388  bcptr          points to current branch chain
2389  cd             contains pointers to tables etc.
2390  lengthptr      NULL during the real compile phase
2391                 points to length accumulator during pre-compile phase
2392
2393Returns:         TRUE on success
2394                 FALSE, with *errorcodeptr set non-zero on error
2395*/
2396
2397static BOOL
2398compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2399  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2400  compile_data *cd, int *lengthptr)
2401{
2402int repeat_type, op_type;
2403int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2404int bravalue = 0;
2405int greedy_default, greedy_non_default;
2406int firstbyte, reqbyte;
2407int zeroreqbyte, zerofirstbyte;
2408int req_caseopt, reqvary, tempreqvary;
2409int options = *optionsptr;
2410int after_manual_callout = 0;
2411int length_prevgroup = 0;
2412register int c;
2413register uschar *code = *codeptr;
2414uschar *last_code = code;
2415uschar *orig_code = code;
2416uschar *tempcode;
2417BOOL inescq = FALSE;
2418BOOL groupsetfirstbyte = FALSE;
2419const uschar *ptr = *ptrptr;
2420const uschar *tempptr;
2421uschar *previous = NULL;
2422uschar *previous_callout = NULL;
2423uschar *save_hwm = NULL;
2424uschar classbits[32];
2425
2426#ifdef SUPPORT_UTF8
2427BOOL class_utf8;
2428BOOL utf8 = (options & PCRE_UTF8) != 0;
2429uschar *class_utf8data;
2430uschar *class_utf8data_base;
2431uschar utf8_char[6];
2432#else
2433BOOL utf8 = FALSE;
2434uschar *utf8_char = NULL;
2435#endif
2436
2437#ifdef DEBUG
2438if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2439#endif
2440
2441/* Set up the default and non-default settings for greediness */
2442
2443greedy_default = ((options & PCRE_UNGREEDY) != 0);
2444greedy_non_default = greedy_default ^ 1;
2445
2446/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2447matching encountered yet". It gets changed to REQ_NONE if we hit something that
2448matches a non-fixed char first char; reqbyte just remains unset if we never
2449find one.
2450
2451When we hit a repeat whose minimum is zero, we may have to adjust these values
2452to take the zero repeat into account. This is implemented by setting them to
2453zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2454item types that can be repeated set these backoff variables appropriately. */
2455
2456firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2457
2458/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2459according to the current setting of the caseless flag. REQ_CASELESS is a bit
2460value > 255. It is added into the firstbyte or reqbyte variables to record the
2461case status of the value. This is used only for ASCII characters. */
2462
2463req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2464
2465/* Switch on next character until the end of the branch */
2466
2467for (;; ptr++)
2468  {
2469  BOOL negate_class;
2470  BOOL should_flip_negation;
2471  BOOL possessive_quantifier;
2472  BOOL is_quantifier;
2473  BOOL is_recurse;
2474  BOOL reset_bracount;
2475  int class_charcount;
2476  int class_lastchar;
2477  int newoptions;
2478  int recno;
2479  int refsign;
2480  int skipbytes;
2481  int subreqbyte;
2482  int subfirstbyte;
2483  int terminator;
2484  int mclength;
2485  uschar mcbuffer[8];
2486
2487  /* Get next byte in the pattern */
2488
2489  c = *ptr;
2490
2491  /* If we are in the pre-compile phase, accumulate the length used for the
2492  previous cycle of this loop. */
2493
2494  if (lengthptr != NULL)
2495    {
2496#ifdef DEBUG
2497    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2498#endif
2499    if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2500      {
2501      *errorcodeptr = ERR52;
2502      goto FAILED;
2503      }
2504
2505    /* There is at least one situation where code goes backwards: this is the
2506    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2507    the class is simply eliminated. However, it is created first, so we have to
2508    allow memory for it. Therefore, don't ever reduce the length at this point.
2509    */
2510
2511    if (code < last_code) code = last_code;
2512
2513    /* Paranoid check for integer overflow */
2514
2515    if (OFLOW_MAX - *lengthptr < code - last_code)
2516      {
2517      *errorcodeptr = ERR20;
2518      goto FAILED;
2519      }
2520
2521    *lengthptr += code - last_code;
2522    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2523
2524    /* If "previous" is set and it is not at the start of the work space, move
2525    it back to there, in order to avoid filling up the work space. Otherwise,
2526    if "previous" is NULL, reset the current code pointer to the start. */
2527
2528    if (previous != NULL)
2529      {
2530      if (previous > orig_code)
2531        {
2532        memmove(orig_code, previous, code - previous);
2533        code -= previous - orig_code;
2534        previous = orig_code;
2535        }
2536      }
2537    else code = orig_code;
2538
2539    /* Remember where this code item starts so we can pick up the length
2540    next time round. */
2541
2542    last_code = code;
2543    }
2544
2545  /* In the real compile phase, just check the workspace used by the forward
2546  reference list. */
2547
2548  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2549    {
2550    *errorcodeptr = ERR52;
2551    goto FAILED;
2552    }
2553
2554  /* If in \Q...\E, check for the end; if not, we have a literal */
2555
2556  if (inescq && c != 0)
2557    {
2558    if (c == '\\' && ptr[1] == 'E')
2559      {
2560      inescq = FALSE;
2561      ptr++;
2562      continue;
2563      }
2564    else
2565      {
2566      if (previous_callout != NULL)
2567        {
2568        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2569          complete_callout(previous_callout, ptr, cd);
2570        previous_callout = NULL;
2571        }
2572      if ((options & PCRE_AUTO_CALLOUT) != 0)
2573        {
2574        previous_callout = code;
2575        code = auto_callout(code, ptr, cd);
2576        }
2577      goto NORMAL_CHAR;
2578      }
2579    }
2580
2581  /* Fill in length of a previous callout, except when the next thing is
2582  a quantifier. */
2583
2584  is_quantifier = c == '*' || c == '+' || c == '?' ||
2585    (c == '{' && is_counted_repeat(ptr+1));
2586
2587  if (!is_quantifier && previous_callout != NULL &&
2588       after_manual_callout-- <= 0)
2589    {
2590    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2591      complete_callout(previous_callout, ptr, cd);
2592    previous_callout = NULL;
2593    }
2594
2595  /* In extended mode, skip white space and comments */
2596
2597  if ((options & PCRE_EXTENDED) != 0)
2598    {
2599    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2600    if (c == '#')
2601      {
2602      while (*(++ptr) != 0)
2603        {
2604        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2605        }
2606      if (*ptr != 0) continue;
2607
2608      /* Else fall through to handle end of string */
2609      c = 0;
2610      }
2611    }
2612
2613  /* No auto callout for quantifiers. */
2614
2615  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2616    {
2617    previous_callout = code;
2618    code = auto_callout(code, ptr, cd);
2619    }
2620
2621  switch(c)
2622    {
2623    /* ===================================================================*/
2624    case 0:                        /* The branch terminates at string end */
2625    case '|':                      /* or | or ) */
2626    case ')':
2627    *firstbyteptr = firstbyte;
2628    *reqbyteptr = reqbyte;
2629    *codeptr = code;
2630    *ptrptr = ptr;
2631    if (lengthptr != NULL)
2632      {
2633      if (OFLOW_MAX - *lengthptr < code - last_code)
2634        {
2635        *errorcodeptr = ERR20;
2636        goto FAILED;
2637        }
2638      *lengthptr += code - last_code;   /* To include callout length */
2639      DPRINTF((">> end branch\n"));
2640      }
2641    return TRUE;
2642
2643
2644    /* ===================================================================*/
2645    /* Handle single-character metacharacters. In multiline mode, ^ disables
2646    the setting of any following char as a first character. */
2647
2648    case '^':
2649    if ((options & PCRE_MULTILINE) != 0)
2650      {
2651      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2652      }
2653    previous = NULL;
2654    *code++ = OP_CIRC;
2655    break;
2656
2657    case '$':
2658    previous = NULL;
2659    *code++ = OP_DOLL;
2660    break;
2661
2662    /* There can never be a first char if '.' is first, whatever happens about
2663    repeats. The value of reqbyte doesn't change either. */
2664
2665    case '.':
2666    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2667    zerofirstbyte = firstbyte;
2668    zeroreqbyte = reqbyte;
2669    previous = code;
2670    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2671    break;
2672
2673
2674    /* ===================================================================*/
2675    /* Character classes. If the included characters are all < 256, we build a
2676    32-byte bitmap of the permitted characters, except in the special case
2677    where there is only one such character. For negated classes, we build the
2678    map as usual, then invert it at the end. However, we use a different opcode
2679    so that data characters > 255 can be handled correctly.
2680
2681    If the class contains characters outside the 0-255 range, a different
2682    opcode is compiled. It may optionally have a bit map for characters < 256,
2683    but those above are explicitly listed afterwards. A flag byte tells
2684    whether the bitmap is present, and whether this is a negated class or not.
2685
2686    In JavaScript compatibility mode, an isolated ']' causes an error. In
2687    default (Perl) mode, it is treated as a data character. */
2688
2689    case ']':
2690    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2691      {
2692      *errorcodeptr = ERR64;
2693      goto FAILED;
2694      }
2695    goto NORMAL_CHAR;
2696
2697    case '[':
2698    previous = code;
2699
2700    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2701    they are encountered at the top level, so we'll do that too. */
2702
2703    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2704        check_posix_syntax(ptr, &tempptr))
2705      {
2706      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2707      goto FAILED;
2708      }
2709
2710    /* If the first character is '^', set the negation flag and skip it. Also,
2711    if the first few characters (either before or after ^) are \Q\E or \E we
2712    skip them too. This makes for compatibility with Perl. */
2713
2714    negate_class = FALSE;
2715    for (;;)
2716      {
2717      c = *(++ptr);
2718      if (c == '\\')
2719        {
2720        if (ptr[1] == 'E') ptr++;
2721          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2722            else break;
2723        }
2724      else if (!negate_class && c == '^')
2725        negate_class = TRUE;
2726      else break;
2727      }
2728
2729    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2730    an initial ']' is taken as a data character -- the code below handles
2731    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2732    [^] must match any character, so generate OP_ALLANY. */
2733
2734    if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2735      {
2736      *code++ = negate_class? OP_ALLANY : OP_FAIL;
2737      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738      zerofirstbyte = firstbyte;
2739      break;
2740      }
2741
2742    /* If a class contains a negative special such as \S, we need to flip the
2743    negation flag at the end, so that support for characters > 255 works
2744    correctly (they are all included in the class). */
2745
2746    should_flip_negation = FALSE;
2747
2748    /* Keep a count of chars with values < 256 so that we can optimize the case
2749    of just a single character (as long as it's < 256). However, for higher
2750    valued UTF-8 characters, we don't yet do any optimization. */
2751
2752    class_charcount = 0;
2753    class_lastchar = -1;
2754
2755    /* Initialize the 32-char bit map to all zeros. We build the map in a
2756    temporary bit of memory, in case the class contains only 1 character (less
2757    than 256), because in that case the compiled code doesn't use the bit map.
2758    */
2759
2760    memset(classbits, 0, 32 * sizeof(uschar));
2761
2762#ifdef SUPPORT_UTF8
2763    class_utf8 = FALSE;                       /* No chars >= 256 */
2764    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2765    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2766#endif
2767
2768    /* Process characters until ] is reached. By writing this as a "do" it
2769    means that an initial ] is taken as a data character. At the start of the
2770    loop, c contains the first byte of the character. */
2771
2772    if (c != 0) do
2773      {
2774      const uschar *oldptr;
2775
2776#ifdef SUPPORT_UTF8
2777      if (utf8 && c > 127)
2778        {                           /* Braces are required because the */
2779        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2780        }
2781
2782      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2783      data and reset the pointer. This is so that very large classes that
2784      contain a zillion UTF-8 characters no longer overwrite the work space
2785      (which is on the stack). */
2786
2787      if (lengthptr != NULL)
2788        {
2789        *lengthptr += class_utf8data - class_utf8data_base;
2790        class_utf8data = class_utf8data_base;
2791        }
2792
2793#endif
2794
2795      /* Inside \Q...\E everything is literal except \E */
2796
2797      if (inescq)
2798        {
2799        if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2800          {
2801          inescq = FALSE;                   /* Reset literal state */
2802          ptr++;                            /* Skip the 'E' */
2803          continue;                         /* Carry on with next */
2804          }
2805        goto CHECK_RANGE;                   /* Could be range if \E follows */
2806        }
2807
2808      /* Handle POSIX class names. Perl allows a negation extension of the
2809      form [:^name:]. A square bracket that doesn't match the syntax is
2810      treated as a literal. We also recognize the POSIX constructions
2811      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2812      5.6 and 5.8 do. */
2813
2814      if (c == '[' &&
2815          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2816          check_posix_syntax(ptr, &tempptr))
2817        {
2818        BOOL local_negate = FALSE;
2819        int posix_class, taboffset, tabopt;
2820        register const uschar *cbits = cd->cbits;
2821        uschar pbits[32];
2822
2823        if (ptr[1] != ':')
2824          {
2825          *errorcodeptr = ERR31;
2826          goto FAILED;
2827          }
2828
2829        ptr += 2;
2830        if (*ptr == '^')
2831          {
2832          local_negate = TRUE;
2833          should_flip_negation = TRUE;  /* Note negative special */
2834          ptr++;
2835          }
2836
2837        posix_class = check_posix_name(ptr, tempptr - ptr);
2838        if (posix_class < 0)
2839          {
2840          *errorcodeptr = ERR30;
2841          goto FAILED;
2842          }
2843
2844        /* If matching is caseless, upper and lower are converted to
2845        alpha. This relies on the fact that the class table starts with
2846        alpha, lower, upper as the first 3 entries. */
2847
2848        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2849          posix_class = 0;
2850
2851        /* We build the bit map for the POSIX class in a chunk of local store
2852        because we may be adding and subtracting from it, and we don't want to
2853        subtract bits that may be in the main map already. At the end we or the
2854        result into the bit map that is being built. */
2855
2856        posix_class *= 3;
2857
2858        /* Copy in the first table (always present) */
2859
2860        memcpy(pbits, cbits + posix_class_maps[posix_class],
2861          32 * sizeof(uschar));
2862
2863        /* If there is a second table, add or remove it as required. */
2864
2865        taboffset = posix_class_maps[posix_class + 1];
2866        tabopt = posix_class_maps[posix_class + 2];
2867
2868        if (taboffset >= 0)
2869          {
2870          if (tabopt >= 0)
2871            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2872          else
2873            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2874          }
2875
2876        /* Now see if we need to remove any special characters. An option
2877        value of 1 removes vertical space and 2 removes underscore. */
2878
2879        if (tabopt < 0) tabopt = -tabopt;
2880        if (tabopt == 1) pbits[1] &= ~0x3c;
2881          else if (tabopt == 2) pbits[11] &= 0x7f;
2882
2883        /* Add the POSIX table or its complement into the main table that is
2884        being built and we are done. */
2885
2886        if (local_negate)
2887          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2888        else
2889          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2890
2891        ptr = tempptr + 1;
2892        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2893        continue;    /* End of POSIX syntax handling */
2894        }
2895
2896      /* Backslash may introduce a single character, or it may introduce one
2897      of the specials, which just set a flag. The sequence \b is a special
2898      case. Inside a class (and only there) it is treated as backspace.
2899      Elsewhere it marks a word boundary. Other escapes have preset maps ready
2900      to 'or' into the one we are building. We assume they have more than one
2901      character in them, so set class_charcount bigger than one. */
2902
2903      if (c == '\\')
2904        {
2905        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2906        if (*errorcodeptr != 0) goto FAILED;
2907
2908        if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2909        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2910        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2911        else if (-c == ESC_Q)            /* Handle start of quoted string */
2912          {
2913          if (ptr[1] == '\\' && ptr[2] == 'E')
2914            {
2915            ptr += 2; /* avoid empty string */
2916            }
2917          else inescq = TRUE;
2918          continue;
2919          }
2920        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2921
2922        if (c < 0)
2923          {
2924          register const uschar *cbits = cd->cbits;
2925          class_charcount += 2;     /* Greater than 1 is what matters */
2926
2927          /* Save time by not doing this in the pre-compile phase. */
2928
2929          if (lengthptr == NULL) switch (-c)
2930            {
2931            case ESC_d:
2932            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2933            continue;
2934
2935            case ESC_D:
2936            should_flip_negation = TRUE;
2937            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2938            continue;
2939
2940            case ESC_w:
2941            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2942            continue;
2943
2944            case ESC_W:
2945            should_flip_negation = TRUE;
2946            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2947            continue;
2948
2949            case ESC_s:
2950            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2951            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2952            continue;
2953
2954            case ESC_S:
2955            should_flip_negation = TRUE;
2956            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2957            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2958            continue;
2959
2960            default:    /* Not recognized; fall through */
2961            break;      /* Need "default" setting to stop compiler warning. */
2962            }
2963
2964          /* In the pre-compile phase, just do the recognition. */
2965
2966          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2967                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2968
2969          /* We need to deal with \H, \h, \V, and \v in both phases because
2970          they use extra memory. */
2971
2972          if (-c == ESC_h)
2973            {
2974            SETBIT(classbits, 0x09); /* HT */
2975            SETBIT(classbits, 0x20); /* SPACE */
2976            SETBIT(classbits, 0xa0); /* NSBP */
2977#ifdef SUPPORT_UTF8
2978            if (utf8)
2979              {
2980              class_utf8 = TRUE;
2981              *class_utf8data++ = XCL_SINGLE;
2982              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2983              *class_utf8data++ = XCL_SINGLE;
2984              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2985              *class_utf8data++ = XCL_RANGE;
2986              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2987              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2988              *class_utf8data++ = XCL_SINGLE;
2989              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2990              *class_utf8data++ = XCL_SINGLE;
2991              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2992              *class_utf8data++ = XCL_SINGLE;
2993              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2994              }
2995#endif
2996            continue;
2997            }
2998
2999          if (-c == ESC_H)
3000            {
3001            for (c = 0; c < 32; c++)
3002              {
3003              int x = 0xff;
3004              switch (c)
3005                {
3006                case 0x09/8: x ^= 1 << (0x09%8); break;
3007                case 0x20/8: x ^= 1 << (0x20%8); break;
3008                case 0xa0/8: x ^= 1 << (0xa0%8); break;
3009                default: break;
3010                }
3011              classbits[c] |= x;
3012              }
3013
3014#ifdef SUPPORT_UTF8
3015            if (utf8)
3016              {
3017              class_utf8 = TRUE;
3018              *class_utf8data++ = XCL_RANGE;
3019              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3020              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3021              *class_utf8data++ = XCL_RANGE;
3022              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3023              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3024              *class_utf8data++ = XCL_RANGE;
3025              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3026              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3027              *class_utf8data++ = XCL_RANGE;
3028              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3029              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3030              *class_utf8data++ = XCL_RANGE;
3031              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3032              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3033              *class_utf8data++ = XCL_RANGE;
3034              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3035              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3036              *class_utf8data++ = XCL_RANGE;
3037              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3038              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3039              }
3040#endif
3041            continue;
3042            }
3043
3044          if (-c == ESC_v)
3045            {
3046            SETBIT(classbits, 0x0a); /* LF */
3047            SETBIT(classbits, 0x0b); /* VT */
3048            SETBIT(classbits, 0x0c); /* FF */
3049            SETBIT(classbits, 0x0d); /* CR */
3050            SETBIT(classbits, 0x85); /* NEL */
3051#ifdef SUPPORT_UTF8
3052            if (utf8)
3053              {
3054              class_utf8 = TRUE;
3055              *class_utf8data++ = XCL_RANGE;
3056              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3057              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3058              }
3059#endif
3060            continue;
3061            }
3062
3063          if (-c == ESC_V)
3064            {
3065            for (c = 0; c < 32; c++)
3066              {
3067              int x = 0xff;
3068              switch (c)
3069                {
3070                case 0x0a/8: x ^= 1 << (0x0a%8);
3071                             x ^= 1 << (0x0b%8);
3072                             x ^= 1 << (0x0c%8);
3073                             x ^= 1 << (0x0d%8);
3074                             break;
3075                case 0x85/8: x ^= 1 << (0x85%8); break;
3076                default: break;
3077                }
3078              classbits[c] |= x;
3079              }
3080
3081#ifdef SUPPORT_UTF8
3082            if (utf8)                /* \V above 255: everything but U+2028-2029 */
3083              {
3084              class_utf8 = TRUE;
3085              *class_utf8data++ = XCL_RANGE;   /* Up to just before U+2028 (LS) */
3086              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3087              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3088              *class_utf8data++ = XCL_RANGE;   /* Resume just after U+2029 (PS) */
3089              class_utf8data += _pcre_ord2utf8(0x202a, class_utf8data);
3090              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3091              }
3092#endif
3093            continue;
3094            }
3095
3096          /* We need to deal with \P and \p in both phases. */
3097
3098#ifdef SUPPORT_UCP
3099          if (-c == ESC_p || -c == ESC_P)
3100            {
3101            BOOL negated;
3102            int pdata;
3103            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3104            if (ptype < 0) goto FAILED;
3105            class_utf8 = TRUE;
3106            *class_utf8data++ = ((-c == ESC_p) != negated)?
3107              XCL_PROP : XCL_NOTPROP;
3108            *class_utf8data++ = ptype;
3109            *class_utf8data++ = pdata;
3110            class_charcount -= 2;   /* Not a < 256 character */
3111            continue;
3112            }
3113#endif
3114          /* Unrecognized escapes are faulted if PCRE is running in its
3115          strict mode. By default, for compatibility with Perl, they are
3116          treated as literals. */
3117
3118          if ((options & PCRE_EXTRA) != 0)
3119            {
3120            *errorcodeptr = ERR7;
3121            goto FAILED;
3122            }
3123
3124          class_charcount -= 2;  /* Undo the default count from above */
3125          c = *ptr;              /* Get the final character and fall through */
3126          }
3127
3128        /* Fall through if we have a single character (c >= 0). This may be
3129        greater than 256 in UTF-8 mode. */
3130
3131        }   /* End of backslash handling */
3132
3133      /* A single character may be followed by '-' to form a range. However,
3134      Perl does not permit ']' to be the end of the range. A '-' character
3135      at the end is treated as a literal. Perl ignores orphaned \E sequences
3136      entirely. The code for handling \Q and \E is messy. */
3137
3138      CHECK_RANGE:
3139      while (ptr[1] == '\\' && ptr[2] == 'E')
3140        {
3141        inescq = FALSE;
3142        ptr += 2;
3143        }
3144
3145      oldptr = ptr;
3146
3147      /* Remember \r or \n */
3148
3149      if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3150
3151      /* Check for range */
3152
3153      if (!inescq && ptr[1] == '-')
3154        {
3155        int d;
3156        ptr += 2;
3157        while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3158
3159        /* If we hit \Q (not followed by \E) at this point, go into escaped
3160        mode. */
3161
3162        while (*ptr == '\\' && ptr[1] == 'Q')
3163          {
3164          ptr += 2;
3165          if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3166          inescq = TRUE;
3167          break;
3168          }
3169
3170        if (*ptr == 0 || (!inescq && *ptr == ']'))
3171          {
3172          ptr = oldptr;
3173          goto LONE_SINGLE_CHARACTER;
3174          }
3175
3176#ifdef SUPPORT_UTF8
3177        if (utf8)
3178          {                           /* Braces are required because the */
3179          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3180          }
3181        else
3182#endif
3183        d = *ptr;  /* Not UTF-8 mode */
3184
3185        /* The second part of a range can be a single-character escape, but
3186        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3187        in such circumstances. */
3188
3189        if (!inescq && d == '\\')
3190          {
3191          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3192          if (*errorcodeptr != 0) goto FAILED;
3193
3194          /* \b is backspace; \X is literal X; \R is literal R; any other
3195          special means the '-' was literal */
3196
3197          if (d < 0)
3198            {
3199            if (d == -ESC_b) d = '\b';
3200            else if (d == -ESC_X) d = 'X';
3201            else if (d == -ESC_R) d = 'R'; else
3202              {
3203              ptr = oldptr;
3204              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3205              }
3206            }
3207          }
3208
3209        /* Check that the two values are in the correct order. Optimize
3210        one-character ranges */
3211
3212        if (d < c)
3213          {
3214          *errorcodeptr = ERR8;
3215          goto FAILED;
3216          }
3217
3218        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3219
3220        /* Remember \r or \n */
3221
3222        if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3223
3224        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3225        matching, we have to use an XCLASS with extra data items. Caseless
3226        matching for characters > 127 is available only if UCP support is
3227        available. */
3228
3229#ifdef SUPPORT_UTF8
3230        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3231          {
3232          class_utf8 = TRUE;
3233
3234          /* With UCP support, we can find the other case equivalents of
3235          the relevant characters. There may be several ranges. Optimize how
3236          they fit with the basic range. */
3237
3238#ifdef SUPPORT_UCP
3239          if ((options & PCRE_CASELESS) != 0)
3240            {
3241            unsigned int occ, ocd;
3242            unsigned int cc = c;
3243            unsigned int origd = d;
3244            while (get_othercase_range(&cc, origd, &occ, &ocd))
3245              {
3246              if (occ >= (unsigned int)c &&
3247                  ocd <= (unsigned int)d)
3248                continue;                          /* Skip embedded ranges */
3249
3250              if (occ < (unsigned int)c  &&
3251                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3252                {                                  /* if there is overlap,   */
3253                c = occ;                           /* noting that if occ < c */
3254                continue;                          /* we can't have ocd > d  */
3255                }                                  /* because a subrange is  */
3256              if (ocd > (unsigned int)d &&
3257                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3258                {                                  /* the basic range.       */
3259                d = ocd;
3260                continue;
3261                }
3262
3263              if (occ == ocd)
3264                {
3265                *class_utf8data++ = XCL_SINGLE;
3266                }
3267              else
3268                {
3269                *class_utf8data++ = XCL_RANGE;
3270                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3271                }
3272              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3273              }
3274            }
3275#endif  /* SUPPORT_UCP */
3276
3277          /* Now record the original range, possibly modified for UCP caseless
3278          overlapping ranges. */
3279
3280          *class_utf8data++ = XCL_RANGE;
3281          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3282          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3283
3284          /* With UCP support, we are done. Without UCP support, there is no
3285          caseless matching for UTF-8 characters > 127; we can use the bit map
3286          for the smaller ones. */
3287
3288#ifdef SUPPORT_UCP
3289          continue;    /* With next character in the class */
3290#else
3291          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3292
3293          /* Adjust upper limit and fall through to set up the map */
3294
3295          d = 127;
3296
3297#endif  /* SUPPORT_UCP */
3298          }
3299#endif  /* SUPPORT_UTF8 */
3300
3301        /* We use the bit map for all cases when not in UTF-8 mode; else
3302        ranges that lie entirely within 0-127 when there is UCP support; else
3303        for partial ranges without UCP support. */
3304
3305        class_charcount += d - c + 1;
3306        class_lastchar = d;
3307
3308        /* We can save a bit of time by skipping this in the pre-compile. */
3309
3310        if (lengthptr == NULL) for (; c <= d; c++)
3311          {
3312          classbits[c/8] |= (1 << (c&7));
3313          if ((options & PCRE_CASELESS) != 0)
3314            {
3315            int uc = cd->fcc[c];           /* flip case */
3316            classbits[uc/8] |= (1 << (uc&7));
3317            }
3318          }
3319
3320        continue;   /* Go get the next char in the class */
3321        }
3322
3323      /* Handle a lone single character - we can get here for a normal
3324      non-escape char, or after \ that introduces a single character or for an
3325      apparent range that isn't. */
3326
3327      LONE_SINGLE_CHARACTER:
3328
3329      /* Handle a character that cannot go in the bit map */
3330
3331#ifdef SUPPORT_UTF8
3332      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3333        {
3334        class_utf8 = TRUE;
3335        *class_utf8data++ = XCL_SINGLE;
3336        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3337
3338#ifdef SUPPORT_UCP
3339        if ((options & PCRE_CASELESS) != 0)
3340          {
3341          unsigned int othercase;
3342          if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3343            {
3344            *class_utf8data++ = XCL_SINGLE;
3345            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3346            }
3347          }
3348#endif  /* SUPPORT_UCP */
3349
3350        }
3351      else
3352#endif  /* SUPPORT_UTF8 */
3353
3354      /* Handle a single-byte character */
3355        {
3356        classbits[c/8] |= (1 << (c&7));
3357        if ((options & PCRE_CASELESS) != 0)
3358          {
3359          c = cd->fcc[c];   /* flip case */
3360          classbits[c/8] |= (1 << (c&7));
3361          }
3362        class_charcount++;
3363        class_lastchar = c;
3364        }
3365      }
3366
3367    /* Loop until ']' reached. This "while" is the end of the "do" above. */
3368
3369    while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3370
3371    if (c == 0)                          /* Missing terminating ']' */
3372      {
3373      *errorcodeptr = ERR6;
3374      goto FAILED;
3375      }
3376
3377
3378/* This code has been disabled because it would mean that \s counts as
3379an explicit \r or \n reference, and that's not really what is wanted. Now
3380we set the flag only if there is a literal "\r" or "\n" in the class. */
3381
3382#if 0
3383    /* Remember whether \r or \n are in this class */
3384
3385    if (negate_class)
3386      {
3387      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3388      }
3389    else
3390      {
3391      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3392      }
3393#endif
3394
3395
3396    /* If class_charcount is 1, we saw precisely one character whose value is
3397    less than 256. As long as there were no characters >= 128 and there was no
3398    use of \p or \P, in other words, no use of any XCLASS features, we can
3399    optimize.
3400
3401    In UTF-8 mode, we can optimize the negative case only if there were no
3402    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3403    operate on single-bytes only. This is an historical hangover. Maybe one day
3404    we can tidy these opcodes to handle multi-byte characters.
3405
3406    The optimization throws away the bit map. We turn the item into a
3407    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3408    that OP_NOT does not support multibyte characters. In the positive case, it
3409    can cause firstbyte to be set. Otherwise, there can be no first char if
3410    this item is first, whatever repeat count may follow. In the case of
3411    reqbyte, save the previous value for reinstating. */
3412
3413#ifdef SUPPORT_UTF8
3414    if (class_charcount == 1 && !class_utf8 &&
3415      (!utf8 || !negate_class || class_lastchar < 128))
3416#else
3417    if (class_charcount == 1)
3418#endif
3419      {
3420      zeroreqbyte = reqbyte;
3421
3422      /* The OP_NOT opcode works on one-byte characters only. */
3423
3424      if (negate_class)
3425        {
3426        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3427        zerofirstbyte = firstbyte;
3428        *code++ = OP_NOT;
3429        *code++ = class_lastchar;
3430        break;
3431        }
3432
3433      /* For a single, positive character, get the value into mcbuffer, and
3434      then we can handle this with the normal one-character code. */
3435
3436#ifdef SUPPORT_UTF8
3437      if (utf8 && class_lastchar > 127)
3438        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3439      else
3440#endif
3441        {
3442        mcbuffer[0] = class_lastchar;
3443        mclength = 1;
3444        }
3445      goto ONE_CHAR;
3446      }       /* End of 1-char optimization */
3447
3448    /* The general case - not the one-char optimization. If this is the first
3449    thing in the branch, there can be no first char setting, whatever the
3450    repeat count. Any reqbyte setting must remain unchanged after any kind of
3451    repeat. */
3452
3453    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3454    zerofirstbyte = firstbyte;
3455    zeroreqbyte = reqbyte;
3456
3457    /* If there are characters with values > 255, we have to compile an
3458    extended class, with its own opcode, unless there was a negated special
3459    such as \S in the class, because in that case all characters > 255 are in
3460    the class, so any that were explicitly given as well can be ignored. If
3461    (when there are explicit characters > 255 that must be listed) there are no
3462    characters < 256, we can omit the bitmap in the actual compiled code. */
3463
3464#ifdef SUPPORT_UTF8
3465    if (class_utf8 && !should_flip_negation)
3466      {
3467      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3468      *code++ = OP_XCLASS;
3469      code += LINK_SIZE;
3470      *code = negate_class? XCL_NOT : 0;
3471
3472      /* If the map is required, move up the extra data to make room for it;
3473      otherwise just move the code pointer to the end of the extra data. */
3474
3475      if (class_charcount > 0)
3476        {
3477        *code++ |= XCL_MAP;
3478        memmove(code + 32, code, class_utf8data - code);
3479        memcpy(code, classbits, 32);
3480        code = class_utf8data + 32;
3481        }
3482      else code = class_utf8data;
3483
3484      /* Now fill in the complete length of the item */
3485
3486      PUT(previous, 1, code - previous);
3487      break;   /* End of class handling */
3488      }
3489#endif
3490
3491    /* If there are no characters > 255, set the opcode to OP_CLASS or
3492    OP_NCLASS, depending on whether the whole class was negated and whether
3493    there were negative specials such as \S in the class. Then copy the 32-byte
3494    map into the code vector, negating it if necessary. */
3495
3496    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3497    if (negate_class)
3498      {
3499      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3500        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3501      }
3502    else
3503      {
3504      memcpy(code, classbits, 32);
3505      }
3506    code += 32;
3507    break;
3508
3509
3510    /* ===================================================================*/
3511    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3512    has been tested above. */
3513
3514    case '{':
3515    if (!is_quantifier) goto NORMAL_CHAR;
3516    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3517    if (*errorcodeptr != 0) goto FAILED;
3518    goto REPEAT;
3519
3520    case '*':
3521    repeat_min = 0;
3522    repeat_max = -1;
3523    goto REPEAT;
3524
3525    case '+':
3526    repeat_min = 1;
3527    repeat_max = -1;
3528    goto REPEAT;
3529
3530    case '?':
3531    repeat_min = 0;
3532    repeat_max = 1;
3533
3534    REPEAT:
3535    if (previous == NULL)
3536      {
3537      *errorcodeptr = ERR9;
3538      goto FAILED;
3539      }
3540
3541    if (repeat_min == 0)
3542      {
3543      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3544      reqbyte = zeroreqbyte;        /* Ditto */
3545      }
3546
3547    /* Remember whether this is a variable length repeat */
3548
3549    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3550
3551    op_type = 0;                    /* Default single-char op codes */
3552    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3553
3554    /* Save start of previous item, in case we have to move it up to make space
3555    for an inserted OP_ONCE for the additional '+' extension. */
3556
3557    tempcode = previous;
3558
3559    /* If the next character is '+', we have a possessive quantifier. This
3560    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3561    If the next character is '?' this is a minimizing repeat, by default,
3562    but if PCRE_UNGREEDY is set, it works the other way round. We change the
3563    repeat type to the non-default. */
3564
3565    if (ptr[1] == '+')
3566      {
3567      repeat_type = 0;                  /* Force greedy */
3568      possessive_quantifier = TRUE;
3569      ptr++;
3570      }
3571    else if (ptr[1] == '?')
3572      {
3573      repeat_type = greedy_non_default;
3574      ptr++;
3575      }
3576    else repeat_type = greedy_default;
3577
3578    /* If previous was a character match, abolish the item and generate a
3579    repeat item instead. If a char item has a minimum of more than one, ensure
3580    that it is set in reqbyte - it might not be if a sequence such as x{3} is
3581    the first thing in a branch because the x will have gone into firstbyte
3582    instead.  */
3583
3584    if (*previous == OP_CHAR || *previous == OP_CHARNC)
3585      {
3586      /* Deal with UTF-8 characters that take up more than one byte. It's
3587      easier to write this out separately than try to macrify it. Use c to
3588      hold the length of the character in bytes, plus 0x80 to flag that it's a
3589      length rather than a small character. */
3590
3591#ifdef SUPPORT_UTF8
3592      if (utf8 && (code[-1] & 0x80) != 0)
3593        {
3594        uschar *lastchar = code - 1;
3595        while((*lastchar & 0xc0) == 0x80) lastchar--;
3596        c = code - lastchar;            /* Length of UTF-8 character */
3597        memcpy(utf8_char, lastchar, c); /* Save the char */
3598        c |= 0x80;                      /* Flag c as a length */
3599        }
3600      else
3601#endif
3602
3603      /* Handle the case of a single byte - either with no UTF8 support, or
3604      with UTF-8 disabled, or for a UTF-8 character < 128. */
3605
3606        {
3607        c = code[-1];
3608        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3609        }
3610
3611      /* If the repetition is unlimited, it pays to see if the next thing on
3612      the line is something that cannot possibly match this character. If so,
3613      automatically possessifying this item gains some performance in the case
3614      where the match fails. */
3615
3616      if (!possessive_quantifier &&
3617          repeat_max < 0 &&
3618          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3619            options, cd))
3620        {
3621        repeat_type = 0;    /* Force greedy */
3622        possessive_quantifier = TRUE;
3623        }
3624
3625      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3626      }
3627
3628    /* If previous was a single negated character ([^a] or similar), we use
3629    one of the special opcodes, replacing it. The code is shared with single-
3630    character repeats by setting op_type to add a suitable offset into
3631    repeat_type. We can also test for auto-possessification. OP_NOT is
3632    currently used only for single-byte chars. */
3633
3634    else if (*previous == OP_NOT)
3635      {
3636      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3637      c = previous[1];
3638      if (!possessive_quantifier &&
3639          repeat_max < 0 &&
3640          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3641        {
3642        repeat_type = 0;    /* Force greedy */
3643        possessive_quantifier = TRUE;
3644        }
3645      goto OUTPUT_SINGLE_REPEAT;
3646      }
3647
3648    /* If previous was a character type match (\d or similar), abolish it and
3649    create a suitable repeat item. The code is shared with single-character
3650    repeats by setting op_type to add a suitable offset into repeat_type. Note
3651    that the Unicode property types will be present only when SUPPORT_UCP is
3652    defined, but we don't wrap the little bits of code here because it just
3653    makes it horribly messy. */
3654
3655    else if (*previous < OP_EODN)
3656      {
3657      uschar *oldcode;
3658      int prop_type, prop_value;
3659      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3660      c = *previous;
3661
3662      if (!possessive_quantifier &&
3663          repeat_max < 0 &&
3664          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3665        {
3666        repeat_type = 0;    /* Force greedy */
3667        possessive_quantifier = TRUE;
3668        }
3669
3670      OUTPUT_SINGLE_REPEAT:
3671      if (*previous == OP_PROP || *previous == OP_NOTPROP)
3672        {
3673        prop_type = previous[1];
3674        prop_value = previous[2];
3675        }
3676      else prop_type = prop_value = -1;
3677
3678      oldcode = code;
3679      code = previous;                  /* Usually overwrite previous item */
3680
3681      /* If the maximum is zero then the minimum must also be zero; Perl allows
3682      this case, so we do too - by simply omitting the item altogether. */
3683
3684      if (repeat_max == 0) goto END_REPEAT;
3685
3686      /* All real repeats make it impossible to handle partial matching (maybe
3687      one day we will be able to remove this restriction). */
3688
3689      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3690
3691      /* Combine the op_type with the repeat_type */
3692
3693      repeat_type += op_type;
3694
3695      /* A minimum of zero is handled either as the special case * or ?, or as
3696      an UPTO, with the maximum given. */
3697
3698      if (repeat_min == 0)
3699        {
3700        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3701          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3702        else
3703          {
3704          *code++ = OP_UPTO + repeat_type;
3705          PUT2INC(code, 0, repeat_max);
3706          }
3707        }
3708
3709      /* A repeat minimum of 1 is optimized into some special cases. If the
3710      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3711      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3712      one less than the maximum. */
3713
3714      else if (repeat_min == 1)
3715        {
3716        if (repeat_max == -1)
3717          *code++ = OP_PLUS + repeat_type;
3718        else
3719          {
3720          code = oldcode;                 /* leave previous item in place */
3721          if (repeat_max == 1) goto END_REPEAT;
3722          *code++ = OP_UPTO + repeat_type;
3723          PUT2INC(code, 0, repeat_max - 1);
3724          }
3725        }
3726
3727      /* The case {n,n} is just an EXACT, while the general case {n,m} is
3728      handled as an EXACT followed by an UPTO. */
3729
3730      else
3731        {
3732        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3733        PUT2INC(code, 0, repeat_min);
3734
3735        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3736        we have to insert the character for the previous code. For a repeated
3737        Unicode property match, there are two extra bytes that define the
3738        required property. In UTF-8 mode, long characters have their length in
3739        c, with the 0x80 bit as a flag. */
3740
3741        if (repeat_max < 0)
3742          {
3743#ifdef SUPPORT_UTF8
3744          if (utf8 && c >= 128)
3745            {
3746            memcpy(code, utf8_char, c & 7);
3747            code += c & 7;
3748            }
3749          else
3750#endif
3751            {
3752            *code++ = c;
3753            if (prop_type >= 0)
3754              {
3755              *code++ = prop_type;
3756              *code++ = prop_value;
3757              }
3758            }
3759          *code++ = OP_STAR + repeat_type;
3760          }
3761
3762        /* Else insert an UPTO if the max is greater than the min, again
3763        preceded by the character, for the previously inserted code. If the
3764        UPTO is just for 1 instance, we can use QUERY instead. */
3765
3766        else if (repeat_max != repeat_min)
3767          {
3768#ifdef SUPPORT_UTF8
3769          if (utf8 && c >= 128)
3770            {
3771            memcpy(code, utf8_char, c & 7);
3772            code += c & 7;
3773            }
3774          else
3775#endif
3776          *code++ = c;
3777          if (prop_type >= 0)
3778            {
3779            *code++ = prop_type;
3780            *code++ = prop_value;
3781            }
3782          repeat_max -= repeat_min;
3783
3784          if (repeat_max == 1)
3785            {
3786            *code++ = OP_QUERY + repeat_type;
3787            }
3788          else
3789            {
3790            *code++ = OP_UPTO + repeat_type;
3791            PUT2INC(code, 0, repeat_max);
3792            }
3793          }
3794        }
3795
3796      /* The character or character type itself comes last in all cases. */
3797
3798#ifdef SUPPORT_UTF8
3799      if (utf8 && c >= 128)
3800        {
3801        memcpy(code, utf8_char, c & 7);
3802        code += c & 7;
3803        }
3804      else
3805#endif
3806      *code++ = c;
3807
3808      /* For a repeated Unicode property match, there are two extra bytes that
3809      define the required property. */
3810
3811#ifdef SUPPORT_UCP
3812      if (prop_type >= 0)
3813        {
3814        *code++ = prop_type;
3815        *code++ = prop_value;
3816        }
3817#endif
3818      }
3819
3820    /* If previous was a character class or a back reference, we put the repeat
3821    stuff after it, but just skip the item if the repeat was {0,0}. */
3822
3823    else if (*previous == OP_CLASS ||
3824             *previous == OP_NCLASS ||
3825#ifdef SUPPORT_UTF8
3826             *previous == OP_XCLASS ||
3827#endif
3828             *previous == OP_REF)
3829      {
3830      if (repeat_max == 0)
3831        {
3832        code = previous;
3833        goto END_REPEAT;
3834        }
3835
3836      /* All real repeats make it impossible to handle partial matching (maybe
3837      one day we will be able to remove this restriction). */
3838
3839      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3840
3841      if (repeat_min == 0 && repeat_max == -1)
3842        *code++ = OP_CRSTAR + repeat_type;
3843      else if (repeat_min == 1 && repeat_max == -1)
3844        *code++ = OP_CRPLUS + repeat_type;
3845      else if (repeat_min == 0 && repeat_max == 1)
3846        *code++ = OP_CRQUERY + repeat_type;
3847      else
3848        {
3849        *code++ = OP_CRRANGE + repeat_type;
3850        PUT2INC(code, 0, repeat_min);
3851        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3852        PUT2INC(code, 0, repeat_max);
3853        }
3854      }
3855
3856    /* If previous was a bracket group, we may have to replicate it in certain
3857    cases. */
3858
3859    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3860             *previous == OP_ONCE || *previous == OP_COND)
3861      {
3862      register int i;
3863      int ketoffset = 0;
3864      int len = code - previous;
3865      uschar *bralink = NULL;
3866
3867      /* Repeating a DEFINE group is pointless */
3868
3869      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3870        {
3871        *errorcodeptr = ERR55;
3872        goto FAILED;
3873        }
3874
3875      /* If the maximum repeat count is unlimited, find the end of the bracket
3876      by scanning through from the start, and compute the offset back to it
3877      from the current code pointer. There may be an OP_OPT setting following
3878      the final KET, so we can't find the end just by going back from the code
3879      pointer. */
3880
3881      if (repeat_max == -1)
3882        {
3883        register uschar *ket = previous;
3884        do ket += GET(ket, 1); while (*ket != OP_KET);
3885        ketoffset = code - ket;
3886        }
3887
3888      /* The case of a zero minimum is special because of the need to stick
3889      OP_BRAZERO in front of it, and because the group appears once in the
3890      data, whereas in other cases it appears the minimum number of times. For
3891      this reason, it is simplest to treat this case separately, as otherwise
3892      the code gets far too messy. There are several special subcases when the
3893      minimum is zero. */
3894
3895      if (repeat_min == 0)
3896        {
3897        /* If the maximum is also zero, we used to just omit the group from the
3898        output altogether, like this:
3899
3900        ** if (repeat_max == 0)
3901        **   {
3902        **   code = previous;
3903        **   goto END_REPEAT;
3904        **   }
3905
3906        However, that fails when a group is referenced as a subroutine from
3907        elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3908        so that it is skipped on execution. As we don't have a list of which
3909        groups are referenced, we cannot do this selectively.
3910
3911        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3912        and do no more at this point. However, we do need to adjust any
3913        OP_RECURSE calls inside the group that refer to the group itself or any
3914        internal or forward referenced group, because the offset is from the
3915        start of the whole regex. Temporarily terminate the pattern while doing
3916        this. */
3917
3918        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
3919          {
3920          *code = OP_END;
3921          adjust_recurse(previous, 1, utf8, cd, save_hwm);
3922          memmove(previous+1, previous, len);
3923          code++;
3924          if (repeat_max == 0)
3925            {
3926            *previous++ = OP_SKIPZERO;
3927            goto END_REPEAT;
3928            }
3929          *previous++ = OP_BRAZERO + repeat_type;
3930          }
3931
3932        /* If the maximum is greater than 1 and limited, we have to replicate
3933        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3934        The first one has to be handled carefully because it's the original
3935        copy, which has to be moved up. The remainder can be handled by code
3936        that is common with the non-zero minimum case below. We have to
3937          adjust the value of repeat_max, since one less copy is required. Once
3938        again, we may have to adjust any OP_RECURSE calls inside the group. */
3939
3940        else
3941          {
3942          int offset;
3943          *code = OP_END;
3944          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3945          memmove(previous + 2 + LINK_SIZE, previous, len);
3946          code += 2 + LINK_SIZE;
3947          *previous++ = OP_BRAZERO + repeat_type;
3948          *previous++ = OP_BRA;
3949
3950          /* We chain together the bracket offset fields that have to be
3951          filled in later when the ends of the brackets are reached. */
3952
3953          offset = (bralink == NULL)? 0 : previous - bralink;
3954          bralink = previous;
3955          PUTINC(previous, 0, offset);
3956          }
3957
3958        repeat_max--;
3959        }
3960
3961      /* If the minimum is greater than zero, replicate the group as many
3962      times as necessary, and adjust the maximum to the number of subsequent
3963      copies that we need. If we set a first char from the group, and didn't
3964      set a required char, copy the latter from the former. If there are any
3965      forward reference subroutine calls in the group, there will be entries on
3966      the workspace list; replicate these with an appropriate increment. */
3967
3968      else
3969        {
3970        if (repeat_min > 1)
3971          {
3972          /* In the pre-compile phase, we don't actually do the replication. We
3973          just adjust the length as if we had. Do some paranoid checks for
3974          potential integer overflow. */
3975
3976          if (lengthptr != NULL)
3977            {
3978            int delta = (repeat_min - 1)*length_prevgroup;
3979            if ((double)(repeat_min - 1)*(double)length_prevgroup >
3980                                                            (double)INT_MAX ||
3981                OFLOW_MAX - *lengthptr < delta)
3982              {
3983              *errorcodeptr = ERR20;
3984              goto FAILED;
3985              }
3986            *lengthptr += delta;
3987            }
3988
3989          /* This is compiling for real */
3990
3991          else
3992            {
3993            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3994            for (i = 1; i < repeat_min; i++)
3995              {
3996              uschar *hc;
3997              uschar *this_hwm = cd->hwm;
3998              memcpy(code, previous, len);
3999              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4000                {
4001                PUT(cd->hwm, 0, GET(hc, 0) + len);
4002                cd->hwm += LINK_SIZE;
4003                }
4004              save_hwm = this_hwm;
4005              code += len;
4006              }
4007            }
4008          }
4009
4010        if (repeat_max > 0) repeat_max -= repeat_min;
4011        }
4012
4013      /* This code is common to both the zero and non-zero minimum cases. If
4014      the maximum is limited, it replicates the group in a nested fashion,
4015      remembering the bracket starts on a stack. In the case of a zero minimum,
4016      the first one was set up above. In all cases the repeat_max now specifies
4017      the number of additional copies needed. Again, we must remember to
4018      replicate entries on the forward reference list. */
4019
4020      if (repeat_max >= 0)
4021        {
4022        /* In the pre-compile phase, we don't actually do the replication. We
4023        just adjust the length as if we had. For each repetition we must add 1
4024        to the length for BRAZERO and for all but the last repetition we must
4025        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4026        paranoid checks to avoid integer overflow. */
4027
4028        if (lengthptr != NULL && repeat_max > 0)
4029          {
4030          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4031                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4032          if ((double)repeat_max *
4033                (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4034                  > (double)INT_MAX ||
4035              OFLOW_MAX - *lengthptr < delta)
4036            {
4037            *errorcodeptr = ERR20;
4038            goto FAILED;
4039            }
4040          *lengthptr += delta;
4041          }
4042
4043        /* This is compiling for real */
4044
4045        else for (i = repeat_max - 1; i >= 0; i--)
4046          {
4047          uschar *hc;
4048          uschar *this_hwm = cd->hwm;
4049
4050          *code++ = OP_BRAZERO + repeat_type;
4051
4052          /* All but the final copy start a new nesting, maintaining the
4053          chain of brackets outstanding. */
4054
4055          if (i != 0)
4056            {
4057            int offset;
4058            *code++ = OP_BRA;
4059            offset = (bralink == NULL)? 0 : code - bralink;
4060            bralink = code;
4061            PUTINC(code, 0, offset);
4062            }
4063
4064          memcpy(code, previous, len);
4065          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4066            {
4067            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4068            cd->hwm += LINK_SIZE;
4069            }
4070          save_hwm = this_hwm;
4071          code += len;
4072          }
4073
4074        /* Now chain through the pending brackets, and fill in their length
4075        fields (which are holding the chain links pro tem). */
4076
4077        while (bralink != NULL)
4078          {
4079          int oldlinkoffset;
4080          int offset = code - bralink + 1;
4081          uschar *bra = code - offset;
4082          oldlinkoffset = GET(bra, 1);
4083          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4084          *code++ = OP_KET;
4085          PUTINC(code, 0, offset);
4086          PUT(bra, 1, offset);
4087          }
4088        }
4089
4090      /* If the maximum is unlimited, set a repeater in the final copy. We
4091      can't just offset backwards from the current code point, because we
4092      don't know if there's been an options resetting after the ket. The
4093      correct offset was computed above.
4094
4095      Then, when we are doing the actual compile phase, check to see whether
4096      this group is a non-atomic one that could match an empty string. If so,
4097      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4098      that runtime checking can be done. [This check is also applied to
4099      atomic groups at runtime, but in a different way.] */
4100
4101      else
4102        {
4103        uschar *ketcode = code - ketoffset;
4104        uschar *bracode = ketcode - GET(ketcode, 1);
4105        *ketcode = OP_KETRMAX + repeat_type;
4106        if (lengthptr == NULL && *bracode != OP_ONCE)
4107          {
4108          uschar *scode = bracode;
4109          do
4110            {
4111            if (could_be_empty_branch(scode, ketcode, utf8))
4112              {
4113              *bracode += OP_SBRA - OP_BRA;
4114              break;
4115              }
4116            scode += GET(scode, 1);
4117            }
4118          while (*scode == OP_ALT);
4119          }
4120        }
4121      }
4122
4123    /* If previous is OP_FAIL, it was generated by an empty class [] in
4124    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4125    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4126    error above. We can just ignore the repeat in JS case. */
4127
4128    else if (*previous == OP_FAIL) goto END_REPEAT;
4129
4130    /* Else there's some kind of shambles */
4131
4132    else
4133      {
4134      *errorcodeptr = ERR11;
4135      goto FAILED;
4136      }
4137
4138    /* If the character following a repeat is '+', or if certain optimization
4139    tests above succeeded, possessive_quantifier is TRUE. For some of the
4140    simpler opcodes, there is a special alternative opcode for this. For
4141    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4142    The '+' notation is just syntactic sugar, taken from Sun's Java package,
4143    but the special opcodes can optimize it a bit. The repeated item starts at
4144    tempcode, not at previous, which might be the first part of a string whose
4145    (former) last char we repeated.
4146
4147    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4148    an 'upto' may follow. We skip over an 'exact' item, and then test the
4149    length of what remains before proceeding. */
4150
4151    if (possessive_quantifier)
4152      {
4153      int len;
4154      if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4155          *tempcode == OP_NOTEXACT)
4156        tempcode += _pcre_OP_lengths[*tempcode] +
4157          ((*tempcode == OP_TYPEEXACT &&
4158             (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4159      len = code - tempcode;
4160      if (len > 0) switch (*tempcode)
4161        {
4162        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4163        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4164        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4165        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4166
4167        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4168        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4169        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4170        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4171
4172        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4173        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4174        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4175        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4176
4177        default:
4178        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4179        code += 1 + LINK_SIZE;
4180        len += 1 + LINK_SIZE;
4181        tempcode[0] = OP_ONCE;
4182        *code++ = OP_KET;
4183        PUTINC(code, 0, len);
4184        PUT(tempcode, 1, len);
4185        break;
4186        }
4187      }
4188
4189    /* In all cases we no longer have a previous item. We also set the
4190    "follows varying string" flag for subsequently encountered reqbytes if
4191    it isn't already set and we have just passed a varying length item. */
4192
4193    END_REPEAT:
4194    previous = NULL;
4195    cd->req_varyopt |= reqvary;
4196    break;
4197
4198
4199    /* ===================================================================*/
4200    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4201    lookbehind or option setting or condition or all the other extended
4202    parenthesis forms.  */
4203
4204    case '(':
4205    newoptions = options;
4206    skipbytes = 0;
4207    bravalue = OP_CBRA;
4208    save_hwm = cd->hwm;
4209    reset_bracount = FALSE;
4210
4211    /* First deal with various "verbs" that can be introduced by '*'. */
4212
4213    if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4214      {
4215      int i, namelen;
4216      const char *vn = verbnames;
4217      const uschar *name = ++ptr;
4218      previous = NULL;
4219      while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4220      if (*ptr == ':')
4221        {
4222        *errorcodeptr = ERR59;   /* Not supported */
4223        goto FAILED;
4224        }
4225      if (*ptr != ')')
4226        {
4227        *errorcodeptr = ERR60;
4228        goto FAILED;
4229        }
4230      namelen = ptr - name;
4231      for (i = 0; i < verbcount; i++)
4232        {
4233        if (namelen == verbs[i].len &&
4234            strncmp((char *)name, vn, namelen) == 0)
4235          {
4236          *code = verbs[i].op;
4237          if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4238          break;
4239          }
4240        vn += verbs[i].len + 1;
4241        }
4242      if (i < verbcount) continue;
4243      *errorcodeptr = ERR60;
4244      goto FAILED;
4245      }
4246
4247    /* Deal with the extended parentheses; all are introduced by '?', and the
4248    appearance of any of them means that this is not a capturing group. */
4249
4250    else if (*ptr == '?')
4251      {
4252      int i, set, unset, namelen;
4253      int *optset;
4254      const uschar *name;
4255      uschar *slot;
4256
4257      switch (*(++ptr))
4258        {
4259        case '#':                 /* Comment; skip to ket */
4260        ptr++;
4261        while (*ptr != 0 && *ptr != ')') ptr++;
4262        if (*ptr == 0)
4263          {
4264          *errorcodeptr = ERR18;
4265          goto FAILED;
4266          }
4267        continue;
4268
4269
4270        /* ------------------------------------------------------------ */
4271        case '|':                 /* Reset capture count for each branch */
4272        reset_bracount = TRUE;
4273        /* Fall through */
4274
4275        /* ------------------------------------------------------------ */
4276        case ':':                 /* Non-capturing bracket */
4277        bravalue = OP_BRA;
4278        ptr++;
4279        break;
4280
4281
4282        /* ------------------------------------------------------------ */
4283        case '(':
4284        bravalue = OP_COND;       /* Conditional group */
4285
4286        /* A condition can be an assertion, a number (referring to a numbered
4287        group), a name (referring to a named group), or 'R', referring to
4288        recursion. R<digits> and R&name are also permitted for recursion tests.
4289
4290        There are several syntaxes for testing a named group: (?(name)) is used
4291        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4292
4293        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4294        be the recursive thing or the name 'R' (and similarly for 'R' followed
4295        by digits), and (b) a number could be a name that consists of digits.
4296        In both cases, we look for a name first; if not found, we try the other
4297        cases. */
4298
4299        /* For conditions that are assertions, check the syntax, and then exit
4300        the switch. This will take control down to where bracketed groups,
4301        including assertions, are processed. */
4302
4303        if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4304          break;
4305
4306        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4307        below), and all need to skip 3 bytes at the start of the group. */
4308
4309        code[1+LINK_SIZE] = OP_CREF;
4310        skipbytes = 3;
4311        refsign = -1;
4312
4313        /* Check for a test for recursion in a named group. */
4314
4315        if (ptr[1] == 'R' && ptr[2] == '&')
4316          {
4317          terminator = -1;
4318          ptr += 2;
4319          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4320          }
4321
4322        /* Check for a test for a named group's having been set, using the Perl
4323        syntax (?(<name>) or (?('name') */
4324
4325        else if (ptr[1] == '<')
4326          {
4327          terminator = '>';
4328          ptr++;
4329          }
4330        else if (ptr[1] == '\'')
4331          {
4332          terminator = '\'';
4333          ptr++;
4334          }
4335        else
4336          {
4337          terminator = 0;
4338          if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4339          }
4340
4341        /* We now expect to read a name; anything else is an error */
4342
4343        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4344          {
4345          ptr += 1;  /* To get the right offset */
4346          *errorcodeptr = ERR28;
4347          goto FAILED;
4348          }
4349
4350        /* Read the name, but also get it as a number if it's all digits */
4351
4352        recno = 0;
4353        name = ++ptr;
4354        while ((cd->ctypes[*ptr] & ctype_word) != 0)
4355          {
4356          if (recno >= 0)
4357            recno = ((digitab[*ptr] & ctype_digit) != 0)?
4358              recno * 10 + *ptr - '0' : -1;
4359          ptr++;
4360          }
4361        namelen = ptr - name;
4362
4363        if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4364          {
4365          ptr--;      /* Error offset */
4366          *errorcodeptr = ERR26;
4367          goto FAILED;
4368          }
4369
4370        /* Do no further checking in the pre-compile phase. */
4371
4372        if (lengthptr != NULL) break;
4373
4374        /* In the real compile we do the work of looking for the actual
4375        reference. If the string started with "+" or "-" we require the rest to
4376        be digits, in which case recno will be set. */
4377
4378        if (refsign > 0)
4379          {
4380          if (recno <= 0)
4381            {
4382            *errorcodeptr = ERR58;
4383            goto FAILED;
4384            }
4385          recno = (refsign == '-')?
4386            cd->bracount - recno + 1 : recno +cd->bracount;
4387          if (recno <= 0 || recno > cd->final_bracount)
4388            {
4389            *errorcodeptr = ERR15;
4390            goto FAILED;
4391            }
4392          PUT2(code, 2+LINK_SIZE, recno);
4393          break;
4394          }
4395
4396        /* Otherwise (did not start with "+" or "-"), start by looking for the
4397        name. */
4398
4399        slot = cd->name_table;
4400        for (i = 0; i < cd->names_found; i++)
4401          {
4402          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4403          slot += cd->name_entry_size;
4404          }
4405
4406        /* Found a previous named subpattern */
4407
4408        if (i < cd->names_found)
4409          {
4410          recno = GET2(slot, 0);
4411          PUT2(code, 2+LINK_SIZE, recno);
4412          }
4413
4414        /* Search the pattern for a forward reference */
4415
4416        else if ((i = find_parens(ptr, cd, name, namelen,
4417                        (options & PCRE_EXTENDED) != 0)) > 0)
4418          {
4419          PUT2(code, 2+LINK_SIZE, i);
4420          }
4421
4422        /* If terminator == 0 it means that the name followed directly after
4423        the opening parenthesis [e.g. (?(abc)...] and in this case there are
4424        some further alternatives to try. For the cases where terminator != 0
4425        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4426        now checked all the possibilities, so give an error. */
4427
4428        else if (terminator != 0)
4429          {
4430          *errorcodeptr = ERR15;
4431          goto FAILED;
4432          }
4433
4434        /* Check for (?(R) for recursion. Allow digits after R to specify a
4435        specific group number. */
4436
4437        else if (*name == 'R')
4438          {
4439          recno = 0;
4440          for (i = 1; i < namelen; i++)
4441            {
4442            if ((digitab[name[i]] & ctype_digit) == 0)
4443              {
4444              *errorcodeptr = ERR15;
4445              goto FAILED;
4446              }
4447            recno = recno * 10 + name[i] - '0';
4448            }
4449          if (recno == 0) recno = RREF_ANY;
4450          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4451          PUT2(code, 2+LINK_SIZE, recno);
4452          }
4453
4454        /* Similarly, check for the (?(DEFINE) "condition", which is always
4455        false. */
4456
4457        else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4458          {
4459          code[1+LINK_SIZE] = OP_DEF;
4460          skipbytes = 1;
4461          }
4462
4463        /* Check for the "name" actually being a subpattern number. We are
4464        in the second pass here, so final_bracount is set. */
4465
4466        else if (recno > 0 && recno <= cd->final_bracount)
4467          {
4468          PUT2(code, 2+LINK_SIZE, recno);
4469          }
4470
4471        /* Either an unidentified subpattern, or a reference to (?(0) */
4472
4473        else
4474          {
4475          *errorcodeptr = (recno == 0)? ERR35: ERR15;
4476          goto FAILED;
4477          }
4478        break;
4479
4480
4481        /* ------------------------------------------------------------ */
4482        case '=':                 /* Positive lookahead */
4483        bravalue = OP_ASSERT;
4484        ptr++;
4485        break;
4486
4487
4488        /* ------------------------------------------------------------ */
4489        case '!':                 /* Negative lookahead */
4490        ptr++;
4491        if (*ptr == ')')          /* Optimize (?!) */
4492          {
4493          *code++ = OP_FAIL;
4494          previous = NULL;
4495          continue;
4496          }
4497        bravalue = OP_ASSERT_NOT;
4498        break;
4499
4500
4501        /* ------------------------------------------------------------ */
4502        case '<':                 /* Lookbehind or named define */
4503        switch (ptr[1])
4504          {
4505          case '=':               /* Positive lookbehind */
4506          bravalue = OP_ASSERTBACK;
4507          ptr += 2;
4508          break;
4509
4510          case '!':               /* Negative lookbehind */
4511          bravalue = OP_ASSERTBACK_NOT;
4512          ptr += 2;
4513          break;
4514
4515          default:                /* Could be name define, else bad */
4516          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4517          ptr++;                  /* Correct offset for error */
4518          *errorcodeptr = ERR24;
4519          goto FAILED;
4520          }
4521        break;
4522
4523
4524        /* ------------------------------------------------------------ */
4525        case '>':                 /* One-time brackets */
4526        bravalue = OP_ONCE;
4527        ptr++;
4528        break;
4529
4530
4531        /* ------------------------------------------------------------ */
4532        case 'C':                 /* Callout - may be followed by digits; */
4533        previous_callout = code;  /* Save for later completion */
4534        after_manual_callout = 1; /* Skip one item before completing */
4535        *code++ = OP_CALLOUT;
4536          {
4537          int n = 0;
4538          while ((digitab[*(++ptr)] & ctype_digit) != 0)
4539            n = n * 10 + *ptr - '0';
4540          if (*ptr != ')')
4541            {
4542            *errorcodeptr = ERR39;
4543            goto FAILED;
4544            }
4545          if (n > 255)
4546            {
4547            *errorcodeptr = ERR38;
4548            goto FAILED;
4549            }
4550          *code++ = n;
4551          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4552          PUT(code, LINK_SIZE, 0);                    /* Default length */
4553          code += 2 * LINK_SIZE;
4554          }
4555        previous = NULL;
4556        continue;
4557
4558
4559        /* ------------------------------------------------------------ */
4560        case 'P':                 /* Python-style named subpattern handling */
4561        if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4562          {
4563          is_recurse = *ptr == '>';
4564          terminator = ')';
4565          goto NAMED_REF_OR_RECURSE;
4566          }
4567        else if (*ptr != '<')    /* Test for Python-style definition */
4568          {
4569          *errorcodeptr = ERR41;
4570          goto FAILED;
4571          }
4572        /* Fall through to handle (?P< as (?< is handled */
4573
4574
4575        /* ------------------------------------------------------------ */
4576        DEFINE_NAME:    /* Come here from (?< handling */
4577        case '\'':
4578          {
4579          terminator = (*ptr == '<')? '>' : '\'';
4580          name = ++ptr;
4581
4582          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4583          namelen = ptr - name;
4584
4585          /* In the pre-compile phase, just do a syntax check. */
4586
4587          if (lengthptr != NULL)
4588            {
4589            if (*ptr != terminator)
4590              {
4591              *errorcodeptr = ERR42;
4592              goto FAILED;
4593              }
4594            if (cd->names_found >= MAX_NAME_COUNT)
4595              {
4596              *errorcodeptr = ERR49;
4597              goto FAILED;
4598              }
4599            if (namelen + 3 > cd->name_entry_size)
4600              {
4601              cd->name_entry_size = namelen + 3;
4602              if (namelen > MAX_NAME_SIZE)
4603                {
4604                *errorcodeptr = ERR48;
4605                goto FAILED;
4606                }
4607              }
4608            }
4609
4610          /* In the real compile, create the entry in the table */
4611
4612          else
4613            {
4614            slot = cd->name_table;
4615            for (i = 0; i < cd->names_found; i++)
4616              {
4617              int crc = memcmp(name, slot+2, namelen);
4618              if (crc == 0)
4619                {
4620                if (slot[2+namelen] == 0)
4621                  {
4622                  if ((options & PCRE_DUPNAMES) == 0)
4623                    {
4624                    *errorcodeptr = ERR43;
4625                    goto FAILED;
4626                    }
4627                  }
4628                else crc = -1;      /* Current name is substring */
4629                }
4630              if (crc < 0)
4631                {
4632                memmove(slot + cd->name_entry_size, slot,
4633                  (cd->names_found - i) * cd->name_entry_size);
4634                break;
4635                }
4636              slot += cd->name_entry_size;
4637              }
4638
4639            PUT2(slot, 0, cd->bracount + 1);
4640            memcpy(slot + 2, name, namelen);
4641            slot[2+namelen] = 0;
4642            }
4643          }
4644
4645        /* In both cases, count the number of names we've encountered. */
4646
4647        ptr++;                    /* Move past > or ' */
4648        cd->names_found++;
4649        goto NUMBERED_GROUP;
4650
4651
4652        /* ------------------------------------------------------------ */
4653        case '&':                 /* Perl recursion/subroutine syntax */
4654        terminator = ')';
4655        is_recurse = TRUE;
4656        /* Fall through */
4657
4658        /* We come here from the Python syntax above that handles both
4659        references (?P=name) and recursion (?P>name), as well as falling
4660        through from the Perl recursion syntax (?&name). We also come here from
4661        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4662        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4663
4664        NAMED_REF_OR_RECURSE:
4665        name = ++ptr;
4666        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4667        namelen = ptr - name;
4668
4669        /* In the pre-compile phase, do a syntax check and set a dummy
4670        reference number. */
4671
4672        if (lengthptr != NULL)
4673          {
4674          if (namelen == 0)
4675            {
4676            *errorcodeptr = ERR62;
4677            goto FAILED;
4678            }
4679          if (*ptr != terminator)
4680            {
4681            *errorcodeptr = ERR42;
4682            goto FAILED;
4683            }
4684          if (namelen > MAX_NAME_SIZE)
4685            {
4686            *errorcodeptr = ERR48;
4687            goto FAILED;
4688            }
4689          recno = 0;
4690          }
4691
4692        /* In the real compile, seek the name in the table. We check the name
4693        first, and then check that we have reached the end of the name in the
4694        table. That way, if the name is longer than any in the table,
4695        the comparison will fail without reading beyond the table entry. */
4696
4697        else
4698          {
4699          slot = cd->name_table;
4700          for (i = 0; i < cd->names_found; i++)
4701            {
4702            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4703                slot[2+namelen] == 0)
4704              break;
4705            slot += cd->name_entry_size;
4706            }
4707
4708          if (i < cd->names_found)         /* Back reference */
4709            {
4710            recno = GET2(slot, 0);
4711            }
4712          else if ((recno =                /* Forward back reference */
4713                    find_parens(ptr, cd, name, namelen,
4714                      (options & PCRE_EXTENDED) != 0)) <= 0)
4715            {
4716            *errorcodeptr = ERR15;
4717            goto FAILED;
4718            }
4719          }
4720
4721        /* In both phases, we can now go to the code that handles numerical
4722        recursion or backreferences. */
4723
4724        if (is_recurse) goto HANDLE_RECURSION;
4725          else goto HANDLE_REFERENCE;
4726
4727
4728        /* ------------------------------------------------------------ */
4729        case 'R':                 /* Recursion */
4730        ptr++;                    /* Same as (?0)      */
4731        /* Fall through */
4732
4733
4734        /* ------------------------------------------------------------ */
4735        case '-': case '+':
4736        case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4737        case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4738          {
4739          const uschar *called;
4740          terminator = ')';
4741
4742          /* Come here from the \g<...> and \g'...' code (Oniguruma
4743          compatibility). However, the syntax has been checked to ensure that
4744          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4745          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4746          ever be taken. */
4747
4748          HANDLE_NUMERICAL_RECURSION:
4749
4750          if ((refsign = *ptr) == '+')
4751            {
4752            ptr++;
4753            if ((digitab[*ptr] & ctype_digit) == 0)
4754              {
4755              *errorcodeptr = ERR63;
4756              goto FAILED;
4757              }
4758            }
4759          else if (refsign == '-')
4760            {
4761            if ((digitab[ptr[1]] & ctype_digit) == 0)
4762              goto OTHER_CHAR_AFTER_QUERY;
4763            ptr++;
4764            }
4765
4766          recno = 0;
4767          while((digitab[*ptr] & ctype_digit) != 0)
4768            recno = recno * 10 + *ptr++ - '0';
4769
4770          if (*ptr != terminator)
4771            {
4772            *errorcodeptr = ERR29;
4773            goto FAILED;
4774            }
4775
4776          if (refsign == '-')
4777            {
4778            if (recno == 0)
4779              {
4780              *errorcodeptr = ERR58;
4781              goto FAILED;
4782              }
4783            recno = cd->bracount - recno + 1;
4784            if (recno <= 0)
4785              {
4786              *errorcodeptr = ERR15;
4787              goto FAILED;
4788              }
4789            }
4790          else if (refsign == '+')
4791            {
4792            if (recno == 0)
4793              {
4794              *errorcodeptr = ERR58;
4795              goto FAILED;
4796              }
4797            recno += cd->bracount;
4798            }
4799
4800          /* Come here from code above that handles a named recursion */
4801
4802          HANDLE_RECURSION:
4803
4804          previous = code;
4805          called = cd->start_code;
4806
4807          /* When we are actually compiling, find the bracket that is being
4808          referenced. Temporarily end the regex in case it doesn't exist before
4809          this point. If we end up with a forward reference, first check that
4810          the bracket does occur later so we can give the error (and position)
4811          now. Then remember this forward reference in the workspace so it can
4812          be filled in at the end. */
4813
4814          if (lengthptr == NULL)
4815            {
4816            *code = OP_END;
4817            if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4818
4819            /* Forward reference */
4820
4821            if (called == NULL)
4822              {
4823              if (find_parens(ptr, cd, NULL, recno,
4824                    (options & PCRE_EXTENDED) != 0) < 0)
4825                {
4826                *errorcodeptr = ERR15;
4827                goto FAILED;
4828                }
4829              called = cd->start_code + recno;
4830              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4831              }
4832
4833            /* If not a forward reference, and the subpattern is still open,
4834            this is a recursive call. We check to see if this is a left
4835            recursion that could loop for ever, and diagnose that case. */
4836
4837            else if (GET(called, 1) == 0 &&
4838                     could_be_empty(called, code, bcptr, utf8))
4839              {
4840              *errorcodeptr = ERR40;
4841              goto FAILED;
4842              }
4843            }
4844
4845          /* Insert the recursion/subroutine item, automatically wrapped inside
4846          "once" brackets. Set up a "previous group" length so that a
4847          subsequent quantifier will work. */
4848
4849          *code = OP_ONCE;
4850          PUT(code, 1, 2 + 2*LINK_SIZE);
4851          code += 1 + LINK_SIZE;
4852
4853          *code = OP_RECURSE;
4854          PUT(code, 1, called - cd->start_code);
4855          code += 1 + LINK_SIZE;
4856
4857          *code = OP_KET;
4858          PUT(code, 1, 2 + 2*LINK_SIZE);
4859          code += 1 + LINK_SIZE;
4860
4861          length_prevgroup = 3 + 3*LINK_SIZE;
4862          }
4863
4864        /* Can't determine a first byte now */
4865
4866        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4867        continue;
4868
4869
4870        /* ------------------------------------------------------------ */
4871        default:              /* Other characters: check option setting */
4872        OTHER_CHAR_AFTER_QUERY:
4873        set = unset = 0;
4874        optset = &set;
4875
4876        while (*ptr != ')' && *ptr != ':')
4877          {
4878          switch (*ptr++)
4879            {
4880            case '-': optset = &unset; break;
4881
4882            case 'J':    /* Record that it changed in the external options */
4883            *optset |= PCRE_DUPNAMES;
4884            cd->external_flags |= PCRE_JCHANGED;
4885            break;
4886
4887            case 'i': *optset |= PCRE_CASELESS; break;
4888            case 'm': *optset |= PCRE_MULTILINE; break;
4889            case 's': *optset |= PCRE_DOTALL; break;
4890            case 'x': *optset |= PCRE_EXTENDED; break;
4891            case 'U': *optset |= PCRE_UNGREEDY; break;
4892            case 'X': *optset |= PCRE_EXTRA; break;
4893
4894            default:  *errorcodeptr = ERR12;
4895                      ptr--;    /* Correct the offset */
4896                      goto FAILED;
4897            }
4898          }
4899
4900        /* Set up the changed option bits, but don't change anything yet. */
4901
4902        newoptions = (options | set) & (~unset);
4903
4904        /* If the options ended with ')' this is not the start of a nested
4905        group with option changes, so the options change at this level. If this
4906        item is right at the start of the pattern, the options can be
4907        abstracted and made external in the pre-compile phase, and ignored in
4908        the compile phase. This can be helpful when matching -- for instance in
4909        caseless checking of required bytes.
4910
4911        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4912        definitely *not* at the start of the pattern because something has been
4913        compiled. In the pre-compile phase, however, the code pointer can have
4914        that value after the start, because it gets reset as code is discarded
4915        during the pre-compile. However, this can happen only at top level - if
4916        we are within parentheses, the starting BRA will still be present. At
4917        any parenthesis level, the length value can be used to test if anything
4918        has been compiled at that level. Thus, a test for both these conditions
4919        is necessary to ensure we correctly detect the start of the pattern in
4920        both phases.
4921
4922        If we are not at the pattern start, compile code to change the ims
4923        options if this setting actually changes any of them. We also pass the
4924        new setting back so that it can be put at the start of any following
4925        branches, and when this group ends (if we are in a group), a resetting
4926        item can be compiled. */
4927
4928        if (*ptr == ')')
4929          {
4930          if (code == cd->start_code + 1 + LINK_SIZE &&
4931               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4932            {
4933            cd->external_options = newoptions;
4934            options = newoptions;
4935            }
4936         else
4937            {
4938            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4939              {
4940              *code++ = OP_OPT;
4941              *code++ = newoptions & PCRE_IMS;
4942              }
4943
4944            /* Change options at this level, and pass them back for use
4945            in subsequent branches. Reset the greedy defaults and the case
4946            value for firstbyte and reqbyte. */
4947
4948            *optionsptr = options = newoptions;
4949            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4950            greedy_non_default = greedy_default ^ 1;
4951            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4952            }
4953
4954          previous = NULL;       /* This item can't be repeated */
4955          continue;              /* It is complete */
4956          }
4957
4958        /* If the options ended with ':' we are heading into a nested group
4959        with possible change of options. Such groups are non-capturing and are
4960        not assertions of any kind. All we need to do is skip over the ':';
4961        the newoptions value is handled below. */
4962
4963        bravalue = OP_BRA;
4964        ptr++;
4965        }     /* End of switch for character following (? */
4966      }       /* End of (? handling */
4967
4968    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4969    all unadorned brackets become non-capturing and behave like (?:...)
4970    brackets. */
4971
4972    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4973      {
4974      bravalue = OP_BRA;
4975      }
4976
4977    /* Else we have a capturing group. */
4978
4979    else
4980      {
4981      NUMBERED_GROUP:
4982      cd->bracount += 1;
4983      PUT2(code, 1+LINK_SIZE, cd->bracount);
4984      skipbytes = 2;
4985      }
4986
4987    /* Process nested bracketed regex. Assertions may not be repeated, but
4988    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4989    non-register variable in order to be able to pass its address because some
4990    compilers complain otherwise. Pass in a new setting for the ims options if
4991    they have changed. */
4992
4993    previous = (bravalue >= OP_ONCE)? code : NULL;
4994    *code = bravalue;
4995    tempcode = code;
4996    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4997    length_prevgroup = 0;              /* Initialize for pre-compile phase */
4998
4999    if (!compile_regex(
5000         newoptions,                   /* The complete new option state */
5001         options & PCRE_IMS,           /* The previous ims option state */
5002         &tempcode,                    /* Where to put code (updated) */
5003         &ptr,                         /* Input pointer (updated) */
5004         errorcodeptr,                 /* Where to put an error message */
5005         (bravalue == OP_ASSERTBACK ||
5006          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5007         reset_bracount,               /* True if (?| group */
5008         skipbytes,                    /* Skip over bracket number */
5009         &subfirstbyte,                /* For possible first char */
5010         &subreqbyte,                  /* For possible last char */
5011         bcptr,                        /* Current branch chain */
5012         cd,                           /* Tables block */
5013         (lengthptr == NULL)? NULL :   /* Actual compile phase */
5014           &length_prevgroup           /* Pre-compile phase */
5015         ))
5016      goto FAILED;
5017
5018    /* At the end of compiling, code is still pointing to the start of the
5019    group, while tempcode has been updated to point past the end of the group
5020    and any option resetting that may follow it. The pattern pointer (ptr)
5021    is on the bracket. */
5022
5023    /* If this is a conditional bracket, check that there are no more than
5024    two branches in the group, or just one if it's a DEFINE group. We do this
5025    in the real compile phase, not in the pre-pass, where the whole group may
5026    not be available. */
5027
5028    if (bravalue == OP_COND && lengthptr == NULL)
5029      {
5030      uschar *tc = code;
5031      int condcount = 0;
5032
5033      do {
5034         condcount++;
5035         tc += GET(tc,1);
5036         }
5037      while (*tc != OP_KET);
5038
5039      /* A DEFINE group is never obeyed inline (the "condition" is always
5040      false). It must have only one branch. */
5041
5042      if (code[LINK_SIZE+1] == OP_DEF)
5043        {
5044        if (condcount > 1)
5045          {
5046          *errorcodeptr = ERR54;
5047          goto FAILED;
5048          }
5049        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5050        }
5051
5052      /* A "normal" conditional group. If there is just one branch, we must not
5053      make use of its firstbyte or reqbyte, because this is equivalent to an
5054      empty second branch. */
5055
5056      else
5057        {
5058        if (condcount > 2)
5059          {
5060          *errorcodeptr = ERR27;
5061          goto FAILED;
5062          }
5063        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5064        }
5065      }
5066
5067    /* Error if hit end of pattern */
5068
5069    if (*ptr != ')')
5070      {
5071      *errorcodeptr = ERR14;
5072      goto FAILED;
5073      }
5074
5075    /* In the pre-compile phase, update the length by the length of the group,
5076    less the brackets at either end. Then reduce the compiled code to just a
5077    set of non-capturing brackets so that it doesn't use much memory if it is
5078    duplicated by a quantifier.*/
5079
5080    if (lengthptr != NULL)
5081      {
5082      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5083        {
5084        *errorcodeptr = ERR20;
5085        goto FAILED;
5086        }
5087      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5088      *code++ = OP_BRA;
5089      PUTINC(code, 0, 1 + LINK_SIZE);
5090      *code++ = OP_KET;
5091      PUTINC(code, 0, 1 + LINK_SIZE);
5092      break;    /* No need to waste time with special character handling */
5093      }
5094
5095    /* Otherwise update the main code pointer to the end of the group. */
5096
5097    code = tempcode;
5098
5099    /* For a DEFINE group, required and first character settings are not
5100    relevant. */
5101
5102    if (bravalue == OP_DEF) break;
5103
5104    /* Handle updating of the required and first characters for other types of
5105    group. Update for normal brackets of all kinds, and conditions with two
5106    branches (see code above). If the bracket is followed by a quantifier with
5107    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5108    zerofirstbyte outside the main loop so that they can be accessed for the
5109    back off. */
5110
5111    zeroreqbyte = reqbyte;
5112    zerofirstbyte = firstbyte;
5113    groupsetfirstbyte = FALSE;
5114
5115    if (bravalue >= OP_ONCE)
5116      {
5117      /* If we have not yet set a firstbyte in this branch, take it from the
5118      subpattern, remembering that it was set here so that a repeat of more
5119      than one can replicate it as reqbyte if necessary. If the subpattern has
5120      no firstbyte, set "none" for the whole branch. In both cases, a zero
5121      repeat forces firstbyte to "none". */
5122
5123      if (firstbyte == REQ_UNSET)
5124        {
5125        if (subfirstbyte >= 0)
5126          {
5127          firstbyte = subfirstbyte;
5128          groupsetfirstbyte = TRUE;
5129          }
5130        else firstbyte = REQ_NONE;
5131        zerofirstbyte = REQ_NONE;
5132        }
5133
5134      /* If firstbyte was previously set, convert the subpattern's firstbyte
5135      into reqbyte if there wasn't one, using the vary flag that was in
5136      existence beforehand. */
5137
5138      else if (subfirstbyte >= 0 && subreqbyte < 0)
5139        subreqbyte = subfirstbyte | tempreqvary;
5140
5141      /* If the subpattern set a required byte (or set a first byte that isn't
5142      really the first byte - see above), set it. */
5143
5144      if (subreqbyte >= 0) reqbyte = subreqbyte;
5145      }
5146
5147    /* For a forward assertion, we take the reqbyte, if set. This can be
5148    helpful if the pattern that follows the assertion doesn't set a different
5149    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5150    for an assertion, however because it leads to incorrect effect for patterns
5151    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5152    of a firstbyte. This is overcome by a scan at the end if there's no
5153    firstbyte, looking for an asserted first char. */
5154
5155    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5156    break;     /* End of processing '(' */
5157
5158
5159    /* ===================================================================*/
5160    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5161    are arranged to be the negation of the corresponding OP_values. For the
5162    back references, the values are ESC_REF plus the reference number. Only
5163    back references and those types that consume a character may be repeated.
5164    We can test for values between ESC_b and ESC_Z for the latter; this may
5165    have to change if any new ones are ever created. */
5166
5167    case '\\':
5168    tempptr = ptr;
5169    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5170    if (*errorcodeptr != 0) goto FAILED;
5171
5172    if (c < 0)
5173      {
5174      if (-c == ESC_Q)            /* Handle start of quoted string */
5175        {
5176        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5177          else inescq = TRUE;
5178        continue;
5179        }
5180
5181      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5182
5183      /* For metasequences that actually match a character, we disable the
5184      setting of a first character if it hasn't already been set. */
5185
5186      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5187        firstbyte = REQ_NONE;
5188
5189      /* Set values to reset to if this is followed by a zero repeat. */
5190
5191      zerofirstbyte = firstbyte;
5192      zeroreqbyte = reqbyte;
5193
5194      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5195      is a subroutine call by number (Oniguruma syntax). In fact, the value
5196      -ESC_g is returned only for these cases. So we don't need to check for <
5197      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5198      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5199      that is a synonym for a named back reference). */
5200
5201      if (-c == ESC_g)
5202        {
5203        const uschar *p;
5204        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5205        terminator = (*(++ptr) == '<')? '>' : '\'';
5206
5207        /* These two statements stop the compiler from warning about possibly
5208        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5209        fact, because we actually check for a number below, the paths that
5210        would actually be in error are never taken. */
5211
5212        skipbytes = 0;
5213        reset_bracount = FALSE;
5214
5215        /* Test for a name */
5216
5217        if (ptr[1] != '+' && ptr[1] != '-')
5218          {
5219          BOOL isnumber = TRUE;
5220          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5221            {
5222            if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5223            if ((cd->ctypes[*p] & ctype_word) == 0) break;
5224            }
5225          if (*p != terminator)
5226            {
5227            *errorcodeptr = ERR57;
5228            break;
5229            }
5230          if (isnumber)
5231            {
5232            ptr++;
5233            goto HANDLE_NUMERICAL_RECURSION;
5234            }
5235          is_recurse = TRUE;
5236          goto NAMED_REF_OR_RECURSE;
5237          }
5238
5239        /* Test a signed number in angle brackets or quotes. */
5240
5241        p = ptr + 2;
5242        while ((digitab[*p] & ctype_digit) != 0) p++;
5243        if (*p != terminator)
5244          {
5245          *errorcodeptr = ERR57;
5246          break;
5247          }
5248        ptr++;
5249        goto HANDLE_NUMERICAL_RECURSION;
5250        }
5251
5252      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5253      We also support \k{name} (.NET syntax) */
5254
5255      if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5256        {
5257        is_recurse = FALSE;
5258        terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5259        goto NAMED_REF_OR_RECURSE;
5260        }
5261
5262      /* Back references are handled specially; must disable firstbyte if
5263      not set to cope with cases like (?=(\w+))\1: which would otherwise set
5264      ':' later. */
5265
5266      if (-c >= ESC_REF)
5267        {
5268        recno = -c - ESC_REF;
5269
5270        HANDLE_REFERENCE:    /* Come here from named backref handling */
5271        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5272        previous = code;
5273        *code++ = OP_REF;
5274        PUT2INC(code, 0, recno);
5275        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5276        if (recno > cd->top_backref) cd->top_backref = recno;
5277        }
5278
5279      /* So are Unicode property matches, if supported. */
5280
5281#ifdef SUPPORT_UCP
5282      else if (-c == ESC_P || -c == ESC_p)
5283        {
5284        BOOL negated;
5285        int pdata;
5286        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5287        if (ptype < 0) goto FAILED;
5288        previous = code;
5289        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5290        *code++ = ptype;
5291        *code++ = pdata;
5292        }
5293#else
5294
5295      /* If Unicode properties are not supported, \X, \P, and \p are not
5296      allowed. */
5297
5298      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5299        {
5300        *errorcodeptr = ERR45;
5301        goto FAILED;
5302        }
5303#endif
5304
5305      /* For the rest (including \X when Unicode properties are supported), we
5306      can obtain the OP value by negating the escape value. */
5307
5308      else
5309        {
5310        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5311        *code++ = -c;
5312        }
5313      continue;
5314      }
5315
5316    /* We have a data character whose value is in c. In UTF-8 mode it may have
5317    a value > 127. We set its representation in the length/buffer, and then
5318    handle it as a data character. */
5319
5320#ifdef SUPPORT_UTF8
5321    if (utf8 && c > 127)
5322      mclength = _pcre_ord2utf8(c, mcbuffer);
5323    else
5324#endif
5325
5326     {
5327     mcbuffer[0] = c;
5328     mclength = 1;
5329     }
5330    goto ONE_CHAR;
5331
5332
5333    /* ===================================================================*/
5334    /* Handle a literal character. It is guaranteed not to be whitespace or #
5335    when the extended flag is set. If we are in UTF-8 mode, it may be a
5336    multi-byte literal character. */
5337
5338    default:
5339    NORMAL_CHAR:
5340    mclength = 1;
5341    mcbuffer[0] = c;
5342
5343#ifdef SUPPORT_UTF8
5344    if (utf8 && c >= 0xc0)
5345      {
5346      while ((ptr[1] & 0xc0) == 0x80)
5347        mcbuffer[mclength++] = *(++ptr);
5348      }
5349#endif
5350
5351    /* At this point we have the character's bytes in mcbuffer, and the length
5352    in mclength. When not in UTF-8 mode, the length is always 1. */
5353
5354    ONE_CHAR:
5355    previous = code;
5356    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5357    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5358
5359    /* Remember if \r or \n were seen */
5360
5361    if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5362      cd->external_flags |= PCRE_HASCRORLF;
5363
5364    /* Set the first and required bytes appropriately. If no previous first
5365    byte, set it from this character, but revert to none on a zero repeat.
5366    Otherwise, leave the firstbyte value alone, and don't change it on a zero
5367    repeat. */
5368
5369    if (firstbyte == REQ_UNSET)
5370      {
5371      zerofirstbyte = REQ_NONE;
5372      zeroreqbyte = reqbyte;
5373
5374      /* If the character is more than one byte long, we can set firstbyte
5375      only if it is not to be matched caselessly. */
5376
5377      if (mclength == 1 || req_caseopt == 0)
5378        {
5379        firstbyte = mcbuffer[0] | req_caseopt;
5380        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5381        }
5382      else firstbyte = reqbyte = REQ_NONE;
5383      }
5384
5385    /* firstbyte was previously set; we can set reqbyte only if the length is
5386    1 or the matching is caseful. */
5387
5388    else
5389      {
5390      zerofirstbyte = firstbyte;
5391      zeroreqbyte = reqbyte;
5392      if (mclength == 1 || req_caseopt == 0)
5393        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5394      }
5395
5396    break;            /* End of literal character handling */
5397    }
5398  }                   /* end of big loop */
5399
5400
5401/* Control never reaches here by falling through, only by a goto for all the
5402error states. Pass back the position in the pattern so that it can be displayed
5403to the user for diagnosing the error. */
5404
5405FAILED:
5406*ptrptr = ptr;
5407return FALSE;
5408}
5409
5410
5411
5412
5413/*************************************************
5414*     Compile sequence of alternatives           *
5415*************************************************/
5416
5417/* On entry, ptr is pointing past the bracket character, but on return it
5418points to the closing bracket, or vertical bar, or end of string. The code
5419variable is pointing at the byte into which the BRA operator has been stored.
5420If the ims options are changed at the start (for a (?ims: group) or during any
5421branch, we need to insert an OP_OPT item at the start of every following branch
5422to ensure they get set correctly at run time, and also pass the new options
5423into every subsequent branch compile.
5424
5425This function is used during the pre-compile phase when we are trying to find
5426out the amount of memory needed, as well as during the real compile phase. The
5427value of lengthptr distinguishes the two phases.
5428
5429Arguments:
5430  options        option bits, including any changes for this subpattern
5431  oldims         previous settings of ims option bits
5432  codeptr        -> the address of the current code pointer
5433  ptrptr         -> the address of the current pattern pointer
5434  errorcodeptr   -> pointer to error code variable
5435  lookbehind     TRUE if this is a lookbehind assertion
5436  reset_bracount TRUE to reset the count for each branch
5437  skipbytes      skip this many bytes at start (for brackets and OP_COND)
5438  firstbyteptr   place to put the first required character, or a negative number
5439  reqbyteptr     place to put the last required character, or a negative number
5440  bcptr          pointer to the chain of currently open branches
5441  cd             points to the data block with tables pointers etc.
5442  lengthptr      NULL during the real compile phase
5443                 points to length accumulator during pre-compile phase
5444
5445Returns:         TRUE on success
5446*/
5447
5448static BOOL
5449compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5450  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5451  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5452  int *lengthptr)
5453{
5454const uschar *ptr = *ptrptr;
5455uschar *code = *codeptr;
5456uschar *last_branch = code;
5457uschar *start_bracket = code;
5458uschar *reverse_count = NULL;
5459int firstbyte, reqbyte;
5460int branchfirstbyte, branchreqbyte;
5461int length;
5462int orig_bracount;
5463int max_bracount;
5464branch_chain bc;
5465
5466bc.outer = bcptr;
5467bc.current = code;
5468
5469firstbyte = reqbyte = REQ_UNSET;
5470
5471/* Accumulate the length for use in the pre-compile phase. Start with the
5472length of the BRA and KET and any extra bytes that are required at the
5473beginning. We accumulate in a local variable to save frequent testing of
5474lenthptr for NULL. We cannot do this by looking at the value of code at the
5475start and end of each alternative, because compiled items are discarded during
5476the pre-compile phase so that the work space is not exceeded. */
5477
5478length = 2 + 2*LINK_SIZE + skipbytes;
5479
5480/* WARNING: If the above line is changed for any reason, you must also change
5481the code that abstracts option settings at the start of the pattern and makes
5482them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5483pre-compile phase to find out whether anything has yet been compiled or not. */
5484
5485/* Offset is set zero to mark that this bracket is still open */
5486
5487PUT(code, 1, 0);
5488code += 1 + LINK_SIZE + skipbytes;
5489
5490/* Loop for each alternative branch */
5491
5492orig_bracount = max_bracount = cd->bracount;
5493for (;;)
5494  {
5495  /* For a (?| group, reset the capturing bracket count so that each branch
5496  uses the same numbers. */
5497
5498  if (reset_bracount) cd->bracount = orig_bracount;
5499
5500  /* Handle a change of ims options at the start of the branch */
5501
5502  if ((options & PCRE_IMS) != oldims)
5503    {
5504    *code++ = OP_OPT;
5505    *code++ = options & PCRE_IMS;
5506    length += 2;
5507    }
5508
5509  /* Set up dummy OP_REVERSE if lookbehind assertion */
5510
5511  if (lookbehind)
5512    {
5513    *code++ = OP_REVERSE;
5514    reverse_count = code;
5515    PUTINC(code, 0, 0);
5516    length += 1 + LINK_SIZE;
5517    }
5518
5519  /* Now compile the branch; in the pre-compile phase its length gets added
5520  into the length. */
5521
5522  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5523        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5524    {
5525    *ptrptr = ptr;
5526    return FALSE;
5527    }
5528
5529  /* Keep the highest bracket count in case (?| was used and some branch
5530  has fewer than the rest. */
5531
5532  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5533
5534  /* In the real compile phase, there is some post-processing to be done. */
5535
5536  if (lengthptr == NULL)
5537    {
5538    /* If this is the first branch, the firstbyte and reqbyte values for the
5539    branch become the values for the regex. */
5540
5541    if (*last_branch != OP_ALT)
5542      {
5543      firstbyte = branchfirstbyte;
5544      reqbyte = branchreqbyte;
5545      }
5546
5547    /* If this is not the first branch, the first char and reqbyte have to
5548    match the values from all the previous branches, except that if the
5549    previous value for reqbyte didn't have REQ_VARY set, it can still match,
5550    and we set REQ_VARY for the regex. */
5551
5552    else
5553      {
5554      /* If we previously had a firstbyte, but it doesn't match the new branch,
5555      we have to abandon the firstbyte for the regex, but if there was
5556      previously no reqbyte, it takes on the value of the old firstbyte. */
5557
5558      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5559        {
5560        if (reqbyte < 0) reqbyte = firstbyte;
5561        firstbyte = REQ_NONE;
5562        }
5563
5564      /* If we (now or from before) have no firstbyte, a firstbyte from the
5565      branch becomes a reqbyte if there isn't a branch reqbyte. */
5566
5567      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5568          branchreqbyte = branchfirstbyte;
5569
5570      /* Now ensure that the reqbytes match */
5571
5572      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5573        reqbyte = REQ_NONE;
5574      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5575      }
5576
5577    /* If lookbehind, check that this branch matches a fixed-length string, and
5578    put the length into the OP_REVERSE item. Temporarily mark the end of the
5579    branch with OP_END. */
5580
5581    if (lookbehind)
5582      {
5583      int fixed_length;
5584      *code = OP_END;
5585      fixed_length = find_fixedlength(last_branch, options);
5586      DPRINTF(("fixed length = %d\n", fixed_length));
5587      if (fixed_length < 0)
5588        {
5589        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5590        *ptrptr = ptr;
5591        return FALSE;
5592        }
5593      PUT(reverse_count, 0, fixed_length);
5594      }
5595    }
5596
5597  /* Reached end of expression, either ')' or end of pattern. In the real
5598  compile phase, go back through the alternative branches and reverse the chain
5599  of offsets, with the field in the BRA item now becoming an offset to the
5600  first alternative. If there are no alternatives, it points to the end of the
5601  group. The length in the terminating ket is always the length of the whole
5602  bracketed item. If any of the ims options were changed inside the group,
5603  compile a resetting op-code following, except at the very end of the pattern.
5604  Return leaving the pointer at the terminating char. */
5605
5606  if (*ptr != '|')
5607    {
5608    if (lengthptr == NULL)
5609      {
5610      int branch_length = code - last_branch;
5611      do
5612        {
5613        int prev_length = GET(last_branch, 1);
5614        PUT(last_branch, 1, branch_length);
5615        branch_length = prev_length;
5616        last_branch -= branch_length;
5617        }
5618      while (branch_length > 0);
5619      }
5620
5621    /* Fill in the ket */
5622
5623    *code = OP_KET;
5624    PUT(code, 1, code - start_bracket);
5625    code += 1 + LINK_SIZE;
5626
5627    /* Resetting option if needed */
5628
5629    if ((options & PCRE_IMS) != oldims && *ptr == ')')
5630      {
5631      *code++ = OP_OPT;
5632      *code++ = oldims;
5633      length += 2;
5634      }
5635
5636    /* Retain the highest bracket number, in case resetting was used. */
5637
5638    cd->bracount = max_bracount;
5639
5640    /* Set values to pass back */
5641
5642    *codeptr = code;
5643    *ptrptr = ptr;
5644    *firstbyteptr = firstbyte;
5645    *reqbyteptr = reqbyte;
5646    if (lengthptr != NULL)
5647      {
5648      if (OFLOW_MAX - *lengthptr < length)
5649        {
5650        *errorcodeptr = ERR20;
5651        return FALSE;
5652        }
5653      *lengthptr += length;
5654      }
5655    return TRUE;
5656    }
5657
5658  /* Another branch follows. In the pre-compile phase, we can move the code
5659  pointer back to where it was for the start of the first branch. (That is,
5660  pretend that each branch is the only one.)
5661
5662  In the real compile phase, insert an ALT node. Its length field points back
5663  to the previous branch while the bracket remains open. At the end the chain
5664  is reversed. It's done like this so that the start of the bracket has a
5665  zero offset until it is closed, making it possible to detect recursion. */
5666
5667  if (lengthptr != NULL)
5668    {
5669    code = *codeptr + 1 + LINK_SIZE + skipbytes;
5670    length += 1 + LINK_SIZE;
5671    }
5672  else
5673    {
5674    *code = OP_ALT;
5675    PUT(code, 1, code - last_branch);
5676    bc.current = last_branch = code;
5677    code += 1 + LINK_SIZE;
5678    }
5679
5680  ptr++;
5681  }
5682/* Control never reaches here */
5683}
5684
5685
5686
5687
5688/*************************************************
5689*          Check for anchored expression         *
5690*************************************************/
5691
5692/* Try to find out if this is an anchored regular expression. Consider each
5693alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5694all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5695it's anchored. However, if this is a multiline pattern, then only OP_SOD
5696counts, since OP_CIRC can match in the middle.
5697
5698We can also consider a regex to be anchored if OP_SOM starts all its branches.
5699This is the code for \G, which means "match at start of match position, taking
5700into account the match offset".
5701
5702A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5703because that will try the rest of the pattern at all possible matching points,
5704so there is no point trying again.... er ....
5705
5706.... except when the .* appears inside capturing parentheses, and there is a
5707subsequent back reference to those parentheses. We haven't enough information
5708to catch that case precisely.
5709
5710At first, the best we could do was to detect when .* was in capturing brackets
5711and the highest back reference was greater than or equal to that level.
5712However, by keeping a bitmap of the first 31 back references, we can catch some
5713of the more common cases more precisely.
5714
5715Arguments:
5716  code           points to start of expression (the bracket)
5717  options        points to the options setting
5718  bracket_map    a bitmap of which brackets we are inside while testing; this
5719                  handles up to substring 31; after that we just have to take
5720                  the less precise approach
5721  backref_map    the back reference bitmap
5722
5723Returns:     TRUE or FALSE
5724*/
5725
5726static BOOL
5727is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5728  unsigned int backref_map)
5729{
5730do {
5731   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5732     options, PCRE_MULTILINE, FALSE);
5733   register int op = *scode;
5734
5735   /* Non-capturing brackets */
5736
5737   if (op == OP_BRA)
5738     {
5739     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5740     }
5741
5742   /* Capturing brackets */
5743
5744   else if (op == OP_CBRA)
5745     {
5746     int n = GET2(scode, 1+LINK_SIZE);
5747     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5748     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5749     }
5750
5751   /* Other brackets */
5752
5753   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5754     {
5755     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5756     }
5757
5758   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5759   it isn't in brackets that are or may be referenced. */
5760
5761   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5762             op == OP_TYPEPOSSTAR))
5763     {
5764     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5765       return FALSE;
5766     }
5767
5768   /* Check for explicit anchoring */
5769
5770   else if (op != OP_SOD && op != OP_SOM &&
5771           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5772     return FALSE;
5773   code += GET(code, 1);
5774   }
5775while (*code == OP_ALT);   /* Loop for each alternative */
5776return TRUE;
5777}
5778
5779
5780
5781/*************************************************
5782*         Check for starting with ^ or .*        *
5783*************************************************/
5784
5785/* This is called to find out if every branch starts with ^ or .* so that
5786"first char" processing can be done to speed things up in multiline
5787matching and for non-DOTALL patterns that start with .* (which must start at
5788the beginning or after \n). As in the case of is_anchored() (see above), we
5789have to take account of back references to capturing brackets that contain .*
5790because in that case we can't make the assumption.
5791
5792Arguments:
5793  code           points to start of expression (the bracket)
5794  bracket_map    a bitmap of which brackets we are inside while testing; this
5795                  handles up to substring 31; after that we just have to take
5796                  the less precise approach
5797  backref_map    the back reference bitmap
5798
5799Returns:         TRUE or FALSE
5800*/
5801
5802static BOOL
5803is_startline(const uschar *code, unsigned int bracket_map,
5804  unsigned int backref_map)
5805{
5806do {
5807   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5808     NULL, 0, FALSE);
5809   register int op = *scode;
5810
5811   /* Non-capturing brackets */
5812
5813   if (op == OP_BRA)
5814     {
5815     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5816     }
5817
5818   /* Capturing brackets */
5819
5820   else if (op == OP_CBRA)
5821     {
5822     int n = GET2(scode, 1+LINK_SIZE);
5823     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5824     if (!is_startline(scode, new_map, backref_map)) return FALSE;
5825     }
5826
5827   /* Other brackets */
5828
5829   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5830     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5831
5832   /* .* means "start at start or after \n" if it isn't in brackets that
5833   may be referenced. */
5834
5835   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5836     {
5837     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5838     }
5839
5840   /* Check for explicit circumflex */
5841
5842   else if (op != OP_CIRC) return FALSE;
5843
5844   /* Move on to the next alternative */
5845
5846   code += GET(code, 1);
5847   }
5848while (*code == OP_ALT);  /* Loop for each alternative */
5849return TRUE;
5850}
5851
5852
5853
5854/*************************************************
5855*       Check for asserted fixed first char      *
5856*************************************************/
5857
5858/* During compilation, the "first char" settings from forward assertions are
5859discarded, because they can cause conflicts with actual literals that follow.
5860However, if we end up without a first char setting for an unanchored pattern,
5861it is worth scanning the regex to see if there is an initial asserted first
5862char. If all branches start with the same asserted char, or with a bracket all
5863of whose alternatives start with the same asserted char (recurse ad lib), then
5864we return that char, otherwise -1.
5865
5866Arguments:
5867  code       points to start of expression (the bracket)
5868  options    pointer to the options (used to check casing changes)
5869  inassert   TRUE if in an assertion
5870
5871Returns:     -1 or the fixed first char
5872*/
5873
5874static int
5875find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5876{
5877register int c = -1;
5878do {
5879   int d;
5880   const uschar *scode =
5881     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5882   register int op = *scode;
5883
5884   switch(op)
5885     {
5886     default:
5887     return -1;
5888
5889     case OP_BRA:
5890     case OP_CBRA:
5891     case OP_ASSERT:
5892     case OP_ONCE:
5893     case OP_COND:
5894     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5895       return -1;
5896     if (c < 0) c = d; else if (c != d) return -1;
5897     break;
5898
5899     case OP_EXACT:       /* Fall through */
5900     scode += 2;
5901
5902     case OP_CHAR:
5903     case OP_CHARNC:
5904     case OP_PLUS:
5905     case OP_MINPLUS:
5906     case OP_POSPLUS:
5907     if (!inassert) return -1;
5908     if (c < 0)
5909       {
5910       c = scode[1];
5911       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5912       }
5913     else if (c != scode[1]) return -1;
5914     break;
5915     }
5916
5917   code += GET(code, 1);
5918   }
5919while (*code == OP_ALT);
5920return c;
5921}
5922
5923
5924
5925/*************************************************
5926*        Compile a Regular Expression            *
5927*************************************************/
5928
5929/* This function takes a string and returns a pointer to a block of store
5930holding a compiled version of the expression. The original API for this
5931function had no error code return variable; it is retained for backwards
5932compatibility. The new function is given a new name.
5933
5934Arguments:
5935  pattern       the regular expression
5936  options       various option bits
5937  errorcodeptr  pointer to error code variable (pcre_compile2() only)
5938                  can be NULL if you don't want a code value
5939  errorptr      pointer to pointer to error text
5940  erroroffset   ptr offset in pattern where error was detected
5941  tables        pointer to character tables or NULL
5942
5943Returns:        pointer to compiled data block, or NULL on error,
5944                with errorptr and erroroffset set
5945*/
5946
5947PCRE_EXP_DEFN pcre *
5948pcre_compile(const char *pattern, int options, const char **errorptr,
5949  int *erroroffset, const unsigned char *tables)
5950{
5951return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5952}
5953
5954
5955PCRE_EXP_DEFN pcre *
5956pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5957  const char **errorptr, int *erroroffset, const unsigned char *tables)
5958{
5959real_pcre *re;
5960int length = 1;  /* For final END opcode */
5961int firstbyte, reqbyte, newline;
5962int errorcode = 0;
5963int skipatstart = 0;
5964#ifdef SUPPORT_UTF8
5965BOOL utf8;
5966#endif
5967size_t size;
5968uschar *code;
5969const uschar *codestart;
5970const uschar *ptr;
5971compile_data compile_block;
5972compile_data *cd = &compile_block;
5973
5974/* This space is used for "compiling" into during the first phase, when we are
5975computing the amount of memory that is needed. Compiled items are thrown away
5976as soon as possible, so that a fairly large buffer should be sufficient for
5977this purpose. The same space is used in the second phase for remembering where
5978to fill in forward references to subpatterns. */
5979
5980uschar cworkspace[COMPILE_WORK_SIZE];
5981
5982/* Set this early so that early errors get offset 0. */
5983
5984ptr = (const uschar *)pattern;
5985
5986/* We can't pass back an error message if errorptr is NULL; I guess the best we
5987can do is just return NULL, but we can set a code value if there is a code
5988pointer. */
5989
5990if (errorptr == NULL)
5991  {
5992  if (errorcodeptr != NULL) *errorcodeptr = 99;
5993  return NULL;
5994  }
5995
5996*errorptr = NULL;
5997if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5998
5999/* However, we can give a message for this error */
6000
6001if (erroroffset == NULL)
6002  {
6003  errorcode = ERR16;
6004  goto PCRE_EARLY_ERROR_RETURN2;
6005  }
6006
6007*erroroffset = 0;
6008
6009/* Can't support UTF8 unless PCRE has been compiled to include the code. */
6010
6011#ifdef SUPPORT_UTF8
6012utf8 = (options & PCRE_UTF8) != 0;
6013if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6014     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6015  {
6016  errorcode = ERR44;
6017  goto PCRE_EARLY_ERROR_RETURN2;
6018  }
6019#else
6020if ((options & PCRE_UTF8) != 0)
6021  {
6022  errorcode = ERR32;
6023  goto PCRE_EARLY_ERROR_RETURN;
6024  }
6025#endif
6026
6027if ((options & ~PUBLIC_OPTIONS) != 0)
6028  {
6029  errorcode = ERR17;
6030  goto PCRE_EARLY_ERROR_RETURN;
6031  }
6032
6033/* Set up pointers to the individual character tables */
6034
6035if (tables == NULL) tables = _pcre_default_tables;
6036cd->lcc = tables + lcc_offset;
6037cd->fcc = tables + fcc_offset;
6038cd->cbits = tables + cbits_offset;
6039cd->ctypes = tables + ctypes_offset;
6040
6041/* Check for global one-time settings at the start of the pattern, and remember
6042the offset for later. */
6043
6044while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
6045  {
6046  int newnl = 0;
6047  int newbsr = 0;
6048
6049  if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
6050    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6051  else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
6052    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6053  else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
6054    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6055  else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
6056    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6057  else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
6058    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6059
6060  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
6061    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6062  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
6063    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6064
6065  if (newnl != 0)
6066    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6067  else if (newbsr != 0)
6068    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6069  else break;
6070  }
6071
6072/* Check validity of \R options. */
6073
6074switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6075  {
6076  case 0:
6077  case PCRE_BSR_ANYCRLF:
6078  case PCRE_BSR_UNICODE:
6079  break;
6080  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6081  }
6082
6083/* Handle different types of newline. The three bits give seven cases. The
6084current code allows for fixed one- or two-byte sequences, plus "any" and
6085"anycrlf". */
6086
6087switch (options & PCRE_NEWLINE_BITS)
6088  {
6089  case 0: newline = NEWLINE; break;   /* Build-time default */
6090  case PCRE_NEWLINE_CR: newline = '\r'; break;
6091  case PCRE_NEWLINE_LF: newline = '\n'; break;
6092  case PCRE_NEWLINE_CR+
6093       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6094  case PCRE_NEWLINE_ANY: newline = -1; break;
6095  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6096  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6097  }
6098
6099if (newline == -2)
6100  {
6101  cd->nltype = NLTYPE_ANYCRLF;
6102  }
6103else if (newline < 0)
6104  {
6105  cd->nltype = NLTYPE_ANY;
6106  }
6107else
6108  {
6109  cd->nltype = NLTYPE_FIXED;
6110  if (newline > 255)
6111    {
6112    cd->nllen = 2;
6113    cd->nl[0] = (newline >> 8) & 255;
6114    cd->nl[1] = newline & 255;
6115    }
6116  else
6117    {
6118    cd->nllen = 1;
6119    cd->nl[0] = newline;
6120    }
6121  }
6122
6123/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6124references to help in deciding whether (.*) can be treated as anchored or not.
6125*/
6126
6127cd->top_backref = 0;
6128cd->backref_map = 0;
6129
6130/* Reflect pattern for debugging output */
6131
6132DPRINTF(("------------------------------------------------------------------\n"));
6133DPRINTF(("%s\n", pattern));
6134
6135/* Pretend to compile the pattern while actually just accumulating the length
6136of memory required. This behaviour is triggered by passing a non-NULL final
6137argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6138to compile parts of the pattern into; the compiled code is discarded when it is
6139no longer needed, so hopefully this workspace will never overflow, though there
6140is a test for its doing so. */
6141
6142cd->bracount = cd->final_bracount = 0;
6143cd->names_found = 0;
6144cd->name_entry_size = 0;
6145cd->name_table = NULL;
6146cd->start_workspace = cworkspace;
6147cd->start_code = cworkspace;
6148cd->hwm = cworkspace;
6149cd->start_pattern = (const uschar *)pattern;
6150cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6151cd->req_varyopt = 0;
6152cd->external_options = options;
6153cd->external_flags = 0;
6154
6155/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6156don't need to look at the result of the function here. The initial options have
6157been put into the cd block so that they can be changed if an option setting is
6158found within the regex right at the beginning. Bringing initial option settings
6159outside can help speed up starting point checks. */
6160
6161ptr += skipatstart;
6162code = cworkspace;
6163*code = OP_BRA;
6164(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6165  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6166  &length);
6167if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6168
6169DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6170  cd->hwm - cworkspace));
6171
6172if (length > MAX_PATTERN_SIZE)
6173  {
6174  errorcode = ERR20;
6175  goto PCRE_EARLY_ERROR_RETURN;
6176  }
6177
6178/* Compute the size of data block needed and get it, either from malloc or
6179externally provided function. Integer overflow should no longer be possible
6180because nowadays we limit the maximum value of cd->names_found and
6181cd->name_entry_size. */
6182
6183size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6184re = (real_pcre *)(pcre_malloc)(size);
6185
6186if (re == NULL)
6187  {
6188  errorcode = ERR21;
6189  goto PCRE_EARLY_ERROR_RETURN;
6190  }
6191
6192/* Put in the magic number, and save the sizes, initial options, internal
6193flags, and character table pointer. NULL is used for the default character
6194tables. The nullpad field is at the end; it's there to help in the case when a
6195regex compiled on a system with 4-byte pointers is run on another with 8-byte
6196pointers. */
6197
6198re->magic_number = MAGIC_NUMBER;
6199re->size = size;
6200re->options = cd->external_options;
6201re->flags = cd->external_flags;
6202re->dummy1 = 0;
6203re->first_byte = 0;
6204re->req_byte = 0;
6205re->name_table_offset = sizeof(real_pcre);
6206re->name_entry_size = cd->name_entry_size;
6207re->name_count = cd->names_found;
6208re->ref_count = 0;
6209re->tables = (tables == _pcre_default_tables)? NULL : tables;
6210re->nullpad = NULL;
6211
6212/* The starting points of the name/number translation table and of the code are
6213passed around in the compile data block. The start/end pattern and initial
6214options are already set from the pre-compile phase, as is the name_entry_size
6215field. Reset the bracket count and the names_found field. Also reset the hwm
6216field; this time it's used for remembering forward references to subpatterns.
6217*/
6218
6219cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6220cd->bracount = 0;
6221cd->names_found = 0;
6222cd->name_table = (uschar *)re + re->name_table_offset;
6223codestart = cd->name_table + re->name_entry_size * re->name_count;
6224cd->start_code = codestart;
6225cd->hwm = cworkspace;
6226cd->req_varyopt = 0;
6227cd->had_accept = FALSE;
6228
6229/* Set up a starting, non-extracting bracket, then compile the expression. On
6230error, errorcode will be set non-zero, so we don't need to look at the result
6231of the function here. */
6232
6233ptr = (const uschar *)pattern + skipatstart;
6234code = (uschar *)codestart;
6235*code = OP_BRA;
6236(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6237  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6238re->top_bracket = cd->bracount;
6239re->top_backref = cd->top_backref;
6240re->flags = cd->external_flags;
6241
6242if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6243
6244/* If not reached end of pattern on success, there's an excess bracket. */
6245
6246if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6247
6248/* Fill in the terminating state and check for disastrous overflow, but
6249if debugging, leave the test till after things are printed out. */
6250
6251*code++ = OP_END;
6252
6253#ifndef DEBUG
6254if (code - codestart > length) errorcode = ERR23;
6255#endif
6256
6257/* Fill in any forward references that are required. */
6258
6259while (errorcode == 0 && cd->hwm > cworkspace)
6260  {
6261  int offset, recno;
6262  const uschar *groupptr;
6263  cd->hwm -= LINK_SIZE;
6264  offset = GET(cd->hwm, 0);
6265  recno = GET(codestart, offset);
6266  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6267  if (groupptr == NULL) errorcode = ERR53;
6268    else PUT(((uschar *)codestart), offset, groupptr - codestart);
6269  }
6270
6271/* Give an error if there's back reference to a non-existent capturing
6272subpattern. */
6273
6274if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6275
6276/* Failed to compile, or error while post-processing */