source: project/chicken/pcre/pcre_compile.c @ 2926

Last change on this file since 2926 was 2926, checked in by felix winkelmann, 14 years ago

new chicken version (pcre merged), aalib

File size: 156.1 KB
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2005 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#include "pcre_internal.h"
46
47
48/*************************************************
49*      Code parameters and static tables         *
50*************************************************/
51
52/* Maximum number of items on the nested bracket stacks at compile time. This
53applies to the nesting of all kinds of parentheses. It does not limit
54un-nested, non-capturing parentheses. This number can be made bigger if
55necessary - it is used to dimension one int and one unsigned char vector at
56compile time. */
57
58#define BRASTACK_SIZE 200
59
60
61/* Table for handling escaped characters in the range '0'-'z'. Positive returns
62are simple data values; negative values are for special things like \d and so
63on. Zero means further processing is needed (for things like \x), or the escape
64is invalid. */
65
66#if !EBCDIC   /* This is the "normal" table for ASCII systems */
67static const short int escapes[] = {
68     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
69     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
70   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
71     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
72-ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
73-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
74   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
75     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
76-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
77     0,      0, -ESC_z                                            /* x - z */
78};
79
80#else         /* This is the "abnormal" table for EBCDIC systems */
81static const short int escapes[] = {
82/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
83/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
84/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
85/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
86/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
87/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
88/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
89/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
90/*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
91/*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
92/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
93/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
94/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
95/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
96/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
97/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
98/*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
99/*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
100/*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
101/*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
102/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
103/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
104/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
105};
106#endif
107
108
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
112
113static const char *const posix_names[] = {
114  "alpha", "lower", "upper",
115  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
116  "print", "punct", "space", "word",  "xdigit" };
117
118static const uschar posix_name_lengths[] = {
119  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
120
121/* Table of class bit maps for each POSIX class; up to three may be combined
122to form the class. The table for [:blank:] is dynamically modified to remove
123the vertical space characters. */
124
125static const int posix_class_maps[] = {
126  cbit_lower, cbit_upper, -1,             /* alpha */
127  cbit_lower, -1,         -1,             /* lower */
128  cbit_upper, -1,         -1,             /* upper */
129  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
130  cbit_print, cbit_cntrl, -1,             /* ascii */
131  cbit_space, -1,         -1,             /* blank - a GNU extension */
132  cbit_cntrl, -1,         -1,             /* cntrl */
133  cbit_digit, -1,         -1,             /* digit */
134  cbit_graph, -1,         -1,             /* graph */
135  cbit_print, -1,         -1,             /* print */
136  cbit_punct, -1,         -1,             /* punct */
137  cbit_space, -1,         -1,             /* space */
138  cbit_word,  -1,         -1,             /* word - a Perl extension */
139  cbit_xdigit,-1,         -1              /* xdigit */
140};
141
142
143/* The texts of compile-time error messages. These are "char *" because they
144are passed to the outside world. */
145
146static const char *error_texts[] = {
147  "no error",
148  "\\ at end of pattern",
149  "\\c at end of pattern",
150  "unrecognized character follows \\",
151  "numbers out of order in {} quantifier",
152  /* 5 */
153  "number too big in {} quantifier",
154  "missing terminating ] for character class",
155  "invalid escape sequence in character class",
156  "range out of order in character class",
157  "nothing to repeat",
158  /* 10 */
159  "operand of unlimited repeat could match the empty string",
160  "internal error: unexpected repeat",
161  "unrecognized character after (?",
162  "POSIX named classes are supported only within a class",
163  "missing )",
164  /* 15 */
165  "reference to non-existent subpattern",
166  "erroffset passed as NULL",
167  "unknown option bit(s) set",
168  "missing ) after comment",
169  "parentheses nested too deeply",
170  /* 20 */
171  "regular expression too large",
172  "failed to get memory",
173  "unmatched parentheses",
174  "internal error: code overflow",
175  "unrecognized character after (?<",
176  /* 25 */
177  "lookbehind assertion is not fixed length",
178  "malformed number after (?(",
179  "conditional group contains more than two branches",
180  "assertion expected after (?(",
181  "(?R or (?digits must be followed by )",
182  /* 30 */
183  "unknown POSIX class name",
184  "POSIX collating elements are not supported",
185  "this version of PCRE is not compiled with PCRE_UTF8 support",
186  "spare error",
187  "character value in \\x{...} sequence is too large",
188  /* 35 */
189  "invalid condition (?(0)",
190  "\\C not allowed in lookbehind assertion",
191  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
192  "number after (?C is > 255",
193  "closing ) for (?C expected",
194  /* 40 */
195  "recursive call could loop indefinitely",
196  "unrecognized character after (?P",
197  "syntax error after (?P",
198  "two named groups have the same name",
199  "invalid UTF-8 string",
200  /* 45 */
201  "support for \\P, \\p, and \\X has not been compiled",
202  "malformed \\P or \\p sequence",
203  "unknown property name after \\P or \\p"
204};
205
206
207/* Table to identify digits and hex digits. This is used when compiling
208patterns. Note that the tables in chartables are dependent on the locale, and
209may mark arbitrary characters as digits - but the PCRE compiling code expects
210to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
211a private table here. It costs 256 bytes, but it is a lot faster than doing
212character value tests (at least in some simple cases I timed), and in some
213applications one wants PCRE to compile efficiently as well as match
214efficiently.
215
216For convenience, we use the same bit definitions as in chartables:
217
218  0x04   decimal digit
219  0x08   hexadecimal digit
220
221Then we can use ctype_digit and ctype_xdigit in the code. */
222
223#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
224static const unsigned char digitab[] =
225  {
226  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
227  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
228  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
229  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
230  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
231  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
232  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
233  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
234  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
235  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
236  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
237  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
238  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
239  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
240  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
241  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
242  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
243  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
244  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
245  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
246  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
247  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
248  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
249  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
250  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
251  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
252  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
253  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
254  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
255  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
256  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
257  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
258
259#else          /* This is the "abnormal" case, for EBCDIC systems */
260static const unsigned char digitab[] =
261  {
262  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
263  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
264  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
265  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
266  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
267  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
268  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
269  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
270  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
271  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
272  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
273  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
274  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
275  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
276  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
277  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
278  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
279  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
280  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
281  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
282  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
283  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
284  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
285  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
286  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
287  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
288  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
289  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
290  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
291  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
292  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
293  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
294
295static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
296  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
297  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
298  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
299  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
300  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
301  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
302  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
303  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
304  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
305  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
306  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
307  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
308  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
309  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
310  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
311  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
312  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
313  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
314  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
315  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
316  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
317  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
318  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
319  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
320  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
321  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
322  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
323  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
324  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
325  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
326  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
327  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
328#endif
329
330
331/* Definition to allow mutual recursion */
332
333static BOOL
334  compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
335    int *, int *, branch_chain *, compile_data *);
336
337
338
339/*************************************************
340*            Handle escapes                      *
341*************************************************/
342
343/* This function is called when a \ has been encountered. It either returns a
344positive value for a simple escape such as \n, or a negative value which
345encodes one of the more complicated things such as \d. When UTF-8 is enabled,
346a positive value greater than 255 may be returned. On entry, ptr is pointing at
347the \. On exit, it is on the final character of the escape sequence.
348
349Arguments:
350  ptrptr         points to the pattern position pointer
351  errorcodeptr   points to the errorcode variable
352  bracount       number of previous extracting brackets
353  options        the options bits
354  isclass        TRUE if inside a character class
355
356Returns:         zero or positive => a data character
357                 negative => a special escape sequence
358                 on error, errorptr is set
359*/
360
361static int
362check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
363  int options, BOOL isclass)
364{
365const uschar *ptr = *ptrptr;
366int c, i;
367
368/* If backslash is at the end of the pattern, it's an error. */
369
370c = *(++ptr);
371if (c == 0) *errorcodeptr = ERR1;
372
373/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
374a table. A non-zero result is something that can be returned immediately.
375Otherwise further processing may be required. */
376
377#if !EBCDIC    /* ASCII coding */
378else if (c < '0' || c > 'z') {}                           /* Not alphameric */
379else if ((i = escapes[c - '0']) != 0) c = i;
380
381#else          /* EBCDIC coding */
382else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
383else if ((i = escapes[c - 0x48]) != 0)  c = i;
384#endif
385
386/* Escapes that need further processing, or are illegal. */
387
388else
389  {
390  const uschar *oldptr;
391  switch (c)
392    {
393    /* A number of Perl escapes are not handled by PCRE. We give an explicit
394    error. */
395
396    case 'l':
397    case 'L':
398    case 'N':
399    case 'u':
400    case 'U':
401    *errorcodeptr = ERR37;
402    break;
403
404    /* The handling of escape sequences consisting of a string of digits
405    starting with one that is not zero is not straightforward. By experiment,
406    the way Perl works seems to be as follows:
407
408    Outside a character class, the digits are read as a decimal number. If the
409    number is less than 10, or if there are that many previous extracting
410    left brackets, then it is a back reference. Otherwise, up to three octal
411    digits are read to form an escaped byte. Thus \123 is likely to be octal
412    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
413    value is greater than 377, the least significant 8 bits are taken. Inside a
414    character class, \ followed by a digit is always an octal number. */
415
416    case '1': case '2': case '3': case '4': case '5':
417    case '6': case '7': case '8': case '9':
418
419    if (!isclass)
420      {
421      oldptr = ptr;
422      c -= '0';
423      while ((digitab[ptr[1]] & ctype_digit) != 0)
424        c = c * 10 + *(++ptr) - '0';
425      if (c < 10 || c <= bracount)
426        {
427        c = -(ESC_REF + c);
428        break;
429        }
430      ptr = oldptr;      /* Put the pointer back and fall through */
431      }
432
433    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
434    generates a binary zero byte and treats the digit as a following literal.
435    Thus we have to pull back the pointer by one. */
436
437    if ((c = *ptr) >= '8')
438      {
439      ptr--;
440      c = 0;
441      break;
442      }
443
444    /* \0 always starts an octal number, but we may drop through to here with a
445    larger first octal digit. */
446
447    case '0':
448    c -= '0';
449    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
450        c = c * 8 + *(++ptr) - '0';
451    c &= 255;     /* Take least significant 8 bits */
452    break;
453
454    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
455    which can be greater than 0xff, but only if the ddd are hex digits. */
456
457    case 'x':
458#ifdef SUPPORT_UTF8
459    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
460      {
461      const uschar *pt = ptr + 2;
462      register int count = 0;
463      c = 0;
464      while ((digitab[*pt] & ctype_xdigit) != 0)
465        {
466        int cc = *pt++;
467        count++;
468#if !EBCDIC    /* ASCII coding */
469        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
470        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
471#else          /* EBCDIC coding */
472        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
473        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
474#endif
475        }
476      if (*pt == '}')
477        {
478        if (c < 0 || count > 8) *errorcodeptr = ERR34;
479        ptr = pt;
480        break;
481        }
482      /* If the sequence of hex digits does not end with '}', then we don't
483      recognize this construct; fall through to the normal \x handling. */
484      }
485#endif
486
487    /* Read just a single hex char */
488
489    c = 0;
490    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
491      {
492      int cc;                               /* Some compilers don't like ++ */
493      cc = *(++ptr);                        /* in initializers */
494#if !EBCDIC    /* ASCII coding */
495      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
496      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
497#else          /* EBCDIC coding */
498      if (cc <= 'z') cc += 64;              /* Convert to upper case */
499      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
500#endif
501      }
502    break;
503
504    /* Other special escapes not starting with a digit are straightforward */
505
506    case 'c':
507    c = *(++ptr);
508    if (c == 0)
509      {
510      *errorcodeptr = ERR2;
511      return 0;
512      }
513
514    /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
515    is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
516    (However, an EBCDIC equivalent has now been added.) */
517
518#if !EBCDIC    /* ASCII coding */
519    if (c >= 'a' && c <= 'z') c -= 32;
520    c ^= 0x40;
521#else          /* EBCDIC coding */
522    if (c >= 'a' && c <= 'z') c += 64;
523    c ^= 0xC0;
524#endif
525    break;
526
527    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
528    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
529    for Perl compatibility, it is a literal. This code looks a bit odd, but
530    there used to be some cases other than the default, and there may be again
531    in future, so I haven't "optimized" it. */
532
533    default:
534    if ((options & PCRE_EXTRA) != 0) switch(c)
535      {
536      default:
537      *errorcodeptr = ERR3;
538      break;
539      }
540    break;
541    }
542  }
543
544*ptrptr = ptr;
545return c;
546}
547
548
549
550#ifdef SUPPORT_UCP
551/*************************************************
552*               Handle \P and \p                 *
553*************************************************/
554
555/* This function is called after \P or \p has been encountered, provided that
556PCRE is compiled with support for Unicode properties. On entry, ptrptr is
557pointing at the P or p. On exit, it is pointing at the final character of the
558escape sequence.
559
560Argument:
561  ptrptr         points to the pattern position pointer
562  negptr         points to a boolean that is set TRUE for negation else FALSE
563  errorcodeptr   points to the error code variable
564
565Returns:     value from ucp_type_table, or -1 for an invalid type
566*/
567
568static int
569get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
570{
571int c, i, bot, top;
572const uschar *ptr = *ptrptr;
573char name[4];
574
575c = *(++ptr);
576if (c == 0) goto ERROR_RETURN;
577
578*negptr = FALSE;
579
580/* \P or \p can be followed by a one- or two-character name in {}, optionally
581preceded by ^ for negation. */
582
583if (c == '{')
584  {
585  if (ptr[1] == '^')
586    {
587    *negptr = TRUE;
588    ptr++;
589    }
590  for (i = 0; i <= 2; i++)
591    {
592    c = *(++ptr);
593    if (c == 0) goto ERROR_RETURN;
594    if (c == '}') break;
595    name[i] = c;
596    }
597  if (c !='}')   /* Try to distinguish error cases */
598    {
599    while (*(++ptr) != 0 && *ptr != '}');
600    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
601    }
602  name[i] = 0;
603  }
604
605/* Otherwise there is just one following character */
606
607else
608  {
609  name[0] = c;
610  name[1] = 0;
611  }
612
613*ptrptr = ptr;
614
615/* Search for a recognized property name using binary chop */
616
617bot = 0;
618top = _pcre_utt_size;
619
620while (bot < top)
621  {
622  i = (bot + top)/2;
623  c = strcmp(name, _pcre_utt[i].name);
624  if (c == 0) return _pcre_utt[i].value;
625  if (c > 0) bot = i + 1; else top = i;
626  }
627
628UNKNOWN_RETURN:
629*errorcodeptr = ERR47;
630*ptrptr = ptr;
631return -1;
632
633ERROR_RETURN:
634*errorcodeptr = ERR46;
635*ptrptr = ptr;
636return -1;
637}
638#endif
639
640
641
642
643/*************************************************
644*            Check for counted repeat            *
645*************************************************/
646
647/* This function is called when a '{' is encountered in a place where it might
648start a quantifier. It looks ahead to see if it really is a quantifier or not.
649It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
650where the ddds are digits.
651
652Arguments:
653  p         pointer to the first char after '{'
654
655Returns:    TRUE or FALSE
656*/
657
658static BOOL
659is_counted_repeat(const uschar *p)
660{
661if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
662while ((digitab[*p] & ctype_digit) != 0) p++;
663if (*p == '}') return TRUE;
664
665if (*p++ != ',') return FALSE;
666if (*p == '}') return TRUE;
667
668if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
669while ((digitab[*p] & ctype_digit) != 0) p++;
670
671return (*p == '}');
672}
673
674
675
676/*************************************************
677*         Read repeat counts                     *
678*************************************************/
679
680/* Read an item of the form {n,m} and return the values. This is called only
681after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
682so the syntax is guaranteed to be correct, but we need to check the values.
683
684Arguments:
685  p              pointer to first char after '{'
686  minp           pointer to int for min
687  maxp           pointer to int for max
688                 returned as -1 if no max
689  errorcodeptr   points to error code variable
690
691Returns:         pointer to '}' on success;
692                 current ptr on error, with errorcodeptr set non-zero
693*/
694
695static const uschar *
696read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
697{
698int min = 0;
699int max = -1;
700
701/* Read the minimum value and do a paranoid check: a negative value indicates
702an integer overflow. */
703
704while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
705if (min < 0 || min > 65535)
706  {
707  *errorcodeptr = ERR5;
708  return p;
709  }
710
711/* Read the maximum value if there is one, and again do a paranoid on its size.
712Also, max must not be less than min. */
713
714if (*p == '}') max = min; else
715  {
716  if (*(++p) != '}')
717    {
718    max = 0;
719    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
720    if (max < 0 || max > 65535)
721      {
722      *errorcodeptr = ERR5;
723      return p;
724      }
725    if (max < min)
726      {
727      *errorcodeptr = ERR4;
728      return p;
729      }
730    }
731  }
732
733/* Fill in the required variables, and pass back the pointer to the terminating
734'}'. */
735
736*minp = min;
737*maxp = max;
738return p;
739}
740
741
742
743/*************************************************
744*      Find first significant op code            *
745*************************************************/
746
747/* This is called by several functions that scan a compiled expression looking
748for a fixed first character, or an anchoring op code etc. It skips over things
749that do not influence this. For some calls, a change of option is important.
750For some calls, it makes sense to skip negative forward and all backward
751assertions, and also the \b assertion; for others it does not.
752
753Arguments:
754  code         pointer to the start of the group
755  options      pointer to external options
756  optbit       the option bit whose changing is significant, or
757                 zero if none are
758  skipassert   TRUE if certain assertions are to be skipped
759
760Returns:       pointer to the first significant opcode
761*/
762
763static const uschar*
764first_significant_code(const uschar *code, int *options, int optbit,
765  BOOL skipassert)
766{
767for (;;)
768  {
769  switch ((int)*code)
770    {
771    case OP_OPT:
772    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
773      *options = (int)code[1];
774    code += 2;
775    break;
776
777    case OP_ASSERT_NOT:
778    case OP_ASSERTBACK:
779    case OP_ASSERTBACK_NOT:
780    if (!skipassert) return code;
781    do code += GET(code, 1); while (*code == OP_ALT);
782    code += _pcre_OP_lengths[*code];
783    break;
784
785    case OP_WORD_BOUNDARY:
786    case OP_NOT_WORD_BOUNDARY:
787    if (!skipassert) return code;
788    /* Fall through */
789
790    case OP_CALLOUT:
791    case OP_CREF:
792    case OP_BRANUMBER:
793    code += _pcre_OP_lengths[*code];
794    break;
795
796    default:
797    return code;
798    }
799  }
800/* Control never reaches here */
801}
802
803
804
805
806/*************************************************
807*        Find the fixed length of a pattern      *
808*************************************************/
809
810/* Scan a pattern and compute the fixed length of subject that will match it,
811if the length is fixed. This is needed for dealing with backward assertions.
812In UTF8 mode, the result is in characters rather than bytes.
813
814Arguments:
815  code     points to the start of the pattern (the bracket)
816  options  the compiling options
817
818Returns:   the fixed length, or -1 if there is no fixed length,
819             or -2 if \C was encountered
820*/
821
822static int
823find_fixedlength(uschar *code, int options)
824{
825int length = -1;
826
827register int branchlength = 0;
828register uschar *cc = code + 1 + LINK_SIZE;
829
830/* Scan along the opcodes for this branch. If we get to the end of the
831branch, check the length against that of the other branches. */
832
833for (;;)
834  {
835  int d;
836  register int op = *cc;
837  if (op >= OP_BRA) op = OP_BRA;
838
839  switch (op)
840    {
841    case OP_BRA:
842    case OP_ONCE:
843    case OP_COND:
844    d = find_fixedlength(cc, options);
845    if (d < 0) return d;
846    branchlength += d;
847    do cc += GET(cc, 1); while (*cc == OP_ALT);
848    cc += 1 + LINK_SIZE;
849    break;
850
851    /* Reached end of a branch; if it's a ket it is the end of a nested
852    call. If it's ALT it is an alternation in a nested call. If it is
853    END it's the end of the outer call. All can be handled by the same code. */
854
855    case OP_ALT:
856    case OP_KET:
857    case OP_KETRMAX:
858    case OP_KETRMIN:
859    case OP_END:
860    if (length < 0) length = branchlength;
861      else if (length != branchlength) return -1;
862    if (*cc != OP_ALT) return length;
863    cc += 1 + LINK_SIZE;
864    branchlength = 0;
865    break;
866
867    /* Skip over assertive subpatterns */
868
869    case OP_ASSERT:
870    case OP_ASSERT_NOT:
871    case OP_ASSERTBACK:
872    case OP_ASSERTBACK_NOT:
873    do cc += GET(cc, 1); while (*cc == OP_ALT);
874    /* Fall through */
875
876    /* Skip over things that don't match chars */
877
878    case OP_REVERSE:
879    case OP_BRANUMBER:
880    case OP_CREF:
881    case OP_OPT:
882    case OP_CALLOUT:
883    case OP_SOD:
884    case OP_SOM:
885    case OP_EOD:
886    case OP_EODN:
887    case OP_CIRC:
888    case OP_DOLL:
889    case OP_NOT_WORD_BOUNDARY:
890    case OP_WORD_BOUNDARY:
891    cc += _pcre_OP_lengths[*cc];
892    break;
893
894    /* Handle literal characters */
895
896    case OP_CHAR:
897    case OP_CHARNC:
898    branchlength++;
899    cc += 2;
900#ifdef SUPPORT_UTF8
901    if ((options & PCRE_UTF8) != 0)
902      {
903      while ((*cc & 0xc0) == 0x80) cc++;
904      }
905#endif
906    break;
907
908    /* Handle exact repetitions. The count is already in characters, but we
909    need to skip over a multibyte character in UTF8 mode.  */
910
911    case OP_EXACT:
912    branchlength += GET2(cc,1);
913    cc += 4;
914#ifdef SUPPORT_UTF8
915    if ((options & PCRE_UTF8) != 0)
916      {
917      while((*cc & 0x80) == 0x80) cc++;
918      }
919#endif
920    break;
921
922    case OP_TYPEEXACT:
923    branchlength += GET2(cc,1);
924    cc += 4;
925    break;
926
927    /* Handle single-char matchers */
928
929    case OP_PROP:
930    case OP_NOTPROP:
931    cc++;
932    /* Fall through */
933
934    case OP_NOT_DIGIT:
935    case OP_DIGIT:
936    case OP_NOT_WHITESPACE:
937    case OP_WHITESPACE:
938    case OP_NOT_WORDCHAR:
939    case OP_WORDCHAR:
940    case OP_ANY:
941    branchlength++;
942    cc++;
943    break;
944
945    /* The single-byte matcher isn't allowed */
946
947    case OP_ANYBYTE:
948    return -2;
949
950    /* Check a class for variable quantification */
951
952#ifdef SUPPORT_UTF8
953    case OP_XCLASS:
954    cc += GET(cc, 1) - 33;
955    /* Fall through */
956#endif
957
958    case OP_CLASS:
959    case OP_NCLASS:
960    cc += 33;
961
962    switch (*cc)
963      {
964      case OP_CRSTAR:
965      case OP_CRMINSTAR:
966      case OP_CRQUERY:
967      case OP_CRMINQUERY:
968      return -1;
969
970      case OP_CRRANGE:
971      case OP_CRMINRANGE:
972      if (GET2(cc,1) != GET2(cc,3)) return -1;
973      branchlength += GET2(cc,1);
974      cc += 5;
975      break;
976
977      default:
978      branchlength++;
979      }
980    break;
981
982    /* Anything else is variable length */
983
984    default:
985    return -1;
986    }
987  }
988/* Control never gets here */
989}
990
991
992
993
994/*************************************************
995*    Scan compiled regex for numbered bracket    *
996*************************************************/
997
998/* This little function scans through a compiled pattern until it finds a
999capturing bracket with the given number.
1000
1001Arguments:
1002  code        points to start of expression
1003  utf8        TRUE in UTF-8 mode
1004  number      the required bracket number
1005
1006Returns:      pointer to the opcode for the bracket, or NULL if not found
1007*/
1008
1009static const uschar *
1010find_bracket(const uschar *code, BOOL utf8, int number)
1011{
1012#ifndef SUPPORT_UTF8
1013utf8 = utf8;               /* Stop pedantic compilers complaining */
1014#endif
1015
1016for (;;)
1017  {
1018  register int c = *code;
1019  if (c == OP_END) return NULL;
1020  else if (c > OP_BRA)
1021    {
1022    int n = c - OP_BRA;
1023    if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1024    if (n == number) return (uschar *)code;
1025    code += _pcre_OP_lengths[OP_BRA];
1026    }
1027  else
1028    {
1029    code += _pcre_OP_lengths[c];
1030
1031#ifdef SUPPORT_UTF8
1032
1033    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1034    by a multi-byte character. The length in the table is a minimum, so we have
1035    to scan along to skip the extra bytes. All opcodes are less than 128, so we
1036    can use relatively efficient code. */
1037
1038    if (utf8) switch(c)
1039      {
1040      case OP_CHAR:
1041      case OP_CHARNC:
1042      case OP_EXACT:
1043      case OP_UPTO:
1044      case OP_MINUPTO:
1045      case OP_STAR:
1046      case OP_MINSTAR:
1047      case OP_PLUS:
1048      case OP_MINPLUS:
1049      case OP_QUERY:
1050      case OP_MINQUERY:
1051      while ((*code & 0xc0) == 0x80) code++;
1052      break;
1053
1054      /* XCLASS is used for classes that cannot be represented just by a bit
1055      map. This includes negated single high-valued characters. The length in
1056      the table is zero; the actual length is stored in the compiled code. */
1057
1058      case OP_XCLASS:
1059      code += GET(code, 1) + 1;
1060      break;
1061      }
1062#endif
1063    }
1064  }
1065}
1066
1067
1068
1069/*************************************************
1070*   Scan compiled regex for recursion reference  *
1071*************************************************/
1072
1073/* This little function scans through a compiled pattern until it finds an
1074instance of OP_RECURSE.
1075
1076Arguments:
1077  code        points to start of expression
1078  utf8        TRUE in UTF-8 mode
1079
1080Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1081*/
1082
1083static const uschar *
1084find_recurse(const uschar *code, BOOL utf8)
1085{
1086#ifndef SUPPORT_UTF8
1087utf8 = utf8;               /* Stop pedantic compilers complaining */
1088#endif
1089
1090for (;;)
1091  {
1092  register int c = *code;
1093  if (c == OP_END) return NULL;
1094  else if (c == OP_RECURSE) return code;
1095  else if (c > OP_BRA)
1096    {
1097    code += _pcre_OP_lengths[OP_BRA];
1098    }
1099  else
1100    {
1101    code += _pcre_OP_lengths[c];
1102
1103#ifdef SUPPORT_UTF8
1104
1105    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1106    by a multi-byte character. The length in the table is a minimum, so we have
1107    to scan along to skip the extra bytes. All opcodes are less than 128, so we
1108    can use relatively efficient code. */
1109
1110    if (utf8) switch(c)
1111      {
1112      case OP_CHAR:
1113      case OP_CHARNC:
1114      case OP_EXACT:
1115      case OP_UPTO:
1116      case OP_MINUPTO:
1117      case OP_STAR:
1118      case OP_MINSTAR:
1119      case OP_PLUS:
1120      case OP_MINPLUS:
1121      case OP_QUERY:
1122      case OP_MINQUERY:
1123      while ((*code & 0xc0) == 0x80) code++;
1124      break;
1125
1126      /* XCLASS is used for classes that cannot be represented just by a bit
1127      map. This includes negated single high-valued characters. The length in
1128      the table is zero; the actual length is stored in the compiled code. */
1129
1130      case OP_XCLASS:
1131      code += GET(code, 1) + 1;
1132      break;
1133      }
1134#endif
1135    }
1136  }
1137}
1138
1139
1140
1141/*************************************************
1142*    Scan compiled branch for non-emptiness      *
1143*************************************************/
1144
1145/* This function scans through a branch of a compiled pattern to see whether it
1146can match the empty string or not. It is called only from could_be_empty()
1147below. Note that first_significant_code() skips over assertions. If we hit an
1148unclosed bracket, we return "empty" - this means we've struck an inner bracket
1149whose current branch will already have been scanned.
1150
1151Arguments:
1152  code        points to start of search
1153  endcode     points to where to stop
1154  utf8        TRUE if in UTF8 mode
1155
1156Returns:      TRUE if what is matched could be empty
1157*/
1158
1159static BOOL
1160could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1161{
1162register int c;
1163for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1164     code < endcode;
1165     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1166  {
1167  const uschar *ccode;
1168
1169  c = *code;
1170
1171  if (c >= OP_BRA)
1172    {
1173    BOOL empty_branch;
1174    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1175
1176    /* Scan a closed bracket */
1177
1178    empty_branch = FALSE;
1179    do
1180      {
1181      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1182        empty_branch = TRUE;
1183      code += GET(code, 1);
1184      }
1185    while (*code == OP_ALT);
1186    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1187    code += 1 + LINK_SIZE;
1188    c = *code;
1189    }
1190
1191  else switch (c)
1192    {
1193    /* Check for quantifiers after a class */
1194
1195#ifdef SUPPORT_UTF8
1196    case OP_XCLASS:
1197    ccode = code + GET(code, 1);
1198    goto CHECK_CLASS_REPEAT;
1199#endif
1200
1201    case OP_CLASS:
1202    case OP_NCLASS:
1203    ccode = code + 33;
1204
1205#ifdef SUPPORT_UTF8
1206    CHECK_CLASS_REPEAT:
1207#endif
1208
1209    switch (*ccode)
1210      {
1211      case OP_CRSTAR:            /* These could be empty; continue */
1212      case OP_CRMINSTAR:
1213      case OP_CRQUERY:
1214      case OP_CRMINQUERY:
1215      break;
1216
1217      default:                   /* Non-repeat => class must match */
1218      case OP_CRPLUS:            /* These repeats aren't empty */
1219      case OP_CRMINPLUS:
1220      return FALSE;
1221
1222      case OP_CRRANGE:
1223      case OP_CRMINRANGE:
1224      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1225      break;
1226      }
1227    break;
1228
1229    /* Opcodes that must match a character */
1230
1231    case OP_PROP:
1232    case OP_NOTPROP:
1233    case OP_EXTUNI:
1234    case OP_NOT_DIGIT:
1235    case OP_DIGIT:
1236    case OP_NOT_WHITESPACE:
1237    case OP_WHITESPACE:
1238    case OP_NOT_WORDCHAR:
1239    case OP_WORDCHAR:
1240    case OP_ANY:
1241    case OP_ANYBYTE:
1242    case OP_CHAR:
1243    case OP_CHARNC:
1244    case OP_NOT:
1245    case OP_PLUS:
1246    case OP_MINPLUS:
1247    case OP_EXACT:
1248    case OP_NOTPLUS:
1249    case OP_NOTMINPLUS:
1250    case OP_NOTEXACT:
1251    case OP_TYPEPLUS:
1252    case OP_TYPEMINPLUS:
1253    case OP_TYPEEXACT:
1254    return FALSE;
1255
1256    /* End of branch */
1257
1258    case OP_KET:
1259    case OP_KETRMAX:
1260    case OP_KETRMIN:
1261    case OP_ALT:
1262    return TRUE;
1263
1264    /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1265    followed by a multibyte character */
1266
1267#ifdef SUPPORT_UTF8
1268    case OP_STAR:
1269    case OP_MINSTAR:
1270    case OP_QUERY:
1271    case OP_MINQUERY:
1272    case OP_UPTO:
1273    case OP_MINUPTO:
1274    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1275    break;
1276#endif
1277    }
1278  }
1279
1280return TRUE;
1281}
1282
1283
1284
1285/*************************************************
1286*    Scan compiled regex for non-emptiness       *
1287*************************************************/
1288
1289/* This function is called to check for left recursive calls. We want to check
1290the current branch of the current pattern to see if it could match the empty
1291string. If it could, we must look outwards for branches at other levels,
1292stopping when we pass beyond the bracket which is the subject of the recursion.
1293
1294Arguments:
1295  code        points to start of the recursion
1296  endcode     points to where to stop (current RECURSE item)
1297  bcptr       points to the chain of current (unclosed) branch starts
1298  utf8        TRUE if in UTF-8 mode
1299
1300Returns:      TRUE if what is matched could be empty
1301*/
1302
1303static BOOL
1304could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1305  BOOL utf8)
1306{
1307while (bcptr != NULL && bcptr->current >= code)
1308  {
1309  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1310  bcptr = bcptr->outer;
1311  }
1312return TRUE;
1313}
1314
1315
1316
1317/*************************************************
1318*           Check for POSIX class syntax         *
1319*************************************************/
1320
1321/* This function is called when the sequence "[:" or "[." or "[=" is
1322encountered in a character class. It checks whether this is followed by an
1323optional ^ and then a sequence of letters, terminated by a matching ":]" or
1324".]" or "=]".
1325
1326Argument:
1327  ptr      pointer to the initial [
1328  endptr   where to return the end pointer
1329  cd       pointer to compile data
1330
1331Returns:   TRUE or FALSE
1332*/
1333
1334static BOOL
1335check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1336{
1337int terminator;          /* Don't combine these lines; the Solaris cc */
1338terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1339if (*(++ptr) == '^') ptr++;
1340while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1341if (*ptr == terminator && ptr[1] == ']')
1342  {
1343  *endptr = ptr;
1344  return TRUE;
1345  }
1346return FALSE;
1347}
1348
1349
1350
1351
1352/*************************************************
1353*          Check POSIX class name                *
1354*************************************************/
1355
1356/* This function is called to check the name given in a POSIX-style class entry
1357such as [:alnum:].
1358
1359Arguments:
1360  ptr        points to the first letter
1361  len        the length of the name
1362
1363Returns:     a value representing the name, or -1 if unknown
1364*/
1365
1366static int
1367check_posix_name(const uschar *ptr, int len)
1368{
1369register int yield = 0;
1370while (posix_name_lengths[yield] != 0)
1371  {
1372  if (len == posix_name_lengths[yield] &&
1373    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1374  yield++;
1375  }
1376return -1;
1377}
1378
1379
1380/*************************************************
1381*    Adjust OP_RECURSE items in repeated group   *
1382*************************************************/
1383
1384/* OP_RECURSE items contain an offset from the start of the regex to the group
1385that is referenced. This means that groups can be replicated for fixed
1386repetition simply by copying (because the recursion is allowed to refer to
1387earlier groups that are outside the current group). However, when a group is
1388optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1389it, after it has been compiled. This means that any OP_RECURSE items within it
1390that refer to the group itself or any contained groups have to have their
1391offsets adjusted. That is the job of this function. Before it is called, the
1392partially compiled regex must be temporarily terminated with OP_END.
1393
1394Arguments:
1395  group      points to the start of the group
1396  adjust     the amount by which the group is to be moved
1397  utf8       TRUE in UTF-8 mode
1398  cd         contains pointers to tables etc.
1399
1400Returns:     nothing
1401*/
1402
1403static void
1404adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1405{
1406uschar *ptr = group;
1407while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1408  {
1409  int offset = GET(ptr, 1);
1410  if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1411  ptr += 1 + LINK_SIZE;
1412  }
1413}
1414
1415
1416
1417/*************************************************
1418*        Insert an automatic callout point       *
1419*************************************************/
1420
1421/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1422callout points before each pattern item.
1423
1424Arguments:
1425  code           current code pointer
1426  ptr            current pattern pointer
1427  cd             pointers to tables etc
1428
1429Returns:         new code pointer
1430*/
1431
1432static uschar *
1433auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1434{
1435*code++ = OP_CALLOUT;
1436*code++ = 255;
1437PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1438PUT(code, LINK_SIZE, 0);                /* Default length */
1439return code + 2*LINK_SIZE;
1440}
1441
1442
1443
1444/*************************************************
1445*         Complete a callout item                *
1446*************************************************/
1447
1448/* A callout item contains the length of the next item in the pattern, which
1449we can't fill in till after we have reached the relevant point. This is used
1450for both automatic and manual callouts.
1451
1452Arguments:
1453  previous_callout   points to previous callout item
1454  ptr                current pattern pointer
1455  cd                 pointers to tables etc
1456
1457Returns:             nothing
1458*/
1459
1460static void
1461complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1462{
1463int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1464PUT(previous_callout, 2 + LINK_SIZE, length);
1465}
1466
1467
1468
1469#ifdef SUPPORT_UCP
1470/*************************************************
1471*           Get othercase range                  *
1472*************************************************/
1473
1474/* This function is passed the start and end of a class range, in UTF-8 mode
1475with UCP support. It searches up the characters, looking for internal ranges of
1476characters in the "other" case. Each call returns the next one, updating the
1477start address.
1478
1479Arguments:
1480  cptr        points to starting character value; updated
1481  d           end value
1482  ocptr       where to put start of othercase range
1483  odptr       where to put end of othercase range
1484
1485Yield:        TRUE when range returned; FALSE when no more
1486*/
1487
1488static BOOL
1489get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1490{
1491int c, chartype, othercase, next;
1492
1493for (c = *cptr; c <= d; c++)
1494  {
1495  if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1496    break;
1497  }
1498
1499if (c > d) return FALSE;
1500
1501*ocptr = othercase;
1502next = othercase + 1;
1503
1504for (++c; c <= d; c++)
1505  {
1506  if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1507        othercase != next)
1508    break;
1509  next++;
1510  }
1511
1512*odptr = next - 1;
1513*cptr = c;
1514
1515return TRUE;
1516}
1517#endif  /* SUPPORT_UCP */
1518
1519
1520/*************************************************
1521*           Compile one branch                   *
1522*************************************************/
1523
1524/* Scan the pattern, compiling it into the code vector. If the options are
1525changed during the branch, the pointer is used to change the external options
1526bits.
1527
1528Arguments:
1529  optionsptr     pointer to the option bits
1530  brackets       points to number of extracting brackets used
1531  codeptr        points to the pointer to the current code point
1532  ptrptr         points to the current pattern pointer
1533  errorcodeptr   points to error code variable
1534  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1535  reqbyteptr     set to the last literal character required, else < 0
1536  bcptr          points to current branch chain
1537  cd             contains pointers to tables etc.
1538
1539Returns:         TRUE on success
1540                 FALSE, with *errorcodeptr set non-zero on error
1541*/
1542
1543static BOOL
1544compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1545  const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1546  int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1547{
1548int repeat_type, op_type;
1549int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1550int bravalue = 0;
1551int greedy_default, greedy_non_default;
1552int firstbyte, reqbyte;
1553int zeroreqbyte, zerofirstbyte;
1554int req_caseopt, reqvary, tempreqvary;
1555int condcount = 0;
1556int options = *optionsptr;
1557int after_manual_callout = 0;
1558register int c;
1559register uschar *code = *codeptr;
1560uschar *tempcode;
1561BOOL inescq = FALSE;
1562BOOL groupsetfirstbyte = FALSE;
1563const uschar *ptr = *ptrptr;
1564const uschar *tempptr;
1565uschar *previous = NULL;
1566uschar *previous_callout = NULL;
1567uschar classbits[32];
1568
1569#ifdef SUPPORT_UTF8
1570BOOL class_utf8;
1571BOOL utf8 = (options & PCRE_UTF8) != 0;
1572uschar *class_utf8data;
1573uschar utf8_char[6];
1574#else
1575BOOL utf8 = FALSE;
1576#endif
1577
1578/* Set up the default and non-default settings for greediness */
1579
1580greedy_default = ((options & PCRE_UNGREEDY) != 0);
1581greedy_non_default = greedy_default ^ 1;
1582
1583/* Initialize no first byte, no required byte. REQ_UNSET means "no char
1584matching encountered yet". It gets changed to REQ_NONE if we hit something that
1585matches a non-fixed char first char; reqbyte just remains unset if we never
1586find one.
1587
1588When we hit a repeat whose minimum is zero, we may have to adjust these values
1589to take the zero repeat into account. This is implemented by setting them to
1590zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1591item types that can be repeated set these backoff variables appropriately. */
1592
1593firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1594
1595/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1596according to the current setting of the caseless flag. REQ_CASELESS is a bit
1597value > 255. It is added into the firstbyte or reqbyte variables to record the
1598case status of the value. This is used only for ASCII characters. */
1599
1600req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1601
1602/* Switch on next character until the end of the branch */
1603
1604for (;; ptr++)
1605  {
1606  BOOL negate_class;
1607  BOOL possessive_quantifier;
1608  BOOL is_quantifier;
1609  int class_charcount;
1610  int class_lastchar;
1611  int newoptions;
1612  int recno;
1613  int skipbytes;
1614  int subreqbyte;
1615  int subfirstbyte;
1616  int mclength;
1617  uschar mcbuffer[8];
1618
1619  /* Next byte in the pattern */
1620
1621  c = *ptr;
1622
1623  /* If in \Q...\E, check for the end; if not, we have a literal */
1624
1625  if (inescq && c != 0)
1626    {
1627    if (c == '\\' && ptr[1] == 'E')
1628      {
1629      inescq = FALSE;
1630      ptr++;
1631      continue;
1632      }
1633    else
1634      {
1635      if (previous_callout != NULL)
1636        {
1637        complete_callout(previous_callout, ptr, cd);
1638        previous_callout = NULL;
1639        }
1640      if ((options & PCRE_AUTO_CALLOUT) != 0)
1641        {
1642        previous_callout = code;
1643        code = auto_callout(code, ptr, cd);
1644        }
1645      goto NORMAL_CHAR;
1646      }
1647    }
1648
1649  /* Fill in length of a previous callout, except when the next thing is
1650  a quantifier. */
1651
1652  is_quantifier = c == '*' || c == '+' || c == '?' ||
1653    (c == '{' && is_counted_repeat(ptr+1));
1654
1655  if (!is_quantifier && previous_callout != NULL &&
1656       after_manual_callout-- <= 0)
1657    {
1658    complete_callout(previous_callout, ptr, cd);
1659    previous_callout = NULL;
1660    }
1661
1662  /* In extended mode, skip white space and comments */
1663
1664  if ((options & PCRE_EXTENDED) != 0)
1665    {
1666    if ((cd->ctypes[c] & ctype_space) != 0) continue;
1667    if (c == '#')
1668      {
1669      /* The space before the ; is to avoid a warning on a silly compiler
1670      on the Macintosh. */
1671      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1672      if (c != 0) continue;   /* Else fall through to handle end of string */
1673      }
1674    }
1675
1676  /* No auto callout for quantifiers. */
1677
1678  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1679    {
1680    previous_callout = code;
1681    code = auto_callout(code, ptr, cd);
1682    }
1683
1684  switch(c)
1685    {
1686    /* The branch terminates at end of string, |, or ). */
1687
1688    case 0:
1689    case '|':
1690    case ')':
1691    *firstbyteptr = firstbyte;
1692    *reqbyteptr = reqbyte;
1693    *codeptr = code;
1694    *ptrptr = ptr;
1695    return TRUE;
1696
1697    /* Handle single-character metacharacters. In multiline mode, ^ disables
1698    the setting of any following char as a first character. */
1699
1700    case '^':
1701    if ((options & PCRE_MULTILINE) != 0)
1702      {
1703      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1704      }
1705    previous = NULL;
1706    *code++ = OP_CIRC;
1707    break;
1708
1709    case '$':
1710    previous = NULL;
1711    *code++ = OP_DOLL;
1712    break;
1713
1714    /* There can never be a first char if '.' is first, whatever happens about
1715    repeats. The value of reqbyte doesn't change either. */
1716
1717    case '.':
1718    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1719    zerofirstbyte = firstbyte;
1720    zeroreqbyte = reqbyte;
1721    previous = code;
1722    *code++ = OP_ANY;
1723    break;
1724
1725    /* Character classes. If the included characters are all < 255 in value, we
1726    build a 32-byte bitmap of the permitted characters, except in the special
1727    case where there is only one such character. For negated classes, we build
1728    the map as usual, then invert it at the end. However, we use a different
1729    opcode so that data characters > 255 can be handled correctly.
1730
1731    If the class contains characters outside the 0-255 range, a different
1732    opcode is compiled. It may optionally have a bit map for characters < 256,
1733    but those above are explicitly listed afterwards. A flag byte tells
1734    whether the bitmap is present, and whether this is a negated class or not.
1735    */
1736
1737    case '[':
1738    previous = code;
1739
1740    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1741    they are encountered at the top level, so we'll do that too. */
1742
1743    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1744        check_posix_syntax(ptr, &tempptr, cd))
1745      {
1746      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1747      goto FAILED;
1748      }
1749
1750    /* If the first character is '^', set the negation flag and skip it. */
1751
1752    if ((c = *(++ptr)) == '^')
1753      {
1754      negate_class = TRUE;
1755      c = *(++ptr);
1756      }
1757    else
1758      {
1759      negate_class = FALSE;
1760      }
1761
1762    /* Keep a count of chars with values < 256 so that we can optimize the case
1763    of just a single character (as long as it's < 256). For higher valued UTF-8
1764    characters, we don't yet do any optimization. */
1765
1766    class_charcount = 0;
1767    class_lastchar = -1;
1768
1769#ifdef SUPPORT_UTF8
1770    class_utf8 = FALSE;                       /* No chars >= 256 */
1771    class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1772#endif
1773
1774    /* Initialize the 32-char bit map to all zeros. We have to build the
1775    map in a temporary bit of store, in case the class contains only 1
1776    character (< 256), because in that case the compiled code doesn't use the
1777    bit map. */
1778
1779    memset(classbits, 0, 32 * sizeof(uschar));
1780
1781    /* Process characters until ] is reached. By writing this as a "do" it
1782    means that an initial ] is taken as a data character. The first pass
1783    through the regex checked the overall syntax, so we don't need to be very
1784    strict here. At the start of the loop, c contains the first byte of the
1785    character. */
1786
1787    do
1788      {
1789#ifdef SUPPORT_UTF8
1790      if (utf8 && c > 127)
1791        {                           /* Braces are required because the */
1792        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
1793        }
1794#endif
1795
1796      /* Inside \Q...\E everything is literal except \E */
1797
1798      if (inescq)
1799        {
1800        if (c == '\\' && ptr[1] == 'E')
1801          {
1802          inescq = FALSE;
1803          ptr++;
1804          continue;
1805          }
1806        else goto LONE_SINGLE_CHARACTER;
1807        }
1808
1809      /* Handle POSIX class names. Perl allows a negation extension of the
1810      form [:^name:]. A square bracket that doesn't match the syntax is
1811      treated as a literal. We also recognize the POSIX constructions
1812      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1813      5.6 and 5.8 do. */
1814
1815      if (c == '[' &&
1816          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1817          check_posix_syntax(ptr, &tempptr, cd))
1818        {
1819        BOOL local_negate = FALSE;
1820        int posix_class, i;
1821        register const uschar *cbits = cd->cbits;
1822
1823        if (ptr[1] != ':')
1824          {
1825          *errorcodeptr = ERR31;
1826          goto FAILED;
1827          }
1828
1829        ptr += 2;
1830        if (*ptr == '^')
1831          {
1832          local_negate = TRUE;
1833          ptr++;
1834          }
1835
1836        posix_class = check_posix_name(ptr, tempptr - ptr);
1837        if (posix_class < 0)
1838          {
1839          *errorcodeptr = ERR30;
1840          goto FAILED;
1841          }
1842
1843        /* If matching is caseless, upper and lower are converted to
1844        alpha. This relies on the fact that the class table starts with
1845        alpha, lower, upper as the first 3 entries. */
1846
1847        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1848          posix_class = 0;
1849
1850        /* Or into the map we are building up to 3 of the static class
1851        tables, or their negations. The [:blank:] class sets up the same
1852        chars as the [:space:] class (all white space). We remove the vertical
1853        white space chars afterwards. */
1854
1855        posix_class *= 3;
1856        for (i = 0; i < 3; i++)
1857          {
1858          BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1859          int taboffset = posix_class_maps[posix_class + i];
1860          if (taboffset < 0) break;
1861          if (local_negate)
1862            {
1863            if (i == 0)
1864              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1865            else
1866              for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1867            if (blankclass) classbits[1] |= 0x3c;
1868            }
1869          else
1870            {
1871            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1872            if (blankclass) classbits[1] &= ~0x3c;
1873            }
1874          }
1875
1876        ptr = tempptr + 1;
1877        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1878        continue;    /* End of POSIX syntax handling */
1879        }
1880
1881      /* Backslash may introduce a single character, or it may introduce one
1882      of the specials, which just set a flag. Escaped items are checked for
1883      validity in the pre-compiling pass. The sequence \b is a special case.
1884      Inside a class (and only there) it is treated as backspace. Elsewhere
1885      it marks a word boundary. Other escapes have preset maps ready to
1886      or into the one we are building. We assume they have more than one
1887      character in them, so set class_charcount bigger than one. */
1888
1889      if (c == '\\')
1890        {
1891        c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1892
1893        if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
1894        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
1895        else if (-c == ESC_Q)            /* Handle start of quoted string */
1896          {
1897          if (ptr[1] == '\\' && ptr[2] == 'E')
1898            {
1899            ptr += 2; /* avoid empty string */
1900            }
1901          else inescq = TRUE;
1902          continue;
1903          }
1904
1905        if (c < 0)
1906          {
1907          register const uschar *cbits = cd->cbits;
1908          class_charcount += 2;     /* Greater than 1 is what matters */
1909          switch (-c)
1910            {
1911            case ESC_d:
1912            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1913            continue;
1914
1915            case ESC_D:
1916            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1917            continue;
1918
1919            case ESC_w:
1920            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1921            continue;
1922
1923            case ESC_W:
1924            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1925            continue;
1926
1927            case ESC_s:
1928            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1929            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1930            continue;
1931
1932            case ESC_S:
1933            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1934            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1935            continue;
1936
1937#ifdef SUPPORT_UCP
1938            case ESC_p:
1939            case ESC_P:
1940              {
1941              BOOL negated;
1942              int property = get_ucp(&ptr, &negated, errorcodeptr);
1943              if (property < 0) goto FAILED;
1944              class_utf8 = TRUE;
1945              *class_utf8data++ = ((-c == ESC_p) != negated)?
1946                XCL_PROP : XCL_NOTPROP;
1947              *class_utf8data++ = property;
1948              class_charcount -= 2;   /* Not a < 256 character */
1949              }
1950            continue;
1951#endif
1952
1953            /* Unrecognized escapes are faulted if PCRE is running in its
1954            strict mode. By default, for compatibility with Perl, they are
1955            treated as literals. */
1956
1957            default:
1958            if ((options & PCRE_EXTRA) != 0)
1959              {
1960              *errorcodeptr = ERR7;
1961              goto FAILED;
1962              }
1963            c = *ptr;              /* The final character */
1964            class_charcount -= 2;  /* Undo the default count from above */
1965            }
1966          }
1967
1968        /* Fall through if we have a single character (c >= 0). This may be
1969        > 256 in UTF-8 mode. */
1970
1971        }   /* End of backslash handling */
1972
1973      /* A single character may be followed by '-' to form a range. However,
1974      Perl does not permit ']' to be the end of the range. A '-' character
1975      here is treated as a literal. */
1976
1977      if (ptr[1] == '-' && ptr[2] != ']')
1978        {
1979        int d;
1980        ptr += 2;
1981
1982#ifdef SUPPORT_UTF8
1983        if (utf8)
1984          {                           /* Braces are required because the */
1985          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1986          }
1987        else
1988#endif
1989        d = *ptr;  /* Not UTF-8 mode */
1990
1991        /* The second part of a range can be a single-character escape, but
1992        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1993        in such circumstances. */
1994
1995        if (d == '\\')
1996          {
1997          const uschar *oldptr = ptr;
1998          d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1999
2000          /* \b is backspace; \X is literal X; any other special means the '-'
2001          was literal */
2002
2003          if (d < 0)
2004            {
2005            if (d == -ESC_b) d = '\b';
2006            else if (d == -ESC_X) d = 'X'; else
2007              {
2008              ptr = oldptr - 2;
2009              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2010              }
2011            }
2012          }
2013
2014        /* The check that the two values are in the correct order happens in
2015        the pre-pass. Optimize one-character ranges */
2016
2017        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2018
2019        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2020        matching, we have to use an XCLASS with extra data items. Caseless
2021        matching for characters > 127 is available only if UCP support is
2022        available. */
2023
2024#ifdef SUPPORT_UTF8
2025        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2026          {
2027          class_utf8 = TRUE;
2028
2029          /* With UCP support, we can find the other case equivalents of
2030          the relevant characters. There may be several ranges. Optimize how
2031          they fit with the basic range. */
2032
2033#ifdef SUPPORT_UCP
2034          if ((options & PCRE_CASELESS) != 0)
2035            {
2036            int occ, ocd;
2037            int cc = c;
2038            int origd = d;
2039            while (get_othercase_range(&cc, origd, &occ, &ocd))
2040              {
2041              if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2042
2043              if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2044                {                                  /* if there is overlap,   */
2045                c = occ;                           /* noting that if occ < c */
2046                continue;                          /* we can't have ocd > d  */
2047                }                                  /* because a subrange is  */
2048              if (ocd > d && occ <= d + 1)         /* always shorter than    */
2049                {                                  /* the basic range.       */
2050                d = ocd;
2051                continue;
2052                }
2053
2054              if (occ == ocd)
2055                {
2056                *class_utf8data++ = XCL_SINGLE;
2057                }
2058              else
2059                {
2060                *class_utf8data++ = XCL_RANGE;
2061                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2062                }
2063              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2064              }
2065            }
2066#endif  /* SUPPORT_UCP */
2067
2068          /* Now record the original range, possibly modified for UCP caseless
2069          overlapping ranges. */
2070
2071          *class_utf8data++ = XCL_RANGE;
2072          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2073          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2074
2075          /* With UCP support, we are done. Without UCP support, there is no
2076          caseless matching for UTF-8 characters > 127; we can use the bit map
2077          for the smaller ones. */
2078
2079#ifdef SUPPORT_UCP
2080          continue;    /* With next character in the class */
2081#else
2082          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2083
2084          /* Adjust upper limit and fall through to set up the map */
2085
2086          d = 127;
2087
2088#endif  /* SUPPORT_UCP */
2089          }
2090#endif  /* SUPPORT_UTF8 */
2091
2092        /* We use the bit map for all cases when not in UTF-8 mode; else
2093        ranges that lie entirely within 0-127 when there is UCP support; else
2094        for partial ranges without UCP support. */
2095
2096        for (; c <= d; c++)
2097          {
2098          classbits[c/8] |= (1 << (c&7));
2099          if ((options & PCRE_CASELESS) != 0)
2100            {
2101            int uc = cd->fcc[c];           /* flip case */
2102            classbits[uc/8] |= (1 << (uc&7));
2103            }
2104          class_charcount++;                /* in case a one-char range */
2105          class_lastchar = c;
2106          }
2107
2108        continue;   /* Go get the next char in the class */
2109        }
2110
2111      /* Handle a lone single character - we can get here for a normal
2112      non-escape char, or after \ that introduces a single character or for an
2113      apparent range that isn't. */
2114
2115      LONE_SINGLE_CHARACTER:
2116
2117      /* Handle a character that cannot go in the bit map */
2118
2119#ifdef SUPPORT_UTF8
2120      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2121        {
2122        class_utf8 = TRUE;
2123        *class_utf8data++ = XCL_SINGLE;
2124        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2125
2126#ifdef SUPPORT_UCP
2127        if ((options & PCRE_CASELESS) != 0)
2128          {
2129          int chartype;
2130          int othercase;
2131          if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2132               othercase > 0)
2133            {
2134            *class_utf8data++ = XCL_SINGLE;
2135            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2136            }
2137          }
2138#endif  /* SUPPORT_UCP */
2139
2140        }
2141      else
2142#endif  /* SUPPORT_UTF8 */
2143
2144      /* Handle a single-byte character */
2145        {
2146        classbits[c/8] |= (1 << (c&7));
2147        if ((options & PCRE_CASELESS) != 0)
2148          {
2149          c = cd->fcc[c];   /* flip case */
2150          classbits[c/8] |= (1 << (c&7));
2151          }
2152        class_charcount++;
2153        class_lastchar = c;
2154        }
2155      }
2156
2157    /* Loop until ']' reached; the check for end of string happens inside the
2158    loop. This "while" is the end of the "do" above. */
2159
2160    while ((c = *(++ptr)) != ']' || inescq);
2161
2162    /* If class_charcount is 1, we saw precisely one character whose value is
2163    less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2164    can optimize the negative case only if there were no characters >= 128
2165    because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2166    single-bytes only. This is an historical hangover. Maybe one day we can
2167    tidy these opcodes to handle multi-byte characters.
2168
2169    The optimization throws away the bit map. We turn the item into a
2170    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2171    that OP_NOT does not support multibyte characters. In the positive case, it
2172    can cause firstbyte to be set. Otherwise, there can be no first char if
2173    this item is first, whatever repeat count may follow. In the case of
2174    reqbyte, save the previous value for reinstating. */
2175
2176#ifdef SUPPORT_UTF8
2177    if (class_charcount == 1 &&
2178          (!utf8 ||
2179          (!class_utf8 && (!negate_class || class_lastchar < 128))))
2180
2181#else
2182    if (class_charcount == 1)
2183#endif
2184      {
2185      zeroreqbyte = reqbyte;
2186
2187      /* The OP_NOT opcode works on one-byte characters only. */
2188
2189      if (negate_class)
2190        {
2191        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2192        zerofirstbyte = firstbyte;
2193        *code++ = OP_NOT;
2194        *code++ = class_lastchar;
2195        break;
2196        }
2197
2198      /* For a single, positive character, get the value into mcbuffer, and
2199      then we can handle this with the normal one-character code. */
2200
2201#ifdef SUPPORT_UTF8
2202      if (utf8 && class_lastchar > 127)
2203        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2204      else
2205#endif
2206        {
2207        mcbuffer[0] = class_lastchar;
2208        mclength = 1;
2209        }
2210      goto ONE_CHAR;
2211      }       /* End of 1-char optimization */
2212
2213    /* The general case - not the one-char optimization. If this is the first
2214    thing in the branch, there can be no first char setting, whatever the
2215    repeat count. Any reqbyte setting must remain unchanged after any kind of
2216    repeat. */
2217
2218    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2219    zerofirstbyte = firstbyte;
2220    zeroreqbyte = reqbyte;
2221
2222    /* If there are characters with values > 255, we have to compile an
2223    extended class, with its own opcode. If there are no characters < 256,
2224    we can omit the bitmap. */
2225
2226#ifdef SUPPORT_UTF8
2227    if (class_utf8)
2228      {
2229      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2230      *code++ = OP_XCLASS;
2231      code += LINK_SIZE;
2232      *code = negate_class? XCL_NOT : 0;
2233
2234      /* If the map is required, install it, and move on to the end of
2235      the extra data */
2236
2237      if (class_charcount > 0)
2238        {
2239        *code++ |= XCL_MAP;
2240        memcpy(code, classbits, 32);
2241        code = class_utf8data;
2242        }
2243
2244      /* If the map is not required, slide down the extra data. */
2245
2246      else
2247        {
2248        int len = class_utf8data - (code + 33);
2249        memmove(code + 1, code + 33, len);
2250        code += len + 1;
2251        }
2252
2253      /* Now fill in the complete length of the item */
2254
2255      PUT(previous, 1, code - previous);
2256      break;   /* End of class handling */
2257      }
2258#endif
2259
2260    /* If there are no characters > 255, negate the 32-byte map if necessary,
2261    and copy it into the code vector. If this is the first thing in the branch,
2262    there can be no first char setting, whatever the repeat count. Any reqbyte
2263    setting must remain unchanged after any kind of repeat. */
2264
2265    if (negate_class)
2266      {
2267      *code++ = OP_NCLASS;
2268      for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2269      }
2270    else
2271      {
2272      *code++ = OP_CLASS;
2273      memcpy(code, classbits, 32);
2274      }
2275    code += 32;
2276    break;
2277
2278    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2279    has been tested above. */
2280
2281    case '{':
2282    if (!is_quantifier) goto NORMAL_CHAR;
2283    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2284    if (*errorcodeptr != 0) goto FAILED;
2285    goto REPEAT;
2286
2287    case '*':
2288    repeat_min = 0;
2289    repeat_max = -1;
2290    goto REPEAT;
2291
2292    case '+':
2293    repeat_min = 1;
2294    repeat_max = -1;
2295    goto REPEAT;
2296
2297    case '?':
2298    repeat_min = 0;
2299    repeat_max = 1;
2300
2301    REPEAT:
2302    if (previous == NULL)
2303      {
2304      *errorcodeptr = ERR9;
2305      goto FAILED;
2306      }
2307
2308    if (repeat_min == 0)
2309      {
2310      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2311      reqbyte = zeroreqbyte;        /* Ditto */
2312      }
2313
2314    /* Remember whether this is a variable length repeat */
2315
2316    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2317
2318    op_type = 0;                    /* Default single-char op codes */
2319    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2320
2321    /* Save start of previous item, in case we have to move it up to make space
2322    for an inserted OP_ONCE for the additional '+' extension. */
2323
2324    tempcode = previous;
2325
2326    /* If the next character is '+', we have a possessive quantifier. This
2327    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2328    If the next character is '?' this is a minimizing repeat, by default,
2329    but if PCRE_UNGREEDY is set, it works the other way round. We change the
2330    repeat type to the non-default. */
2331
2332    if (ptr[1] == '+')
2333      {
2334      repeat_type = 0;                  /* Force greedy */
2335      possessive_quantifier = TRUE;
2336      ptr++;
2337      }
2338    else if (ptr[1] == '?')
2339      {
2340      repeat_type = greedy_non_default;
2341      ptr++;
2342      }
2343    else repeat_type = greedy_default;
2344
2345    /* If previous was a recursion, we need to wrap it inside brackets so that
2346    it can be replicated if necessary. */
2347
2348    if (*previous == OP_RECURSE)
2349      {
2350      memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2351      code += 1 + LINK_SIZE;
2352      *previous = OP_BRA;
2353      PUT(previous, 1, code - previous);
2354      *code = OP_KET;
2355      PUT(code, 1, code - previous);
2356      code += 1 + LINK_SIZE;
2357      }
2358
2359    /* If previous was a character match, abolish the item and generate a
2360    repeat item instead. If a char item has a minimum of more than one, ensure
2361    that it is set in reqbyte - it might not be if a sequence such as x{3} is
2362    the first thing in a branch because the x will have gone into firstbyte
2363    instead.  */
2364
2365    if (*previous == OP_CHAR || *previous == OP_CHARNC)
2366      {
2367      /* Deal with UTF-8 characters that take up more than one byte. It's
2368      easier to write this out separately than try to macrify it. Use c to
2369      hold the length of the character in bytes, plus 0x80 to flag that it's a
2370      length rather than a small character. */
2371
2372#ifdef SUPPORT_UTF8
2373      if (utf8 && (code[-1] & 0x80) != 0)
2374        {
2375        uschar *lastchar = code - 1;
2376        while((*lastchar & 0xc0) == 0x80) lastchar--;
2377        c = code - lastchar;            /* Length of UTF-8 character */
2378        memcpy(utf8_char, lastchar, c); /* Save the char */
2379        c |= 0x80;                      /* Flag c as a length */
2380        }
2381      else
2382#endif
2383
2384      /* Handle the case of a single byte - either with no UTF8 support, or
2385      with UTF-8 disabled, or for a UTF-8 character < 128. */
2386
2387        {
2388        c = code[-1];
2389        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2390        }
2391
2392      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2393      }
2394
2395    /* If previous was a single negated character ([^a] or similar), we use
2396    one of the special opcodes, replacing it. The code is shared with single-
2397    character repeats by setting op_type to add a suitable offset into
2398    repeat_type. OP_NOT is currently used only for single-byte chars. */
2399
2400    else if (*previous == OP_NOT)
2401      {
2402      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2403      c = previous[1];
2404      goto OUTPUT_SINGLE_REPEAT;
2405      }
2406
2407    /* If previous was a character type match (\d or similar), abolish it and
2408    create a suitable repeat item. The code is shared with single-character
2409    repeats by setting op_type to add a suitable offset into repeat_type. Note
2410    that the Unicode property types will be present only when SUPPORT_UCP is
2411    defined, but we don't wrap the little bits of code here because it just
2412    makes it horribly messy. */
2413
2414    else if (*previous < OP_EODN)
2415      {
2416      uschar *oldcode;
2417      int prop_type;
2418      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2419      c = *previous;
2420
2421      OUTPUT_SINGLE_REPEAT:
2422      prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2423        previous[1] : -1;
2424
2425      oldcode = code;
2426      code = previous;                  /* Usually overwrite previous item */
2427
2428      /* If the maximum is zero then the minimum must also be zero; Perl allows
2429      this case, so we do too - by simply omitting the item altogether. */
2430
2431      if (repeat_max == 0) goto END_REPEAT;
2432
2433      /* All real repeats make it impossible to handle partial matching (maybe
2434      one day we will be able to remove this restriction). */
2435
2436      if (repeat_max != 1) cd->nopartial = TRUE;
2437
2438      /* Combine the op_type with the repeat_type */
2439
2440      repeat_type += op_type;
2441
2442      /* A minimum of zero is handled either as the special case * or ?, or as
2443      an UPTO, with the maximum given. */
2444
2445      if (repeat_min == 0)
2446        {
2447        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2448          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2449        else
2450          {
2451          *code++ = OP_UPTO + repeat_type;
2452          PUT2INC(code, 0, repeat_max);
2453          }
2454        }
2455
2456      /* A repeat minimum of 1 is optimized into some special cases. If the
2457      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2458      left in place and, if the maximum is greater than 1, we use OP_UPTO with
2459      one less than the maximum. */
2460
2461      else if (repeat_min == 1)
2462        {
2463        if (repeat_max == -1)
2464          *code++ = OP_PLUS + repeat_type;
2465        else
2466          {
2467          code = oldcode;                 /* leave previous item in place */
2468          if (repeat_max == 1) goto END_REPEAT;
2469          *code++ = OP_UPTO + repeat_type;
2470          PUT2INC(code, 0, repeat_max - 1);
2471          }
2472        }
2473
2474      /* The case {n,n} is just an EXACT, while the general case {n,m} is
2475      handled as an EXACT followed by an UPTO. */
2476
2477      else
2478        {
2479        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2480        PUT2INC(code, 0, repeat_min);
2481
2482        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2483        we have to insert the character for the previous code. For a repeated
2484        Unicode property match, there is an extra byte that defines the
2485        required property. In UTF-8 mode, long characters have their length in
2486        c, with the 0x80 bit as a flag. */
2487
2488        if (repeat_max < 0)
2489          {
2490#ifdef SUPPORT_UTF8
2491          if (utf8 && c >= 128)
2492            {
2493            memcpy(code, utf8_char, c & 7);
2494            code += c & 7;
2495            }
2496          else
2497#endif
2498            {
2499            *code++ = c;
2500            if (prop_type >= 0) *code++ = prop_type;
2501            }
2502          *code++ = OP_STAR + repeat_type;
2503          }
2504
2505        /* Else insert an UPTO if the max is greater than the min, again
2506        preceded by the character, for the previously inserted code. */
2507
2508        else if (repeat_max != repeat_min)
2509          {
2510#ifdef SUPPORT_UTF8
2511          if (utf8 && c >= 128)
2512            {
2513            memcpy(code, utf8_char, c & 7);
2514            code += c & 7;
2515            }
2516          else
2517#endif
2518          *code++ = c;
2519          if (prop_type >= 0) *code++ = prop_type;
2520          repeat_max -= repeat_min;
2521          *code++ = OP_UPTO + repeat_type;
2522          PUT2INC(code, 0, repeat_max);
2523          }
2524        }
2525
2526      /* The character or character type itself comes last in all cases. */
2527
2528#ifdef SUPPORT_UTF8
2529      if (utf8 && c >= 128)
2530        {
2531        memcpy(code, utf8_char, c & 7);
2532        code += c & 7;
2533        }
2534      else
2535#endif
2536      *code++ = c;
2537
2538      /* For a repeated Unicode property match, there is an extra byte that
2539      defines the required property. */
2540
2541#ifdef SUPPORT_UCP
2542      if (prop_type >= 0) *code++ = prop_type;
2543#endif
2544      }
2545
2546    /* If previous was a character class or a back reference, we put the repeat
2547    stuff after it, but just skip the item if the repeat was {0,0}. */
2548
2549    else if (*previous == OP_CLASS ||
2550             *previous == OP_NCLASS ||
2551#ifdef SUPPORT_UTF8
2552             *previous == OP_XCLASS ||
2553#endif
2554             *previous == OP_REF)
2555      {
2556      if (repeat_max == 0)
2557        {
2558        code = previous;
2559        goto END_REPEAT;
2560        }
2561
2562      /* All real repeats make it impossible to handle partial matching (maybe
2563      one day we will be able to remove this restriction). */
2564
2565      if (repeat_max != 1) cd->nopartial = TRUE;
2566
2567      if (repeat_min == 0 && repeat_max == -1)
2568        *code++ = OP_CRSTAR + repeat_type;
2569      else if (repeat_min == 1 && repeat_max == -1)
2570        *code++ = OP_CRPLUS + repeat_type;
2571      else if (repeat_min == 0 && repeat_max == 1)
2572        *code++ = OP_CRQUERY + repeat_type;
2573      else
2574        {
2575        *code++ = OP_CRRANGE + repeat_type;
2576        PUT2INC(code, 0, repeat_min);
2577        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2578        PUT2INC(code, 0, repeat_max);
2579        }
2580      }
2581
2582    /* If previous was a bracket group, we may have to replicate it in certain
2583    cases. */
2584
2585    else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2586             *previous == OP_COND)
2587      {
2588      register int i;
2589      int ketoffset = 0;
2590      int len = code - previous;
2591      uschar *bralink = NULL;
2592
2593      /* If the maximum repeat count is unlimited, find the end of the bracket
2594      by scanning through from the start, and compute the offset back to it
2595      from the current code pointer. There may be an OP_OPT setting following
2596      the final KET, so we can't find the end just by going back from the code
2597      pointer. */
2598
2599      if (repeat_max == -1)
2600        {
2601        register uschar *ket = previous;
2602        do ket += GET(ket, 1); while (*ket != OP_KET);
2603        ketoffset = code - ket;
2604        }
2605
2606      /* The case of a zero minimum is special because of the need to stick
2607      OP_BRAZERO in front of it, and because the group appears once in the
2608      data, whereas in other cases it appears the minimum number of times. For
2609      this reason, it is simplest to treat this case separately, as otherwise
2610      the code gets far too messy. There are several special subcases when the
2611      minimum is zero. */
2612
2613      if (repeat_min == 0)
2614        {
2615        /* If the maximum is also zero, we just omit the group from the output
2616        altogether. */
2617
2618        if (repeat_max == 0)
2619          {
2620          code = previous;
2621          goto END_REPEAT;
2622          }
2623
2624        /* If the maximum is 1 or unlimited, we just have to stick in the
2625        BRAZERO and do no more at this point. However, we do need to adjust
2626        any OP_RECURSE calls inside the group that refer to the group itself or
2627        any internal group, because the offset is from the start of the whole
2628        regex. Temporarily terminate the pattern while doing this. */
2629
2630        if (repeat_max <= 1)
2631          {
2632          *code = OP_END;
2633          adjust_recurse(previous, 1, utf8, cd);
2634          memmove(previous+1, previous, len);
2635          code++;
2636          *previous++ = OP_BRAZERO + repeat_type;
2637          }
2638
2639        /* If the maximum is greater than 1 and limited, we have to replicate
2640        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2641        The first one has to be handled carefully because it's the original
2642        copy, which has to be moved up. The remainder can be handled by code
2643        that is common with the non-zero minimum case below. We have to
2644        adjust the value of repeat_max, since one less copy is required. Once
2645        again, we may have to adjust any OP_RECURSE calls inside the group. */
2646
2647        else
2648          {
2649          int offset;
2650          *code = OP_END;
2651          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2652          memmove(previous + 2 + LINK_SIZE, previous, len);
2653          code += 2 + LINK_SIZE;
2654          *previous++ = OP_BRAZERO + repeat_type;
2655          *previous++ = OP_BRA;
2656
2657          /* We chain together the bracket offset fields that have to be
2658          filled in later when the ends of the brackets are reached. */
2659
2660          offset = (bralink == NULL)? 0 : previous - bralink;
2661          bralink = previous;
2662          PUTINC(previous, 0, offset);
2663          }
2664
2665        repeat_max--;
2666        }
2667
2668      /* If the minimum is greater than zero, replicate the group as many
2669      times as necessary, and adjust the maximum to the number of subsequent
2670      copies that we need. If we set a first char from the group, and didn't
2671      set a required char, copy the latter from the former. */
2672
2673      else
2674        {
2675        if (repeat_min > 1)
2676          {
2677          if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2678          for (i = 1; i < repeat_min; i++)
2679            {
2680            memcpy(code, previous, len);
2681            code += len;
2682            }
2683          }
2684        if (repeat_max > 0) repeat_max -= repeat_min;
2685        }
2686
2687      /* This code is common to both the zero and non-zero minimum cases. If
2688      the maximum is limited, it replicates the group in a nested fashion,
2689      remembering the bracket starts on a stack. In the case of a zero minimum,
2690      the first one was set up above. In all cases the repeat_max now specifies
2691      the number of additional copies needed. */
2692
2693      if (repeat_max >= 0)
2694        {
2695        for (i = repeat_max - 1; i >= 0; i--)
2696          {
2697          *code++ = OP_BRAZERO + repeat_type;
2698
2699          /* All but the final copy start a new nesting, maintaining the
2700          chain of brackets outstanding. */
2701
2702          if (i != 0)
2703            {
2704            int offset;
2705            *code++ = OP_BRA;
2706            offset = (bralink == NULL)? 0 : code - bralink;
2707            bralink = code;
2708            PUTINC(code, 0, offset);
2709            }
2710
2711          memcpy(code, previous, len);
2712          code += len;
2713          }
2714
2715        /* Now chain through the pending brackets, and fill in their length
2716        fields (which are holding the chain links pro tem). */
2717
2718        while (bralink != NULL)
2719          {
2720          int oldlinkoffset;
2721          int offset = code - bralink + 1;
2722          uschar *bra = code - offset;
2723          oldlinkoffset = GET(bra, 1);
2724          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2725          *code++ = OP_KET;
2726          PUTINC(code, 0, offset);
2727          PUT(bra, 1, offset);
2728          }
2729        }
2730
2731      /* If the maximum is unlimited, set a repeater in the final copy. We
2732      can't just offset backwards from the current code point, because we
2733      don't know if there's been an options resetting after the ket. The
2734      correct offset was computed above. */
2735
2736      else code[-ketoffset] = OP_KETRMAX + repeat_type;
2737      }
2738
2739    /* Else there's some kind of shambles */
2740
2741    else
2742      {
2743      *errorcodeptr = ERR11;
2744      goto FAILED;
2745      }
2746
2747    /* If the character following a repeat is '+', we wrap the entire repeated
2748    item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2749    Sun's Java package. The repeated item starts at tempcode, not at previous,
2750    which might be the first part of a string whose (former) last char we
2751    repeated. However, we don't support '+' after a greediness '?'. */
2752
2753    if (possessive_quantifier)
2754      {
2755      int len = code - tempcode;
2756      memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2757      code += 1 + LINK_SIZE;
2758      len += 1 + LINK_SIZE;
2759      tempcode[0] = OP_ONCE;
2760      *code++ = OP_KET;
2761      PUTINC(code, 0, len);
2762      PUT(tempcode, 1, len);
2763      }
2764
2765    /* In all cases we no longer have a previous item. We also set the
2766    "follows varying string" flag for subsequently encountered reqbytes if
2767    it isn't already set and we have just passed a varying length item. */
2768
2769    END_REPEAT:
2770    previous = NULL;
2771    cd->req_varyopt |= reqvary;
2772    break;
2773
2774
2775    /* Start of nested bracket sub-expression, or comment or lookahead or
2776    lookbehind or option setting or condition. First deal with special things
2777    that can come after a bracket; all are introduced by ?, and the appearance
2778    of any of them means that this is not a referencing group. They were
2779    checked for validity in the first pass over the string, so we don't have to
2780    check for syntax errors here.  */
2781
2782    case '(':
2783    newoptions = options;
2784    skipbytes = 0;
2785
2786    if (*(++ptr) == '?')
2787      {
2788      int set, unset;
2789      int *optset;
2790
2791      switch (*(++ptr))
2792        {
2793        case '#':                 /* Comment; skip to ket */
2794        ptr++;
2795        while (*ptr != ')') ptr++;
2796        continue;
2797
2798        case ':':                 /* Non-extracting bracket */
2799        bravalue = OP_BRA;
2800        ptr++;
2801        break;
2802
2803        case '(':
2804        bravalue = OP_COND;       /* Conditional group */
2805
2806        /* Condition to test for recursion */
2807
2808        if (ptr[1] == 'R')
2809          {
2810          code[1+LINK_SIZE] = OP_CREF;
2811          PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2812          skipbytes = 3;
2813          ptr += 3;
2814          }
2815
2816        /* Condition to test for a numbered subpattern match. We know that
2817        if a digit follows ( then there will just be digits until ) because
2818        the syntax was checked in the first pass. */
2819
2820        else if ((digitab[ptr[1]] && ctype_digit) != 0)
2821          {
2822          int condref;                 /* Don't amalgamate; some compilers */
2823          condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
2824          while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2825          if (condref == 0)
2826            {
2827            *errorcodeptr = ERR35;
2828            goto FAILED;
2829            }
2830          ptr++;
2831          code[1+LINK_SIZE] = OP_CREF;
2832          PUT2(code, 2+LINK_SIZE, condref);
2833          skipbytes = 3;
2834          }
2835        /* For conditions that are assertions, we just fall through, having
2836        set bravalue above. */
2837        break;
2838
2839        case '=':                 /* Positive lookahead */
2840        bravalue = OP_ASSERT;
2841        ptr++;
2842        break;
2843
2844        case '!':                 /* Negative lookahead */
2845        bravalue = OP_ASSERT_NOT;
2846        ptr++;
2847        break;
2848
2849        case '<':                 /* Lookbehinds */
2850        switch (*(++ptr))
2851          {
2852          case '=':               /* Positive lookbehind */
2853          bravalue = OP_ASSERTBACK;
2854          ptr++;
2855          break;
2856
2857          case '!':               /* Negative lookbehind */
2858          bravalue = OP_ASSERTBACK_NOT;
2859          ptr++;
2860          break;
2861          }
2862        break;
2863
2864        case '>':                 /* One-time brackets */
2865        bravalue = OP_ONCE;
2866        ptr++;
2867        break;
2868
2869        case 'C':                 /* Callout - may be followed by digits; */
2870        previous_callout = code;  /* Save for later completion */
2871        after_manual_callout = 1; /* Skip one item before completing */
2872        *code++ = OP_CALLOUT;     /* Already checked that the terminating */
2873          {                       /* closing parenthesis is present. */
2874          int n = 0;
2875          while ((digitab[*(++ptr)] & ctype_digit) != 0)
2876            n = n * 10 + *ptr - '0';
2877          if (n > 255)
2878            {
2879            *errorcodeptr = ERR38;
2880            goto FAILED;
2881            }
2882          *code++ = n;
2883          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
2884          PUT(code, LINK_SIZE, 0);                    /* Default length */
2885          code += 2 * LINK_SIZE;
2886          }
2887        previous = NULL;
2888        continue;
2889
2890        case 'P':                 /* Named subpattern handling */
2891        if (*(++ptr) == '<')      /* Definition */
2892          {
2893          int i, namelen;
2894          uschar *slot = cd->name_table;
2895          const uschar *name;     /* Don't amalgamate; some compilers */
2896          name = ++ptr;           /* grumble at autoincrement in declaration */
2897
2898          while (*ptr++ != '>');
2899          namelen = ptr - name - 1;
2900
2901          for (i = 0; i < cd->names_found; i++)
2902            {
2903            int crc = memcmp(name, slot+2, namelen);
2904            if (crc == 0)
2905              {
2906              if (slot[2+namelen] == 0)
2907                {
2908                *errorcodeptr = ERR43;
2909                goto FAILED;
2910                }
2911              crc = -1;             /* Current name is substring */
2912              }
2913            if (crc < 0)
2914              {
2915              memmove(slot + cd->name_entry_size, slot,
2916                (cd->names_found - i) * cd->name_entry_size);
2917              break;
2918              }
2919            slot += cd->name_entry_size;
2920            }
2921
2922          PUT2(slot, 0, *brackets + 1);
2923          memcpy(slot + 2, name, namelen);
2924          slot[2+namelen] = 0;
2925          cd->names_found++;
2926          goto NUMBERED_GROUP;
2927          }
2928
2929        if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2930          {
2931          int i, namelen;
2932          int type = *ptr++;
2933          const uschar *name = ptr;
2934          uschar *slot = cd->name_table;
2935
2936          while (*ptr != ')') ptr++;
2937          namelen = ptr - name;
2938
2939          for (i = 0; i < cd->names_found; i++)
2940            {
2941            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2942            slot += cd->name_entry_size;
2943            }
2944          if (i >= cd->names_found)
2945            {
2946            *errorcodeptr = ERR15;
2947            goto FAILED;
2948            }
2949
2950          recno = GET2(slot, 0);
2951
2952          if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2953
2954          /* Back reference */
2955
2956          previous = code;
2957          *code++ = OP_REF;
2958          PUT2INC(code, 0, recno);
2959          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2960          if (recno > cd->top_backref) cd->top_backref = recno;
2961          continue;
2962          }
2963
2964        /* Should never happen */
2965        break;
2966
2967        case 'R':                 /* Pattern recursion */
2968        ptr++;                    /* Same as (?0)      */
2969        /* Fall through */
2970
2971        /* Recursion or "subroutine" call */
2972
2973        case '0': case '1': case '2': case '3': case '4':
2974        case '5': case '6': case '7': case '8': case '9':
2975          {
2976          const uschar *called;
2977          recno = 0;
2978          while((digitab[*ptr] & ctype_digit) != 0)
2979            recno = recno * 10 + *ptr++ - '0';
2980
2981          /* Come here from code above that handles a named recursion */
2982
2983          HANDLE_RECURSION:
2984
2985          previous = code;
2986
2987          /* Find the bracket that is being referenced. Temporarily end the
2988          regex in case it doesn't exist. */
2989
2990          *code = OP_END;
2991          called = (recno == 0)?
2992            cd->start_code : find_bracket(cd->start_code, utf8, recno);
2993
2994          if (called == NULL)
2995            {
2996            *errorcodeptr = ERR15;
2997            goto FAILED;
2998            }
2999
3000          /* If the subpattern is still open, this is a recursive call. We
3001          check to see if this is a left recursion that could loop for ever,
3002          and diagnose that case. */
3003
3004          if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3005            {
3006            *errorcodeptr = ERR40;
3007            goto FAILED;
3008            }
3009
3010          /* Insert the recursion/subroutine item */
3011
3012          *code = OP_RECURSE;
3013          PUT(code, 1, called - cd->start_code);
3014          code += 1 + LINK_SIZE;
3015          }
3016        continue;
3017
3018        /* Character after (? not specially recognized */
3019
3020        default:                  /* Option setting */
3021        set = unset = 0;
3022        optset = &set;
3023
3024        while (*ptr != ')' && *ptr != ':')
3025          {
3026          switch (*ptr++)
3027            {
3028            case '-': optset = &unset; break;
3029
3030            case 'i': *optset |= PCRE_CASELESS; break;
3031            case 'm': *optset |= PCRE_MULTILINE; break;
3032            case 's': *optset |= PCRE_DOTALL; break;
3033            case 'x': *optset |= PCRE_EXTENDED; break;
3034            case 'U': *optset |= PCRE_UNGREEDY; break;
3035            case 'X': *optset |= PCRE_EXTRA; break;
3036            }
3037          }
3038
3039        /* Set up the changed option bits, but don't change anything yet. */
3040
3041        newoptions = (options | set) & (~unset);
3042
3043        /* If the options ended with ')' this is not the start of a nested
3044        group with option changes, so the options change at this level. Compile
3045        code to change the ims options if this setting actually changes any of
3046        them. We also pass the new setting back so that it can be put at the
3047        start of any following branches, and when this group ends (if we are in
3048        a group), a resetting item can be compiled.
3049
3050        Note that if this item is right at the start of the pattern, the
3051        options will have been abstracted and made global, so there will be no
3052        change to compile. */
3053
3054        if (*ptr == ')')
3055          {
3056          if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3057            {
3058            *code++ = OP_OPT;
3059            *code++ = newoptions & PCRE_IMS;
3060            }
3061
3062          /* Change options at this level, and pass them back for use
3063          in subsequent branches. Reset the greedy defaults and the case
3064          value for firstbyte and reqbyte. */
3065
3066          *optionsptr = options = newoptions;
3067          greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3068          greedy_non_default = greedy_default ^ 1;
3069          req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3070
3071          previous = NULL;       /* This item can't be repeated */
3072          continue;              /* It is complete */
3073          }
3074
3075        /* If the options ended with ':' we are heading into a nested group
3076        with possible change of options. Such groups are non-capturing and are
3077        not assertions of any kind. All we need to do is skip over the ':';
3078        the newoptions value is handled below. */
3079
3080        bravalue = OP_BRA;
3081        ptr++;
3082        }
3083      }
3084
3085    /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3086    non-capturing and behave like (?:...) brackets */
3087
3088    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3089      {
3090      bravalue = OP_BRA;
3091      }
3092
3093    /* Else we have a referencing group; adjust the opcode. If the bracket
3094    number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3095    arrange for the true number to follow later, in an OP_BRANUMBER item. */
3096
3097    else
3098      {
3099      NUMBERED_GROUP:
3100      if (++(*brackets) > EXTRACT_BASIC_MAX)
3101        {
3102        bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3103        code[1+LINK_SIZE] = OP_BRANUMBER;
3104        PUT2(code, 2+LINK_SIZE, *brackets);
3105        skipbytes = 3;
3106        }
3107      else bravalue = OP_BRA + *brackets;
3108      }
3109
3110    /* Process nested bracketed re. Assertions may not be repeated, but other
3111    kinds can be. We copy code into a non-register variable in order to be able
3112    to pass its address because some compilers complain otherwise. Pass in a
3113    new setting for the ims options if they have changed. */
3114
3115    previous = (bravalue >= OP_ONCE)? code : NULL;
3116    *code = bravalue;
3117    tempcode = code;
3118    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
3119
3120    if (!compile_regex(
3121         newoptions,                   /* The complete new option state */
3122         options & PCRE_IMS,           /* The previous ims option state */
3123         brackets,                     /* Extracting bracket count */
3124         &tempcode,                    /* Where to put code (updated) */
3125         &ptr,                         /* Input pointer (updated) */
3126         errorcodeptr,                 /* Where to put an error message */
3127         (bravalue == OP_ASSERTBACK ||
3128          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3129         skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
3130         &subfirstbyte,                /* For possible first char */
3131         &subreqbyte,                  /* For possible last char */
3132         bcptr,                        /* Current branch chain */
3133         cd))                          /* Tables block */
3134      goto FAILED;
3135
3136    /* At the end of compiling, code is still pointing to the start of the
3137    group, while tempcode has been updated to point past the end of the group
3138    and any option resetting that may follow it. The pattern pointer (ptr)
3139    is on the bracket. */
3140
3141    /* If this is a conditional bracket, check that there are no more than
3142    two branches in the group. */
3143
3144    else if (bravalue == OP_COND)
3145      {
3146      uschar *tc = code;
3147      condcount = 0;
3148
3149      do {
3150         condcount++;
3151         tc += GET(tc,1);
3152         }
3153      while (*tc != OP_KET);
3154
3155      if (condcount > 2)
3156        {
3157        *errorcodeptr = ERR27;
3158        goto FAILED;
3159        }
3160
3161      /* If there is just one branch, we must not make use of its firstbyte or
3162      reqbyte, because this is equivalent to an empty second branch. */
3163
3164      if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3165      }
3166
3167    /* Handle updating of the required and first characters. Update for normal
3168    brackets of all kinds, and conditions with two branches (see code above).
3169    If the bracket is followed by a quantifier with zero repeat, we have to
3170    back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3171    main loop so that they can be accessed for the back off. */
3172
3173    zeroreqbyte = reqbyte;
3174    zerofirstbyte = firstbyte;
3175    groupsetfirstbyte = FALSE;
3176
3177    if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3178      {
3179      /* If we have not yet set a firstbyte in this branch, take it from the
3180      subpattern, remembering that it was set here so that a repeat of more
3181      than one can replicate it as reqbyte if necessary. If the subpattern has
3182      no firstbyte, set "none" for the whole branch. In both cases, a zero
3183      repeat forces firstbyte to "none". */
3184
3185      if (firstbyte == REQ_UNSET)
3186        {
3187        if (subfirstbyte >= 0)
3188          {
3189          firstbyte = subfirstbyte;
3190          groupsetfirstbyte = TRUE;
3191          }
3192        else firstbyte = REQ_NONE;
3193        zerofirstbyte = REQ_NONE;
3194        }
3195
3196      /* If firstbyte was previously set, convert the subpattern's firstbyte
3197      into reqbyte if there wasn't one, using the vary flag that was in
3198      existence beforehand. */
3199
3200      else if (subfirstbyte >= 0 && subreqbyte < 0)
3201        subreqbyte = subfirstbyte | tempreqvary;
3202
3203      /* If the subpattern set a required byte (or set a first byte that isn't
3204      really the first byte - see above), set it. */
3205
3206      if (subreqbyte >= 0) reqbyte = subreqbyte;
3207      }
3208
3209    /* For a forward assertion, we take the reqbyte, if set. This can be
3210    helpful if the pattern that follows the assertion doesn't set a different
3211    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3212    for an assertion, however because it leads to incorrect effect for patterns
3213    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3214    of a firstbyte. This is overcome by a scan at the end if there's no
3215    firstbyte, looking for an asserted first char. */
3216
3217    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3218
3219    /* Now update the main code pointer to the end of the group. */
3220
3221    code = tempcode;
3222
3223    /* Error if hit end of pattern */
3224
3225    if (*ptr != ')')
3226      {
3227      *errorcodeptr = ERR14;
3228      goto FAILED;
3229      }
3230    break;
3231
3232    /* Check \ for being a real metacharacter; if not, fall through and handle
3233    it as a data character at the start of a string. Escape items are checked
3234    for validity in the pre-compiling pass. */
3235
3236    case '\\':
3237    tempptr = ptr;
3238    c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3239
3240    /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3241    are arranged to be the negation of the corresponding OP_values. For the
3242    back references, the values are ESC_REF plus the reference number. Only
3243    back references and those types that consume a character may be repeated.
3244    We can test for values between ESC_b and ESC_Z for the latter; this may
3245    have to change if any new ones are ever created. */
3246
3247    if (c < 0)
3248      {
3249      if (-c == ESC_Q)            /* Handle start of quoted string */
3250        {
3251        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3252          else inescq = TRUE;
3253        continue;
3254        }
3255
3256      /* For metasequences that actually match a character, we disable the
3257      setting of a first character if it hasn't already been set. */
3258
3259      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3260        firstbyte = REQ_NONE;
3261
3262      /* Set values to reset to if this is followed by a zero repeat. */
3263
3264      zerofirstbyte = firstbyte;
3265      zeroreqbyte = reqbyte;
3266
3267      /* Back references are handled specially */
3268
3269      if (-c >= ESC_REF)
3270        {
3271        int number = -c - ESC_REF;
3272        previous = code;
3273        *code++ = OP_REF;
3274        PUT2INC(code, 0, number);
3275        }
3276
3277      /* So are Unicode property matches, if supported. We know that get_ucp
3278      won't fail because it was tested in the pre-pass. */
3279
3280#ifdef SUPPORT_UCP
3281      else if (-c == ESC_P || -c == ESC_p)
3282        {
3283        BOOL negated;
3284        int value = get_ucp(&ptr, &negated, errorcodeptr);
3285        previous = code;
3286        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3287        *code++ = value;
3288        }
3289#endif
3290
3291      /* For the rest, we can obtain the OP value by negating the escape
3292      value */
3293
3294      else
3295        {
3296        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3297        *code++ = -c;
3298        }
3299      continue;
3300      }
3301
3302    /* We have a data character whose value is in c. In UTF-8 mode it may have
3303    a value > 127. We set its representation in the length/buffer, and then
3304    handle it as a data character. */
3305
3306#ifdef SUPPORT_UTF8
3307    if (utf8 && c > 127)
3308      mclength = _pcre_ord2utf8(c, mcbuffer);
3309    else
3310#endif
3311
3312     {
3313     mcbuffer[0] = c;
3314     mclength = 1;
3315     }
3316
3317    goto ONE_CHAR;
3318
3319    /* Handle a literal character. It is guaranteed not to be whitespace or #
3320    when the extended flag is set. If we are in UTF-8 mode, it may be a
3321    multi-byte literal character. */
3322
3323    default:
3324    NORMAL_CHAR:
3325    mclength = 1;
3326    mcbuffer[0] = c;
3327
3328#ifdef SUPPORT_UTF8
3329    if (utf8 && (c & 0xc0) == 0xc0)
3330      {
3331      while ((ptr[1] & 0xc0) == 0x80)
3332        mcbuffer[mclength++] = *(++ptr);
3333      }
3334#endif
3335
3336    /* At this point we have the character's bytes in mcbuffer, and the length
3337    in mclength. When not in UTF-8 mode, the length is always 1. */
3338
3339    ONE_CHAR:
3340    previous = code;
3341    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3342    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3343
3344    /* Set the first and required bytes appropriately. If no previous first
3345    byte, set it from this character, but revert to none on a zero repeat.
3346    Otherwise, leave the firstbyte value alone, and don't change it on a zero
3347    repeat. */
3348
3349    if (firstbyte == REQ_UNSET)
3350      {
3351      zerofirstbyte = REQ_NONE;
3352      zeroreqbyte = reqbyte;
3353
3354      /* If the character is more than one byte long, we can set firstbyte
3355      only if it is not to be matched caselessly. */
3356
3357      if (mclength == 1 || req_caseopt == 0)
3358        {
3359        firstbyte = mcbuffer[0] | req_caseopt;
3360        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3361        }
3362      else firstbyte = reqbyte = REQ_NONE;
3363      }
3364
3365    /* firstbyte was previously set; we can set reqbyte only if the length
3366    is 1 or the matching is caseful. */
3367
3368    else
3369      {
3370      zerofirstbyte = firstbyte;
3371      zeroreqbyte = reqbyte;
3372      if (mclength == 1 || req_caseopt == 0)
3373        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3374      }
3375
3376    break;            /* End of literal character handling */
3377    }
3378  }                   /* end of big loop */
3379
3380/* Control never reaches here by falling through, only by a goto for all the
3381error states. Pass back the position in the pattern so that it can be displayed
3382to the user for diagnosing the error. */
3383
3384FAILED:
3385*ptrptr = ptr;
3386return FALSE;
3387}
3388
3389
3390
3391
3392/*************************************************
3393*     Compile sequence of alternatives           *
3394*************************************************/
3395
3396/* On entry, ptr is pointing past the bracket character, but on return
3397it points to the closing bracket, or vertical bar, or end of string.
3398The code variable is pointing at the byte into which the BRA operator has been
3399stored. If the ims options are changed at the start (for a (?ims: group) or
3400during any branch, we need to insert an OP_OPT item at the start of every
3401following branch to ensure they get set correctly at run time, and also pass
3402the new options into every subsequent branch compile.
3403
3404Argument:
3405  options        option bits, including any changes for this subpattern
3406  oldims         previous settings of ims option bits
3407  brackets       -> int containing the number of extracting brackets used
3408  codeptr        -> the address of the current code pointer
3409  ptrptr         -> the address of the current pattern pointer
3410  errorcodeptr   -> pointer to error code variable
3411  lookbehind     TRUE if this is a lookbehind assertion
3412  skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3413  firstbyteptr   place to put the first required character, or a negative number
3414  reqbyteptr     place to put the last required character, or a negative number
3415  bcptr          pointer to the chain of currently open branches
3416  cd             points to the data block with tables pointers etc.
3417
3418Returns:      TRUE on success
3419*/
3420
3421static BOOL
3422compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3423  const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3424  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3425{
3426const uschar *ptr = *ptrptr;
3427uschar *code = *codeptr;
3428uschar *last_branch = code;
3429uschar *start_bracket = code;
3430uschar *reverse_count = NULL;
3431int firstbyte, reqbyte;
3432int branchfirstbyte, branchreqbyte;
3433branch_chain bc;
3434
3435bc.outer = bcptr;
3436bc.current = code;
3437
3438firstbyte = reqbyte = REQ_UNSET;
3439
3440/* Offset is set zero to mark that this bracket is still open */
3441
3442PUT(code, 1, 0);
3443code += 1 + LINK_SIZE + skipbytes;
3444
3445/* Loop for each alternative branch */
3446
3447for (;;)
3448  {
3449  /* Handle a change of ims options at the start of the branch */
3450
3451  if ((options & PCRE_IMS) != oldims)
3452    {
3453    *code++ = OP_OPT;
3454    *code++ = options & PCRE_IMS;
3455    }
3456
3457  /* Set up dummy OP_REVERSE if lookbehind assertion */
3458
3459  if (lookbehind)
3460    {
3461    *code++ = OP_REVERSE;
3462    reverse_count = code;
3463    PUTINC(code, 0, 0);
3464    }
3465
3466  /* Now compile the branch */
3467
3468  if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3469        &branchfirstbyte, &branchreqbyte, &bc, cd))
3470    {
3471    *ptrptr = ptr;
3472    return FALSE;
3473    }
3474
3475  /* If this is the first branch, the firstbyte and reqbyte values for the
3476  branch become the values for the regex. */
3477
3478  if (*last_branch != OP_ALT)
3479    {
3480    firstbyte = branchfirstbyte;
3481    reqbyte = branchreqbyte;
3482    }
3483
3484  /* If this is not the first branch, the first char and reqbyte have to
3485  match the values from all the previous branches, except that if the previous
3486  value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3487  REQ_VARY for the regex. */
3488
3489  else
3490    {
3491    /* If we previously had a firstbyte, but it doesn't match the new branch,
3492    we have to abandon the firstbyte for the regex, but if there was previously
3493    no reqbyte, it takes on the value of the old firstbyte. */
3494
3495    if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3496      {
3497      if (reqbyte < 0) reqbyte = firstbyte;
3498      firstbyte = REQ_NONE;
3499      }
3500
3501    /* If we (now or from before) have no firstbyte, a firstbyte from the
3502    branch becomes a reqbyte if there isn't a branch reqbyte. */
3503
3504    if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3505        branchreqbyte = branchfirstbyte;
3506
3507    /* Now ensure that the reqbytes match */
3508
3509    if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3510      reqbyte = REQ_NONE;
3511    else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
3512    }
3513
3514  /* If lookbehind, check that this branch matches a fixed-length string,
3515  and put the length into the OP_REVERSE item. Temporarily mark the end of
3516  the branch with OP_END. */
3517
3518  if (lookbehind)
3519    {
3520    int length;
3521    *code = OP_END;
3522    length = find_fixedlength(last_branch, options);
3523    DPRINTF(("fixed length = %d\n", length));
3524    if (length < 0)
3525      {
3526      *errorcodeptr = (length == -2)? ERR36 : ERR25;
3527      *ptrptr = ptr;
3528      return FALSE;
3529      }
3530    PUT(reverse_count, 0, length);
3531    }
3532
3533  /* Reached end of expression, either ')' or end of pattern. Go back through
3534  the alternative branches and reverse the chain of offsets, with the field in
3535  the BRA item now becoming an offset to the first alternative. If there are
3536  no alternatives, it points to the end of the group. The length in the
3537  terminating ket is always the length of the whole bracketed item. If any of
3538  the ims options were changed inside the group, compile a resetting op-code
3539  following, except at the very end of the pattern. Return leaving the pointer
3540  at the terminating char. */
3541
3542  if (*ptr != '|')
3543    {
3544    int length = code - last_branch;
3545    do
3546      {
3547      int prev_length = GET(last_branch, 1);
3548      PUT(last_branch, 1, length);
3549      length = prev_length;
3550      last_branch -= length;
3551      }
3552    while (length > 0);
3553
3554    /* Fill in the ket */
3555
3556    *code = OP_KET;
3557    PUT(code, 1, code - start_bracket);
3558    code += 1 + LINK_SIZE;
3559
3560    /* Resetting option if needed */
3561
3562    if ((options & PCRE_IMS) != oldims && *ptr == ')')
3563      {
3564      *code++ = OP_OPT;
3565      *code++ = oldims;
3566      }
3567
3568    /* Set values to pass back */
3569
3570    *codeptr = code;
3571    *ptrptr = ptr;
3572    *firstbyteptr = firstbyte;
3573    *reqbyteptr = reqbyte;
3574    return TRUE;
3575    }
3576
3577  /* Another branch follows; insert an "or" node. Its length field points back
3578  to the previous branch while the bracket remains open. At the end the chain
3579  is reversed. It's done like this so that the start of the bracket has a
3580  zero offset until it is closed, making it possible to detect recursion. */
3581
3582  *code = OP_ALT;
3583  PUT(code, 1, code - last_branch);
3584  bc.current = last_branch = code;
3585  code += 1 + LINK_SIZE;
3586  ptr++;
3587  }
3588/* Control never reaches here */
3589}
3590
3591
3592
3593
3594/*************************************************
3595*          Check for anchored expression         *
3596*************************************************/
3597
3598/* Try to find out if this is an anchored regular expression. Consider each
3599alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3600all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3601it's anchored. However, if this is a multiline pattern, then only OP_SOD
3602counts, since OP_CIRC can match in the middle.
3603
3604We can also consider a regex to be anchored if OP_SOM starts all its branches.
3605This is the code for \G, which means "match at start of match position, taking
3606into account the match offset".
3607
3608A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3609because that will try the rest of the pattern at all possible matching points,
3610so there is no point trying again.... er ....
3611
3612.... except when the .* appears inside capturing parentheses, and there is a
3613subsequent back reference to those parentheses. We haven't enough information
3614to catch that case precisely.
3615
3616At first, the best we could do was to detect when .* was in capturing brackets
3617and the highest back reference was greater than or equal to that level.
3618However, by keeping a bitmap of the first 31 back references, we can catch some
3619of the more common cases more precisely.
3620
3621Arguments:
3622  code           points to start of expression (the bracket)
3623  options        points to the options setting
3624  bracket_map    a bitmap of which brackets we are inside while testing; this
3625                  handles up to substring 31; after that we just have to take
3626                  the less precise approach
3627  backref_map    the back reference bitmap
3628
3629Returns:     TRUE or FALSE
3630*/
3631
3632static BOOL
3633is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3634  unsigned int backref_map)
3635{
3636do {
3637   const uschar *scode =
3638     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3639   register int op = *scode;
3640
3641   /* Capturing brackets */
3642
3643   if (op > OP_BRA)
3644     {
3645     int new_map;
3646     op -= OP_BRA;
3647     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3648     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3649     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3650     }
3651
3652   /* Other brackets */
3653
3654   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3655     {
3656     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3657     }
3658
3659   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3660   are or may be referenced. */
3661
3662   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3663            (*options & PCRE_DOTALL) != 0)
3664     {
3665     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3666     }
3667
3668   /* Check for explicit anchoring */
3669
3670   else if (op != OP_SOD && op != OP_SOM &&
3671           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3672     return FALSE;
3673   code += GET(code, 1);
3674   }
3675while (*code == OP_ALT);   /* Loop for each alternative */
3676return TRUE;
3677}
3678
3679
3680
3681/*************************************************
3682*         Check for starting with ^ or .*        *
3683*************************************************/
3684
3685/* This is called to find out if every branch starts with ^ or .* so that
3686"first char" processing can be done to speed things up in multiline
3687matching and for non-DOTALL patterns that start with .* (which must start at
3688the beginning or after \n). As in the case of is_anchored() (see above), we
3689have to take account of back references to capturing brackets that contain .*
3690because in that case we can't make the assumption.
3691
3692Arguments:
3693  code           points to start of expression (the bracket)
3694  bracket_map    a bitmap of which brackets we are inside while testing; this
3695                  handles up to substring 31; after that we just have to take
3696                  the less precise approach
3697  backref_map    the back reference bitmap
3698
3699Returns:         TRUE or FALSE
3700*/
3701
3702static BOOL
3703is_startline(const uschar *code, unsigned int bracket_map,
3704  unsigned int backref_map)
3705{
3706do {
3707   const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3708     FALSE);
3709   register int op = *scode;
3710
3711   /* Capturing brackets */
3712
3713   if (op > OP_BRA)
3714     {
3715     int new_map;
3716     op -= OP_BRA;
3717     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3718     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3719     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3720     }
3721
3722   /* Other brackets */
3723
3724   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3725     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3726
3727   /* .* means "start at start or after \n" if it isn't in brackets that
3728   may be referenced. */
3729
3730   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3731     {
3732     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3733     }
3734
3735   /* Check for explicit circumflex */
3736
3737   else if (op != OP_CIRC) return FALSE;
3738
3739   /* Move on to the next alternative */
3740
3741   code += GET(code, 1);
3742   }
3743while (*code == OP_ALT);  /* Loop for each alternative */
3744return TRUE;
3745}
3746
3747
3748
3749/*************************************************
3750*       Check for asserted fixed first char      *
3751*************************************************/
3752
3753/* During compilation, the "first char" settings from forward assertions are
3754discarded, because they can cause conflicts with actual literals that follow.
3755However, if we end up without a first char setting for an unanchored pattern,
3756it is worth scanning the regex to see if there is an initial asserted first
3757char. If all branches start with the same asserted char, or with a bracket all
3758of whose alternatives start with the same asserted char (recurse ad lib), then
3759we return that char, otherwise -1.
3760
3761Arguments:
3762  code       points to start of expression (the bracket)
3763  options    pointer to the options (used to check casing changes)
3764  inassert   TRUE if in an assertion
3765
3766Returns:     -1 or the fixed first char
3767*/
3768
3769static int
3770find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3771{
3772register int c = -1;
3773do {
3774   int d;
3775   const uschar *scode =
3776     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3777   register int op = *scode;
3778
3779   if (op >= OP_BRA) op = OP_BRA;
3780
3781   switch(op)
3782     {
3783     default:
3784     return -1;
3785
3786     case OP_BRA:
3787     case OP_ASSERT:
3788     case OP_ONCE:
3789     case OP_COND:
3790     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3791       return -1;
3792     if (c < 0) c = d; else if (c != d) return -1;
3793     break;
3794
3795     case OP_EXACT:       /* Fall through */
3796     scode += 2;
3797
3798     case OP_CHAR:
3799     case OP_CHARNC:
3800     case OP_PLUS:
3801     case OP_MINPLUS:
3802     if (!inassert) return -1;
3803     if (c < 0)
3804       {
3805       c = scode[1];
3806       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3807       }
3808     else if (c != scode[1]) return -1;
3809     break;
3810     }
3811
3812   code += GET(code, 1);
3813   }
3814while (*code == OP_ALT);
3815return c;
3816}
3817
3818
3819
3820/*************************************************
3821*        Compile a Regular Expression            *
3822*************************************************/
3823
3824/* This function takes a string and returns a pointer to a block of store
3825holding a compiled version of the expression. The original API for this
3826function had no error code return variable; it is retained for backwards
3827compatibility. The new function is given a new name.
3828
3829Arguments:
3830  pattern       the regular expression
3831  options       various option bits
3832  errorcodeptr  pointer to error code variable (pcre_compile2() only)
3833                  can be NULL if you don't want a code value
3834  errorptr      pointer to pointer to error text
3835  erroroffset   ptr offset in pattern where error was detected
3836  tables        pointer to character tables or NULL
3837
3838Returns:        pointer to compiled data block, or NULL on error,
3839                with errorptr and erroroffset set
3840*/
3841
3842EXPORT pcre *
3843pcre_compile(const char *pattern, int options, const char **errorptr,
3844  int *erroroffset, const unsigned char *tables)
3845{
3846return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
3847}
3848
3849
3850EXPORT pcre *
3851pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3852  const char **errorptr, int *erroroffset, const unsigned char *tables)
3853{
3854real_pcre *re;
3855int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3856int c, firstbyte, reqbyte;
3857int bracount = 0;
3858int branch_extra = 0;
3859int branch_newextra;
3860int item_count = -1;
3861int name_count = 0;
3862int max_name_size = 0;
3863int lastitemlength = 0;
3864int errorcode = 0;
3865#ifdef SUPPORT_UTF8
3866BOOL utf8;
3867BOOL class_utf8;
3868#endif
3869BOOL inescq = FALSE;
3870BOOL capturing;
3871unsigned int brastackptr = 0;
3872size_t size;
3873uschar *code;
3874const uschar *codestart;
3875const uschar *ptr;
3876compile_data compile_block;
3877int brastack[BRASTACK_SIZE];
3878uschar bralenstack[BRASTACK_SIZE];
3879
3880/* We can't pass back an error message if errorptr is NULL; I guess the best we
3881can do is just return NULL, but we can set a code value if there is a code
3882pointer. */
3883
3884if (errorptr == NULL)
3885  {
3886  if (errorcodeptr != NULL) *errorcodeptr = 99;
3887  return NULL;
3888  }
3889
3890*errorptr = NULL;
3891if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3892
3893/* However, we can give a message for this error */
3894
3895if (erroroffset == NULL)
3896  {
3897  errorcode = ERR16;
3898  goto PCRE_EARLY_ERROR_RETURN;
3899  }
3900
3901*erroroffset = 0;
3902
3903/* Can't support UTF8 unless PCRE has been compiled to include the code. */
3904
3905#ifdef SUPPORT_UTF8
3906utf8 = (options & PCRE_UTF8) != 0;
3907if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3908     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3909  {
3910  errorcode = ERR44;
3911  goto PCRE_EARLY_ERROR_RETURN;
3912  }
3913#else
3914if ((options & PCRE_UTF8) != 0)
3915  {
3916  errorcode = ERR32;
3917  goto PCRE_EARLY_ERROR_RETURN;
3918  }
3919#endif
3920
3921if ((options & ~PUBLIC_OPTIONS) != 0)
3922  {
3923  errorcode = ERR17;
3924  goto PCRE_EARLY_ERROR_RETURN;
3925  }
3926
3927/* Set up pointers to the individual character tables */
3928
3929if (tables == NULL) tables = _pcre_default_tables;
3930compile_block.lcc = tables + lcc_offset;
3931compile_block.fcc = tables + fcc_offset;
3932compile_block.cbits = tables + cbits_offset;
3933compile_block.ctypes = tables + ctypes_offset;
3934
3935/* Maximum back reference and backref bitmap. This is updated for numeric
3936references during the first pass, but for named references during the actual
3937compile pass. The bitmap records up to 31 back references to help in deciding
3938whether (.*) can be treated as anchored or not. */
3939
3940compile_block.top_backref = 0;
3941compile_block.backref_map = 0;
3942
3943/* Reflect pattern for debugging output */
3944
3945DPRINTF(("------------------------------------------------------------------\n"));
3946DPRINTF(("%s\n", pattern));
3947
3948/* The first thing to do is to make a pass over the pattern to compute the
3949amount of store required to hold the compiled code. This does not have to be
3950perfect as long as errors are overestimates. At the same time we can detect any
3951flag settings right at the start, and extract them. Make an attempt to correct
3952for any counted white space if an "extended" flag setting appears late in the
3953pattern. We can't be so clever for #-comments. */
3954
3955ptr = (const uschar *)(pattern - 1);
3956while ((c = *(++ptr)) != 0)
3957  {
3958  int min, max;
3959  int class_optcount;
3960  int bracket_length;
3961  int duplength;
3962
3963  /* If we are inside a \Q...\E sequence, all chars are literal */
3964
3965  if (inescq)
3966    {
3967    if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3968    goto NORMAL_CHAR;
3969    }
3970
3971  /* Otherwise, first check for ignored whitespace and comments */
3972
3973  if ((options & PCRE_EXTENDED) != 0)
3974    {
3975    if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3976    if (c == '#')
3977      {
3978      /* The space before the ; is to avoid a warning on a silly compiler
3979      on the Macintosh. */
3980      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3981      if (c == 0) break;
3982      continue;
3983      }
3984    }
3985
3986  item_count++;    /* Is zero for the first non-comment item */
3987
3988  /* Allow space for auto callout before every item except quantifiers. */
3989
3990  if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3991       c != '*' && c != '+' && c != '?' &&
3992       (c != '{' || !is_counted_repeat(ptr + 1)))
3993    length += 2 + 2*LINK_SIZE;
3994
3995  switch(c)
3996    {
3997    /* A backslashed item may be an escaped data character or it may be a
3998    character type. */
3999
4000    case '\\':
4001    c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4002    if (errorcode != 0) goto PCRE_ERROR_RETURN;
4003
4004    lastitemlength = 1;     /* Default length of last item for repeats */
4005
4006    if (c >= 0)             /* Data character */
4007      {
4008      length += 2;          /* For a one-byte character */
4009
4010#ifdef SUPPORT_UTF8
4011      if (utf8 && c > 127)
4012        {
4013        int i;
4014        for (i = 0; i < _pcre_utf8_table1_size; i++)
4015          if (c <= _pcre_utf8_table1[i]) break;
4016        length += i;
4017        lastitemlength += i;
4018        }
4019#endif
4020
4021      continue;
4022      }
4023
4024    /* If \Q, enter "literal" mode */
4025
4026    if (-c == ESC_Q)
4027      {
4028      inescq = TRUE;
4029      continue;
4030      }
4031
4032    /* \X is supported only if Unicode property support is compiled */
4033
4034#ifndef SUPPORT_UCP
4035    if (-c == ESC_X)
4036      {
4037      errorcode = ERR45;
4038      goto PCRE_ERROR_RETURN;
4039      }
4040#endif
4041
4042    /* \P and \p are for Unicode properties, but only when the support has
4043    been compiled. Each item needs 2 bytes. */
4044
4045    else if (-c == ESC_P || -c == ESC_p)
4046      {
4047#ifdef SUPPORT_UCP
4048      BOOL negated;
4049      length += 2;
4050      lastitemlength = 2;
4051      if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4052      continue;
4053#else
4054      errorcode = ERR45;
4055      goto PCRE_ERROR_RETURN;
4056#endif
4057      }
4058
4059    /* Other escapes need one byte */
4060
4061    length++;
4062
4063    /* A back reference needs an additional 2 bytes, plus either one or 5
4064    bytes for a repeat. We also need to keep the value of the highest
4065    back reference. */
4066
4067    if (c <= -ESC_REF)
4068      {
4069      int refnum = -c - ESC_REF;
4070      compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4071      if (refnum > compile_block.top_backref)
4072        compile_block.top_backref = refnum;
4073      length += 2;   /* For single back reference */
4074      if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4075        {
4076        ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4077        if (errorcode != 0) goto PCRE_ERROR_RETURN;
4078        if ((min == 0 && (max == 1 || max == -1)) ||
4079          (min == 1 && max == -1))
4080            length++;
4081        else length += 5;
4082        if (ptr[1] == '?') ptr++;
4083        }
4084      }
4085    continue;
4086
4087    case '^':     /* Single-byte metacharacters */
4088    case '.':
4089    case '$':
4090    length++;
4091    lastitemlength = 1;
4092    continue;
4093
4094    case '*':            /* These repeats won't be after brackets; */
4095    case '+':            /* those are handled separately */
4096    case '?':
4097    length++;
4098    goto POSESSIVE;      /* A few lines below */
4099
4100    /* This covers the cases of braced repeats after a single char, metachar,
4101    class, or back reference. */
4102
4103    case '{':
4104    if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4105    ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4106    if (errorcode != 0) goto PCRE_ERROR_RETURN;
4107
4108    /* These special cases just insert one extra opcode */
4109
4110    if ((min == 0 && (max == 1 || max == -1)) ||
4111      (min == 1 && max == -1))
4112        length++;
4113
4114    /* These cases might insert additional copies of a preceding character. */
4115
4116    else
4117      {
4118      if (min != 1)
4119        {
4120        length -= lastitemlength;   /* Uncount the original char or metachar */
4121        if (min > 0) length += 3 + lastitemlength;
4122        }
4123      length += lastitemlength + ((max > 0)? 3 : 1);
4124      }
4125
4126    if (ptr[1] == '?') ptr++;      /* Needs no extra length */
4127
4128    POSESSIVE:                     /* Test for possessive quantifier */
4129    if (ptr[1] == '+')
4130      {
4131      ptr++;
4132      length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
4133      }
4134    continue;
4135
4136    /* An alternation contains an offset to the next branch or ket. If any ims
4137    options changed in the previous branch(es), and/or if we are in a
4138    lookbehind assertion, extra space will be needed at the start of the
4139    branch. This is handled by branch_extra. */
4140
4141    case '|':
4142    length += 1 + LINK_SIZE + branch_extra;
4143    continue;
4144
4145    /* A character class uses 33 characters provided that all the character
4146    values are less than 256. Otherwise, it uses a bit map for low valued
4147    characters, and individual items for others. Don't worry about character
4148    types that aren't allowed in classes - they'll get picked up during the
4149    compile. A character class that contains only one single-byte character
4150    uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4151    where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4152
4153    case '[':
4154    if (*(++ptr) == '^')
4155      {
4156      class_optcount = 10;  /* Greater than one */
4157      ptr++;
4158      }
4159    else class_optcount = 0;
4160
4161#ifdef SUPPORT_UTF8
4162    class_utf8 = FALSE;
4163#endif
4164
4165    /* Written as a "do" so that an initial ']' is taken as data */
4166
4167    if (*ptr != 0) do
4168      {
4169      /* Inside \Q...\E everything is literal except \E */
4170
4171      if (inescq)
4172        {
4173        if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4174        inescq = FALSE;
4175        ptr += 1;
4176        continue;
4177        }
4178
4179      /* Outside \Q...\E, check for escapes */
4180
4181      if (*ptr == '\\')
4182        {
4183        c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4184        if (errorcode != 0) goto PCRE_ERROR_RETURN;
4185
4186        /* \b is backspace inside a class; \X is literal */
4187
4188        if (-c == ESC_b) c = '\b';
4189        else if (-c == ESC_X) c = 'X';
4190
4191        /* \Q enters quoting mode */
4192
4193        else if (-c == ESC_Q)
4194          {
4195          inescq = TRUE;
4196          continue;
4197          }
4198
4199        /* Handle escapes that turn into characters */
4200
4201        if (c >= 0) goto NON_SPECIAL_CHARACTER;
4202
4203        /* Escapes that are meta-things. The normal ones just affect the
4204        bit map, but Unicode properties require an XCLASS extended item. */
4205
4206        else
4207          {
4208          class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4209#ifdef SUPPORT_UTF8
4210          if (-c == ESC_p || -c == ESC_P)
4211            {
4212            if (!class_utf8)
4213              {
4214              class_utf8 = TRUE;
4215              length += LINK_SIZE + 2;
4216              }
4217            length += 2;
4218            }
4219#endif
4220          }
4221        }
4222
4223      /* Check the syntax for POSIX stuff. The bits we actually handle are
4224      checked during the real compile phase. */
4225
4226      else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4227        {
4228        ptr++;
4229        class_optcount = 10;    /* Make sure > 1 */
4230        }
4231
4232      /* Anything else increments the possible optimization count. We have to
4233      detect ranges here so that we can compute the number of extra ranges for
4234      caseless wide characters when UCP support is available. If there are wide
4235      characters, we are going to have to use an XCLASS, even for single
4236      characters. */
4237
4238      else
4239        {
4240        int d;
4241
4242        GET_ONE_CHARACTER:
4243
4244#ifdef SUPPORT_UTF8
4245        if (utf8)
4246          {
4247          int extra = 0;
4248          GETCHARLEN(c, ptr, extra);
4249          ptr += extra;
4250          }
4251        else c = *ptr;
4252#else
4253        c = *ptr;
4254#endif
4255
4256        /* Come here from handling \ above when it escapes to a char value */
4257
4258        NON_SPECIAL_CHARACTER:
4259        class_optcount++;
4260
4261        d = -1;
4262        if (ptr[1] == '-')
4263          {
4264          uschar const *hyptr = ptr++;
4265          if (ptr[1] == '\\')
4266            {
4267            ptr++;
4268            d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4269            if (errorcode != 0) goto PCRE_ERROR_RETURN;
4270            if (-d == ESC_b) d = '\b';        /* backspace */
4271            else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4272            }
4273          else if (ptr[1] != 0 && ptr[1] != ']')
4274            {
4275            ptr++;
4276#ifdef SUPPORT_UTF8
4277            if (utf8)
4278              {
4279              int extra = 0;
4280              GETCHARLEN(d, ptr, extra);
4281              ptr += extra;
4282              }
4283            else
4284#endif
4285            d = *ptr;
4286            }
4287          if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4288          }
4289
4290        /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4291        127 for caseless matching, we will need to use an XCLASS. */
4292
4293        if (d >= 0)
4294          {
4295          class_optcount = 10;     /* Ensure > 1 */
4296          if (d < c)
4297            {
4298            errorcode = ERR8;
4299            goto PCRE_ERROR_RETURN;
4300            }
4301
4302#ifdef SUPPORT_UTF8
4303          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4304            {
4305            uschar buffer[6];
4306            if (!class_utf8)         /* Allow for XCLASS overhead */
4307              {
4308              class_utf8 = TRUE;
4309              length += LINK_SIZE + 2;
4310              }
4311
4312#ifdef SUPPORT_UCP
4313            /* If we have UCP support, find out how many extra ranges are
4314            needed to map the other case of characters within this range. We
4315            have to mimic the range optimization here, because extending the
4316            range upwards might push d over a boundary that makes it use
4317            another byte in the UTF-8 representation. */
4318
4319            if ((options & PCRE_CASELESS) != 0)
4320              {
4321              int occ, ocd;
4322              int cc = c;
4323              int origd = d;
4324              while (get_othercase_range(&cc, origd, &occ, &ocd))
4325                {
4326                if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4327
4328                if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4329                  {                            /* if there is overlap,   */
4330                  c = occ;                     /* noting that if occ < c */
4331                  continue;                    /* we can't have ocd > d  */
4332                  }                            /* because a subrange is  */
4333                if (ocd > d && occ <= d + 1)   /* always shorter than    */
4334                  {                            /* the basic range.       */
4335                  d = ocd;
4336                  continue;
4337                  }
4338
4339                /* An extra item is needed */
4340
4341                length += 1 + _pcre_ord2utf8(occ, buffer) +
4342                  ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4343                }
4344              }
4345#endif  /* SUPPORT_UCP */
4346
4347            /* The length of the (possibly extended) range */
4348
4349            length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4350            }
4351#endif  /* SUPPORT_UTF8 */
4352
4353          }
4354
4355        /* We have a single character. There is nothing to be done unless we
4356        are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4357        allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4358        support. */
4359
4360        else
4361          {
4362#ifdef SUPPORT_UTF8
4363          if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4364            {
4365            uschar buffer[6];
4366            class_optcount = 10;     /* Ensure > 1 */
4367            if (!class_utf8)         /* Allow for XCLASS overhead */
4368              {
4369              class_utf8 = TRUE;
4370              length += LINK_SIZE + 2;
4371              }
4372#ifdef SUPPORT_UCP
4373            length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4374              (1 + _pcre_ord2utf8(c, buffer));
4375#else   /* SUPPORT_UCP */
4376            length += 1 + _pcre_ord2utf8(c, buffer);
4377#endif  /* SUPPORT_UCP */
4378            }
4379#endif  /* SUPPORT_UTF8 */
4380          }
4381        }
4382      }
4383    while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4384
4385    if (*ptr == 0)                          /* Missing terminating ']' */
4386      {
4387      errorcode = ERR6;
4388      goto PCRE_ERROR_RETURN;
4389      }
4390
4391    /* We can optimize when there was only one optimizable character. Repeats
4392    for positive and negated single one-byte chars are handled by the general
4393    code. Here, we handle repeats for the class opcodes. */
4394
4395    if (class_optcount == 1) length += 3; else
4396      {
4397      length += 33;
4398
4399      /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4400      we also need extra for wrapping the whole thing in a sub-pattern. */
4401
4402      if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4403        {
4404        ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4405        if (errorcode != 0) goto PCRE_ERROR_RETURN;
4406        if ((min == 0 && (max == 1 || max == -1)) ||
4407          (min == 1 && max == -1))
4408            length++;
4409        else length += 5;
4410        if (ptr[1] == '+')
4411          {
4412          ptr++;
4413          length += 2 + 2*LINK_SIZE;
4414          }
4415        else if (ptr[1] == '?') ptr++;
4416        }
4417      }
4418    continue;
4419
4420    /* Brackets may be genuine groups or special things */
4421
4422    case '(':
4423    branch_newextra = 0;
4424    bracket_length = 1 + LINK_SIZE;
4425    capturing = FALSE;
4426
4427    /* Handle special forms of bracket, which all start (? */
4428
4429    if (ptr[1] == '?')
4430      {
4431      int set, unset;
4432      int *optset;
4433
4434      switch (c = ptr[2])
4435        {
4436        /* Skip over comments entirely */
4437        case '#':
4438        ptr += 3;
4439        while (*ptr != 0 && *ptr != ')') ptr++;
4440        if (*ptr == 0)
4441          {
4442          errorcode = ERR18;
4443          goto PCRE_ERROR_RETURN;
4444          }
4445        continue;
4446
4447        /* Non-referencing groups and lookaheads just move the pointer on, and
4448        then behave like a non-special bracket, except that they don't increment
4449        the count of extracting brackets. Ditto for the "once only" bracket,
4450        which is in Perl from version 5.005. */
4451
4452        case ':':
4453        case '=':
4454        case '!':
4455        case '>':
4456        ptr += 2;
4457        break;
4458
4459        /* (?R) specifies a recursive call to the regex, which is an extension
4460        to provide the facility which can be obtained by (?p{perl-code}) in
4461        Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4462
4463        From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4464        the appropriate numbered brackets. This includes both recursive and
4465        non-recursive calls. (?R) is now synonymous with (?0). */
4466
4467        case 'R':
4468        ptr++;
4469
4470        case '0': case '1': case '2': case '3': case '4':
4471        case '5': case '6': case '7': case '8': case '9':
4472        ptr += 2;
4473        if (c != 'R')
4474          while ((digitab[*(++ptr)] & ctype_digit) != 0);
4475        if (*ptr != ')')
4476          {
4477          errorcode = ERR29;
4478          goto PCRE_ERROR_RETURN;
4479          }
4480        length += 1 + LINK_SIZE;
4481
4482        /* If this item is quantified, it will get wrapped inside brackets so
4483        as to use the code for quantified brackets. We jump down and use the
4484        code that handles this for real brackets. */
4485
4486        if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4487          {
4488          length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
4489          duplength = 5 + 3 * LINK_SIZE;
4490          goto HANDLE_QUANTIFIED_BRACKETS;
4491          }
4492        continue;
4493
4494        /* (?C) is an extension which provides "callout" - to provide a bit of
4495        the functionality of the Perl (?{...}) feature. An optional number may
4496        follow (default is zero). */
4497
4498        case 'C':
4499        ptr += 2;
4500        while ((digitab[*(++ptr)] & ctype_digit) != 0);
4501        if (*ptr != ')')
4502          {
4503          errorcode = ERR39;
4504          goto PCRE_ERROR_RETURN;
4505          }
4506        length += 2 + 2*LINK_SIZE;
4507        continue;
4508
4509        /* Named subpatterns are an extension copied from Python */
4510
4511        case 'P':
4512        ptr += 3;
4513
4514        /* Handle the definition of a named subpattern */
4515
4516        if (*ptr == '<')
4517          {
4518          const uschar *p;    /* Don't amalgamate; some compilers */
4519          p = ++ptr;          /* grumble at autoincrement in declaration */
4520          while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4521          if (*ptr != '>')
4522            {
4523            errorcode = ERR42;
4524            goto PCRE_ERROR_RETURN;
4525            }
4526          name_count++;
4527          if (ptr - p > max_name_size) max_name_size = (ptr - p);
4528          capturing = TRUE;   /* Named parentheses are always capturing */
4529          break;
4530          }
4531
4532        /* Handle back references and recursive calls to named subpatterns */
4533
4534        if (*ptr == '=' || *ptr == '>')
4535          {
4536          while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4537          if (*ptr != ')')
4538            {
4539            errorcode = ERR42;
4540            goto PCRE_ERROR_RETURN;
4541            }
4542          break;
4543          }
4544
4545        /* Unknown character after (?P */
4546
4547        errorcode = ERR41;
4548        goto PCRE_ERROR_RETURN;
4549
4550        /* Lookbehinds are in Perl from version 5.005 */
4551
4552        case '<':
4553        ptr += 3;
4554        if (*ptr == '=' || *ptr == '!')
4555          {
4556          branch_newextra = 1 + LINK_SIZE;
4557          length += 1 + LINK_SIZE;         /* For the first branch */
4558          break;
4559          }
4560        errorcode = ERR24;
4561        goto PCRE_ERROR_RETURN;
4562
4563        /* Conditionals are in Perl from version 5.005. The bracket must either
4564        be followed by a number (for bracket reference) or by an assertion
4565        group, or (a PCRE extension) by 'R' for a recursion test. */
4566
4567        case '(':
4568        if (ptr[3] == 'R' && ptr[4] == ')')
4569          {
4570          ptr += 4;
4571          length += 3;
4572          }
4573        else if ((digitab[ptr[3]] & ctype_digit) != 0)
4574          {
4575          ptr += 4;
4576          length += 3;
4577          while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4578          if (*ptr != ')')
4579            {
4580            errorcode = ERR26;
4581            goto PCRE_ERROR_RETURN;
4582            }
4583          }
4584        else   /* An assertion must follow */
4585          {
4586          ptr++;   /* Can treat like ':' as far as spacing is concerned */
4587          if (ptr[2] != '?' ||
4588             (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4589            {
4590            ptr += 2;    /* To get right offset in message */
4591            errorcode = ERR28;
4592            goto PCRE_ERROR_RETURN;
4593            }
4594          }
4595        break;
4596
4597        /* Else loop checking valid options until ) is met. Anything else is an
4598        error. If we are without any brackets, i.e. at top level, the settings
4599        act as if specified in the options, so massage the options immediately.
4600        This is for backward compatibility with Perl 5.004. */
4601
4602        default:
4603        set = unset = 0;
4604        optset = &set;
4605        ptr += 2;
4606
4607        for (;; ptr++)
4608          {
4609          c = *ptr;
4610          switch (c)
4611            {
4612            case 'i':
4613            *optset |= PCRE_CASELESS;
4614            continue;
4615
4616            case 'm':
4617            *optset |= PCRE_MULTILINE;
4618            continue;
4619
4620            case 's':
4621            *optset |= PCRE_DOTALL;
4622            continue;
4623
4624            case 'x':
4625            *optset |= PCRE_EXTENDED;
4626            continue;
4627
4628            case 'X':
4629            *optset |= PCRE_EXTRA;
4630            continue;
4631
4632            case 'U':
4633            *optset |= PCRE_UNGREEDY;
4634            continue;
4635
4636            case '-':
4637            optset = &unset;
4638            continue;
4639
4640            /* A termination by ')' indicates an options-setting-only item; if
4641            this is at the very start of the pattern (indicated by item_count
4642            being zero), we use it to set the global options. This is helpful
4643            when analyzing the pattern for first characters, etc. Otherwise
4644            nothing is done here and it is handled during the compiling
4645            process.
4646
4647            We allow for more than one options setting at the start. If such
4648            settings do not change the existing options, nothing is compiled.
4649            However, we must leave space just in case something is compiled.
4650            This can happen for pathological sequences such as (?i)(?-i)
4651            because the global options will end up with -i set. The space is
4652            small and not significant. (Before I did this there was a reported
4653            bug with (?i)(?-i) in a machine-generated pattern.)
4654
4655            [Historical note: Up to Perl 5.8, options settings at top level
4656            were always global settings, wherever they appeared in the pattern.
4657            That is, they were equivalent to an external setting. From 5.8
4658            onwards, they apply only to what follows (which is what you might
4659            expect).] */
4660
4661            case ')':
4662            if (item_count == 0)
4663              {
4664              options = (options | set) & (~unset);
4665              set = unset = 0;     /* To save length */
4666              item_count--;        /* To allow for several */
4667              length += 2;
4668              }
4669
4670            /* Fall through */
4671
4672            /* A termination by ':' indicates the start of a nested group with
4673            the given options set. This is again handled at compile time, but
4674            we must allow for compiled space if any of the ims options are
4675            set. We also have to allow for resetting space at the end of
4676            the group, which is why 4 is added to the length and not just 2.
4677            If there are several changes of options within the same group, this
4678            will lead to an over-estimate on the length, but this shouldn't
4679            matter very much. We also have to allow for resetting options at
4680            the start of any alternations, which we do by setting
4681            branch_newextra to 2. Finally, we record whether the case-dependent
4682            flag ever changes within the regex. This is used by the "required
4683            character" code. */
4684
4685            case ':':
4686            if (((set|unset) & PCRE_IMS) != 0)
4687              {
4688              length += 4;
4689              branch_newextra = 2;
4690              if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4691              }
4692            goto END_OPTIONS;
4693
4694            /* Unrecognized option character */
4695
4696            default:
4697            errorcode = ERR12;
4698            goto PCRE_ERROR_RETURN;
4699            }
4700          }
4701
4702        /* If we hit a closing bracket, that's it - this is a freestanding
4703        option-setting. We need to ensure that branch_extra is updated if
4704        necessary. The only values branch_newextra can have here are 0 or 2.
4705        If the value is 2, then branch_extra must end up as either 2 or
4706        3+LINK_SIZE, depending on whether this is a lookbehind group or not. */
4707
4708        END_OPTIONS:
4709        if (c == ')')
4710          {
4711          if (branch_newextra == 2 &&
4712              (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4713            branch_extra += branch_newextra;
4714          continue;
4715          }
4716
4717        /* If options were terminated by ':' control comes here. This is a
4718        non-capturing group with an options change. There is nothing more that
4719        needs to be done because "capturing" is already set FALSE by default;
4720        we can just fall through. */
4721
4722        }
4723      }
4724
4725    /* Ordinary parentheses, not followed by '?', are capturing unless
4726    PCRE_NO_AUTO_CAPTURE is set. */
4727
4728    else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4729
4730    /* Capturing brackets must be counted so we can process escapes in a
4731    Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4732    an additional 3 bytes of memory per capturing bracket. */
4733
4734    if (capturing)
4735      {
4736      bracount++;
4737      if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4738      }
4739
4740    /* Save length for computing whole length at end if there's a repeat that
4741    requires duplication of the group. Also save the current value of
4742    branch_extra, and start the new group with the new value. If non-zero, this
4743    will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind. */
4744
4745    if (brastackptr >= sizeof(brastack)/sizeof(int))
4746      {
4747      errorcode = ERR19;
4748      goto PCRE_ERROR_RETURN;
4749      }
4750
4751    bralenstack[brastackptr] = branch_extra;
4752    branch_extra = branch_newextra;
4753
4754    brastack[brastackptr++] = length;
4755    length += bracket_length;
4756    continue;
4757
4758    /* Handle ket. Look for subsequent max/min; for certain sets of values we
4759    have to replicate this bracket up to that many times. If brastackptr is
4760    0 this is an unmatched bracket which will generate an error, but take care
4761    not to try to access brastack[-1] when computing the length and restoring
4762    the branch_extra value. */
4763
4764    case ')':
4765    length += 1 + LINK_SIZE;
4766    if (brastackptr > 0)
4767      {
4768      duplength = length - brastack[--brastackptr];
4769      branch_extra = bralenstack[brastackptr];
4770      }
4771    else duplength = 0;
4772
4773    /* The following code is also used when a recursion such as (?3) is
4774    followed by a quantifier, because in that case, it has to be wrapped inside
4775    brackets so that the quantifier works. The value of duplength must be
4776    set before arrival. */
4777
4778    HANDLE_QUANTIFIED_BRACKETS:
4779
4780    /* Leave ptr at the final char; for read_repeat_counts this happens
4781    automatically; for the others we need an increment. */
4782
4783    if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4784      {
4785      ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4786      if (errorcode != 0) goto PCRE_ERROR_RETURN;
4787      }
4788    else if (c == '*') { min = 0; max = -1; ptr++; }
4789    else if (c == '+') { min = 1; max = -1; ptr++; }
4790    else if (c == '?') { min = 0; max = 1;  ptr++; }
4791    else { min = 1; max = 1; }
4792
4793    /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4794    group, and if the maximum is greater than zero, we have to replicate
4795    maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4796    bracket set. */
4797
4798    if (min == 0)
4799      {
4800      length++;
4801      if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4802      }
4803
4804    /* When the minimum is greater than zero, we have to replicate up to
4805    minval-1 times, with no additions required in the copies. Then, if there
4806    is a limited maximum we have to replicate up to maxval-1 times allowing
4807    for a BRAZERO item before each optional copy and nesting brackets for all
4808    but one of the optional copies. */
4809
4810    else
4811      {
4812      length += (min - 1) * duplength;
4813      if (max > min)   /* Need this test as max=-1 means no limit */
4814        length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4815          - (2 + 2*LINK_SIZE);
4816      }
4817
4818    /* Allow space for once brackets for "possessive quantifier" */
4819
4820    if (ptr[1] == '+')
4821      {
4822      ptr++;
4823      length += 2 + 2*LINK_SIZE;
4824      }
4825    continue;
4826
4827    /* Non-special character. It won't be space or # in extended mode, so it is
4828    always a genuine character. If we are in a \Q...\E sequence, check for the
4829    end; if not, we have a literal. */
4830
4831    default:
4832    NORMAL_CHAR:
4833
4834    if (inescq && c == '\\' && ptr[1] == 'E')
4835      {
4836      inescq = FALSE;
4837      ptr++;
4838      continue;
4839      }
4840
4841    length += 2;          /* For a one-byte character */
4842    lastitemlength = 1;   /* Default length of last item for repeats */
4843
4844    /* In UTF-8 mode, check for additional bytes. */
4845
4846#ifdef SUPPORT_UTF8
4847    if (utf8 && (c & 0xc0) == 0xc0)
4848      {
4849      while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
4850        {                                     /* because the end is marked */
4851        lastitemlength++;                     /* by a zero byte. */
4852        length++;
4853        ptr++;
4854        }
4855      }
4856#endif
4857
4858    continue;
4859    }
4860  }
4861
4862length += 2 + LINK_SIZE;    /* For final KET and END */
4863
4864if ((options & PCRE_AUTO_CALLOUT) != 0)
4865  length += 2 + 2*LINK_SIZE;  /* For final callout */
4866
4867if (length > MAX_PATTERN_SIZE)
4868  {
4869  errorcode = ERR20;
4870  goto PCRE_EARLY_ERROR_RETURN;
4871  }
4872
4873/* Compute the size of data block needed and get it, either from malloc or
4874externally provided function. */
4875
4876size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4877re = (real_pcre *)(pcre_malloc)(size);
4878
4879if (re == NULL)
4880  {
4881  errorcode = ERR21;
4882  goto PCRE_EARLY_ERROR_RETURN;
4883  }
4884
4885/* Put in the magic number, and save the sizes, options, and character table
4886pointer. NULL is used for the default character tables. The nullpad field is at
4887the end; it's there to help in the case when a regex compiled on a system with
48884-byte pointers is run on another with 8-byte pointers. */
4889
4890re->magic_number = MAGIC_NUMBER;
4891re->size = size;
4892re->options = options;
4893re->dummy1 = 0;
4894re->name_table_offset = sizeof(real_pcre);
4895re->name_entry_size = max_name_size + 3;
4896re->name_count = name_count;
4897re->ref_count = 0;
4898re->tables = (tables == _pcre_default_tables)? NULL : tables;
4899re->nullpad = NULL;
4900
4901/* The starting points of the name/number translation table and of the code are
4902passed around in the compile data block. */
4903
4904compile_block.names_found = 0;
4905compile_block.name_entry_size = max_name_size + 3;
4906compile_block.name_table = (uschar *)re + re->name_table_offset;
4907codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4908compile_block.start_code = codestart;
4909compile_block.start_pattern = (const uschar *)pattern;
4910compile_block.req_varyopt = 0;
4911compile_block.nopartial = FALSE;
4912
4913/* Set up a starting, non-extracting bracket, then compile the expression. On
4914error, errorcode will be set non-zero, so we don't need to look at the result
4915of the function here. */
4916
4917ptr = (const uschar *)pattern;
4918code = (uschar *)codestart;
4919*code = OP_BRA;
4920bracount = 0;
4921(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4922  &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4923re->top_bracket = bracount;
4924re->top_backref = compile_block.top_backref;
4925
4926if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4927
4928/* If not reached end of pattern on success, there's an excess bracket. */
4929
4930if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4931
4932/* Fill in the terminating state and check for disastrous overflow, but
4933if debugging, leave the test till after things are printed out. */
4934
4935*code++ = OP_END;
4936
4937#ifndef DEBUG
4938if (code - codestart > length) errorcode = ERR23;
4939#endif
4940
4941/* Give an error if there's back reference to a non-existent capturing
4942subpattern. */
4943
4944if (re->top_backref > re->top_bracket) errorcode = ERR15;
4945
4946/* Failed to compile, or error while post-processing */
4947
4948if (errorcode != 0)
4949  {
4950  (pcre_free)(re);
4951  PCRE_ERROR_RETURN:
4952  *erroroffset = ptr - (const uschar *)pattern;
4953  PCRE_EARLY_ERROR_RETURN:
4954  *errorptr = error_texts[errorcode];
4955  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4956  return NULL;
4957  }
4958
4959/* If the anchored option was not passed, set the flag if we can determine that
4960the pattern is anchored by virtue of ^ characters or \A or anything else (such
4961as starting with .* when DOTALL is set).
4962
4963Otherwise, if we know what the first character has to be, save it, because that
4964speeds up unanchored matches no end. If not, see if we can set the
4965PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4966start with ^, and also when all branches start with .* for non-DOTALL matches.
4967*/
4968
4969if ((options & PCRE_ANCHORED) == 0)
4970  {
4971  int temp_options = options;
4972  if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4973    re->options |= PCRE_ANCHORED;
4974  else
4975    {
4976    if (firstbyte < 0)
4977      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4978    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
4979      {
4980      int ch = firstbyte & 255;
4981      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4982         compile_block.fcc[ch] == ch)? ch : firstbyte;
4983      re->options |= PCRE_FIRSTSET;
4984      }
4985    else if (is_startline(codestart, 0, compile_block.backref_map))
4986      re->options |= PCRE_STARTLINE;
4987    }
4988  }
4989
4990/* For an anchored pattern, we use the "required byte" only if it follows a
4991variable length item in the regex. Remove the caseless flag for non-caseable
4992bytes. */
4993
4994if (reqbyte >= 0 &&
4995     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4996  {
4997  int ch = reqbyte & 255;
4998  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
4999    compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5000  re->options |= PCRE_REQCHSET;
5001  }
5002
5003/* Print out the compiled data for debugging */
5004
5005#ifdef DEBUG
5006
5007printf("Length = %d top_bracket = %d top_backref = %d\n",
5008  length, re->top_bracket, re->top_backref);
5009
5010if (re->options != 0)
5011  {
5012  printf("%s%s%s%s%s%s%s%s%s%s\n",
5013    ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5014    ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5015    ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5016    ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5017    ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5018    ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5019    ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5020    ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5021    ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5022    ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5023  }
5024
5025if ((re->options & PCRE_FIRSTSET) != 0)
5026  {
5027  int ch = re->first_byte & 255;
5028  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5029  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5030    else printf("First char = \\x%02x%s\n", ch, caseless);
5031  }
5032
5033if ((re->options & PCRE_REQCHSET) != 0)
5034  {
5035  int ch = re->req_byte & 255;
5036  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5037  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5038    else printf("Req char = \\x%02x%s\n", ch, caseless);
5039  }
5040
5041_pcre_printint(re, stdout);
5042
5043/* This check is done here in the debugging case so that the code that
5044was compiled can be seen. */
5045
5046if (code - codestart > length)
5047  {
5048  (pcre_free)(re);
5049  *errorptr = error_texts[ERR23];
5050  *erroroffset = ptr - (uschar *)pattern;
5051  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5052  return NULL;
5053  }
5054#endif
5055
5056return (pcre *)re;
5057}
5058
5059/* End of pcre_compile.c */
Note: See TracBrowser for help on using the repository browser.