source: project/chicken/branches/prerelease/pcre/pcre_compile.c @ 9381

Last change on this file since 9381 was 9381, checked in by Ivan Raikov, 12 years ago

Merged trunk into prerelease

File size: 192.5 KB
Line 
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2008 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK cd             /* Block containing newline information */
50#define PSSTART start_pattern  /* Field containing processed string start */
51#define PSEND   end_pattern    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When DEBUG is defined, we need the pcre_printint() function, which is also
57used by pcretest. DEBUG is not defined when building a production library. */
58
59#ifdef DEBUG
60#include "pcre_printint.src"
61#endif
62
63
/* Macro for setting individual bits in class bitmaps: bit number b is set in
the byte array a (byte b/8, bit b%8). Both arguments and the whole expansion
are parenthesized so that expression arguments such as SETBIT(map, c + 1)
select the intended byte and bit. Note that b is evaluated twice, so it must
not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67
68/* Maximum length value to check against when making sure that the integer that
69holds the compiled pattern length does not overflow. We make it a bit less than
70INT_MAX to allow for adding in group terminating bytes, so that we don't have
71to check them every time. */
72
73#define OFLOW_MAX (INT_MAX - 20)
74
75
76/*************************************************
77*      Code parameters and static tables         *
78*************************************************/
79
80/* This value specifies the size of stack workspace that is used during the
81first pre-compile phase that determines how much memory is required. The regex
82is partly compiled into this space, but the compiled parts are discarded as
83soon as they can be, so that hopefully there will never be an overrun. The code
84does, however, check for an overrun. The largest amount I've seen used is 218,
85so this number is very generous.
86
87The same workspace is used during the second, actual compile phase for
88remembering forward references to groups so that they can be filled in at the
89end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90is 4 there is plenty of room. */
91
92#define COMPILE_WORK_SIZE (4096)
93
94
95/* Table for handling escaped characters in the range '0'-'z'. Positive returns
96are simple data values; negative values are for special things like \d and so
97on. Zero means further processing is needed (for things like \x), or the escape
98is invalid. */
99
100#ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
101static const short int escapes[] = {
102     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
103     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
104   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
105-ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
106-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
107-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
108   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
109-ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
110-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
111     0,      0, -ESC_z                                            /* x - z */
112};
113
114#else           /* This is the "abnormal" table for EBCDIC systems */
115static const short int escapes[] = {
116/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
117/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
118/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
119/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
120/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
121/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
122/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
123/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
124/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
125/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
126/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
127/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
128/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
129/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
130/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
131/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
132/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
133/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
134/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
135/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
136/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
137/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
138/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
139};
140#endif
141
142
143/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144searched linearly. Put all the names into a single string, in order to reduce
145the number of relocations when a shared library is dynamically linked. */
146
147typedef struct verbitem {
148  int   len;
149  int   op;
150} verbitem;
151
152static const char verbnames[] =
153  "ACCEPT\0"
154  "COMMIT\0"
155  "F\0"
156  "FAIL\0"
157  "PRUNE\0"
158  "SKIP\0"
159  "THEN";
160
161static verbitem verbs[] = {
162  { 6, OP_ACCEPT },
163  { 6, OP_COMMIT },
164  { 1, OP_FAIL },
165  { 4, OP_FAIL },
166  { 5, OP_PRUNE },
167  { 4, OP_SKIP  },
168  { 4, OP_THEN  }
169};
170
171static int verbcount = sizeof(verbs)/sizeof(verbitem);
172
173
174/* Tables of names of POSIX character classes and their lengths. The names are
175now all in a single string, to reduce the number of relocations when a shared
176library is dynamically loaded. The list of lengths is terminated by a zero
177length entry. The first three must be alpha, lower, upper, as this is assumed
178for handling case independence. */
179
180static const char posix_names[] =
181  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
182  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
183  "word\0"   "xdigit";
184
185static const uschar posix_name_lengths[] = {
186  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187
188/* Table of class bit maps for each POSIX class. Each class is formed from a
189base map, with an optional addition or removal of another map. Then, for some
190classes, there is some additional tweaking: for [:blank:] the vertical space
191characters are removed, and for [:alpha:] and [:alnum:] the underscore
192character is removed. The triples in the table consist of the base map offset,
193second map offset or -1 if no second map, and a non-negative value for map
194addition or a negative value for map subtraction (if there are two maps). The
195absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196remove vertical space characters, 2 => remove underscore. */
197
198static const int posix_class_maps[] = {
199  cbit_word,  cbit_digit, -2,             /* alpha */
200  cbit_lower, -1,          0,             /* lower */
201  cbit_upper, -1,          0,             /* upper */
202  cbit_word,  -1,          2,             /* alnum - word without underscore */
203  cbit_print, cbit_cntrl,  0,             /* ascii */
204  cbit_space, -1,          1,             /* blank - a GNU extension */
205  cbit_cntrl, -1,          0,             /* cntrl */
206  cbit_digit, -1,          0,             /* digit */
207  cbit_graph, -1,          0,             /* graph */
208  cbit_print, -1,          0,             /* print */
209  cbit_punct, -1,          0,             /* punct */
210  cbit_space, -1,          0,             /* space */
211  cbit_word,  -1,          0,             /* word - a Perl extension */
212  cbit_xdigit,-1,          0              /* xdigit */
213};
214
215
/* Two-step stringification: XSTRING(s) macro-expands s and then turns the
result into a string literal; STRING(a) stringifies its argument as written. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* The texts of the compile-time error messages. They are "char *" because
they are handed to the outside world. Error numbers are documented, so a number
must never be re-used: always add a new message at the end instead. Messages
marked DEAD are no longer used.

All the messages are held in one long string, each one terminated by a binary
zero, to cut down the number of relocations needed when a shared library is
loaded. A table of offsets cannot be used because the lengths of inserts such
as XSTRING(MAX_NAME_SIZE) are not known; instead, the wanted message is found
by counting through the string, which is fast enough because it happens only
after a compilation error. The number in front of each message is its error
number, that is, its position in the string. */

static const char error_texts[] =
  /*  0 */ "no error\0"
  /*  1 */ "\\ at end of pattern\0"
  /*  2 */ "\\c at end of pattern\0"
  /*  3 */ "unrecognized character follows \\\0"
  /*  4 */ "numbers out of order in {} quantifier\0"
  /*  5 */ "number too big in {} quantifier\0"
  /*  6 */ "missing terminating ] for character class\0"
  /*  7 */ "invalid escape sequence in character class\0"
  /*  8 */ "range out of order in character class\0"
  /*  9 */ "nothing to repeat\0"
  /* 10 */ "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  /* 11 */ "internal error: unexpected repeat\0"
  /* 12 */ "unrecognized character after (? or (?-\0"
  /* 13 */ "POSIX named classes are supported only within a class\0"
  /* 14 */ "missing )\0"
  /* 15 */ "reference to non-existent subpattern\0"
  /* 16 */ "erroffset passed as NULL\0"
  /* 17 */ "unknown option bit(s) set\0"
  /* 18 */ "missing ) after comment\0"
  /* 19 */ "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */ "regular expression is too large\0"
  /* 21 */ "failed to get memory\0"
  /* 22 */ "unmatched parentheses\0"
  /* 23 */ "internal error: code overflow\0"
  /* 24 */ "unrecognized character after (?<\0"
  /* 25 */ "lookbehind assertion is not fixed length\0"
  /* 26 */ "malformed number or name after (?(\0"
  /* 27 */ "conditional group contains more than two branches\0"
  /* 28 */ "assertion expected after (?(\0"
  /* 29 */ "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */ "unknown POSIX class name\0"
  /* 31 */ "POSIX collating elements are not supported\0"
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  /* 33 */ "spare error\0"  /** DEAD **/
  /* 34 */ "character value in \\x{...} sequence is too large\0"
  /* 35 */ "invalid condition (?(0)\0"
  /* 36 */ "\\C not allowed in lookbehind assertion\0"
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  /* 38 */ "number after (?C is > 255\0"
  /* 39 */ "closing ) for (?C expected\0"
  /* 40 */ "recursive call could loop indefinitely\0"
  /* 41 */ "unrecognized character after (?P\0"
  /* 42 */ "syntax error in subpattern name (missing terminator)\0"
  /* 43 */ "two named subpatterns have the same name\0"
  /* 44 */ "invalid UTF-8 string\0"
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled\0"
  /* 46 */ "malformed \\P or \\p sequence\0"
  /* 47 */ "unknown property name after \\P or \\p\0"
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */ "repeated subpattern is too long\0"    /** DEAD **/
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)\0"
  /* 52 */ "internal error: overran compiling workspace\0"
  /* 53 */ "internal error: previously-checked referenced subpattern not found\0"
  /* 54 */ "DEFINE group contains more than one branch\0"
  /* 55 */ "repeating a DEFINE group is not allowed\0"
  /* 56 */ "inconsistent NEWLINE options\0"
  /* 57 */ "\\g is not followed by a braced name or an optionally braced non-zero number\0"
  /* 58 */ "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
  /* 59 */ "(*VERB) with an argument is not supported\0"
  /* 60 */ "(*VERB) not recognized\0"
  /* 61 */ "number is too big\0"
  /* 62 */ "subpattern name expected\0"
  /* 63 */ "digit expected after (?+";
306
307
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code must treat
only 0-9, a-f, and A-F specially, which is why this separate table exists. It
costs 256 bytes, but a table lookup was measurably faster than character value
tests in some simple cases, and some applications want PCRE to compile
efficiently as well as match efficiently.

The bit values are the same as in chartables, so that ctype_digit and
ctype_xdigit can be used with this table:

  0x04   decimal digit
  0x08   hexadecimal digit

Each row below holds sixteen entries; the comment gives the character codes
that the row covers. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00-0x0F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10-0x1F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 0 - ?      */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* @ - O      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - _      */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ` - o      */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - 0x7F   */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0-0xAF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xB0-0xBF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0-0xCF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0-0xDF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0-0xEF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0-0xFF */
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x00-0x0F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10-0x1F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20-0x2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30-0x3F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x40-0x4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x50-0x5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x60-0x6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70-0x7F */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8F: a-f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0-0xAF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xB0-0xBF */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0-0xCF: A-F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0-0xDF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0-0xEF */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0-0xFF: 0-9 */
  };

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 0x00-0x0F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x10-0x1F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x20-0x2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x30-0x3F */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 0x40-0x4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 0x50-0x5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 0x60-0x6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x70-0x7F */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8F */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9F */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xA0-0xAF */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 0xB0-0xBF */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xC0-0xCF */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xD0-0xDF */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xE0-0xEF */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00  /* 0xF0-0xFF */
  };
#endif
430
431
432/* Definition to allow mutual recursion */
433
434static BOOL
435  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436    int *, int *, branch_chain *, compile_data *, int *);
437
438
439
440/*************************************************
441*            Find an error text                  *
442*************************************************/
443
444/* The error texts are now all in one long string, to save on relocations. As
445some of the text is of unknown length, we can't use a table of offsets.
446Instead, just count through the strings. This is not a performance issue
447because it happens only when there has been a compilation error.
448
449Argument:   the error number
450Returns:    pointer to the error string
451*/
452
453static const char *
454find_error_text(int n)
455{
456const char *s = error_texts;
457for (; n > 0; n--) while (*s++ != 0);
458return s;
459}
460
461
462/*************************************************
463*            Handle escapes                      *
464*************************************************/
465
466/* This function is called when a \ has been encountered. It either returns a
467positive value for a simple escape such as \n, or a negative value which
468encodes one of the more complicated things such as \d. A backreference to group
469n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471ptr is pointing at the \. On exit, it is on the final character of the escape
472sequence.
473
474Arguments:
475  ptrptr         points to the pattern position pointer
476  errorcodeptr   points to the errorcode variable
477  bracount       number of previous extracting brackets
478  options        the options bits
479  isclass        TRUE if inside a character class
480
481Returns:         zero or positive => a data character
482                 negative => a special escape sequence
483                 on error, errorcodeptr is set
484*/
485
486static int
487check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488  int options, BOOL isclass)
489{
490BOOL utf8 = (options & PCRE_UTF8) != 0;
491const uschar *ptr = *ptrptr + 1;
492int c, i;
493
494GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
495ptr--;                            /* Set pointer back to the last byte */
496
497/* If backslash is at the end of the pattern, it's an error. */
498
499if (c == 0) *errorcodeptr = ERR1;
500
501/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502in a table. A non-zero result is something that can be returned immediately.
503Otherwise further processing may be required. */
504
505#ifndef EBCDIC  /* ASCII coding */
506else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
507else if ((i = escapes[c - '0']) != 0) c = i;
508
509#else           /* EBCDIC coding */
510else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
511else if ((i = escapes[c - 0x48]) != 0)  c = i;
512#endif
513
514/* Escapes that need further processing, or are illegal. */
515
516else
517  {
518  const uschar *oldptr;
519  BOOL braced, negated;
520
521  switch (c)
522    {
523    /* A number of Perl escapes are not handled by PCRE. We give an explicit
524    error. */
525
526    case 'l':
527    case 'L':
528    case 'N':
529    case 'u':
530    case 'U':
531    *errorcodeptr = ERR37;
532    break;
533
534    /* \g must be followed by a number, either plain or braced. If positive, it
535    is an absolute backreference. If negative, it is a relative backreference.
536    This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537    reference to a named group. This is part of Perl's movement towards a
538    unified syntax for back references. As this is synonymous with \k{name}, we
539    fudge it up by pretending it really was \k. */
540
541    case 'g':
542    if (ptr[1] == '{')
543      {
544      const uschar *p;
545      for (p = ptr+2; *p != 0 && *p != '}'; p++)
546        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547      if (*p != 0 && *p != '}')
548        {
549        c = -ESC_k;
550        break;
551        }
552      braced = TRUE;
553      ptr++;
554      }
555    else braced = FALSE;
556
557    if (ptr[1] == '-')
558      {
559      negated = TRUE;
560      ptr++;
561      }
562    else negated = FALSE;
563
564    c = 0;
565    while ((digitab[ptr[1]] & ctype_digit) != 0)
566      c = c * 10 + *(++ptr) - '0';
567
568    if (c < 0)
569      {
570      *errorcodeptr = ERR61;
571      break;
572      }
573
574    if (c == 0 || (braced && *(++ptr) != '}'))
575      {
576      *errorcodeptr = ERR57;
577      break;
578      }
579
580    if (negated)
581      {
582      if (c > bracount)
583        {
584        *errorcodeptr = ERR15;
585        break;
586        }
587      c = bracount - (c - 1);
588      }
589
590    c = -(ESC_REF + c);
591    break;
592
593    /* The handling of escape sequences consisting of a string of digits
594    starting with one that is not zero is not straightforward. By experiment,
595    the way Perl works seems to be as follows:
596
597    Outside a character class, the digits are read as a decimal number. If the
598    number is less than 10, or if there are that many previous extracting
599    left brackets, then it is a back reference. Otherwise, up to three octal
600    digits are read to form an escaped byte. Thus \123 is likely to be octal
601    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602    value is greater than 377, the least significant 8 bits are taken. Inside a
603    character class, \ followed by a digit is always an octal number. */
604
605    case '1': case '2': case '3': case '4': case '5':
606    case '6': case '7': case '8': case '9':
607
608    if (!isclass)
609      {
610      oldptr = ptr;
611      c -= '0';
612      while ((digitab[ptr[1]] & ctype_digit) != 0)
613        c = c * 10 + *(++ptr) - '0';
614      if (c < 0)
615        {
616        *errorcodeptr = ERR61;
617        break;
618        }
619      if (c < 10 || c <= bracount)
620        {
621        c = -(ESC_REF + c);
622        break;
623        }
624      ptr = oldptr;      /* Put the pointer back and fall through */
625      }
626
627    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628    generates a binary zero byte and treats the digit as a following literal.
629    Thus we have to pull back the pointer by one. */
630
631    if ((c = *ptr) >= '8')
632      {
633      ptr--;
634      c = 0;
635      break;
636      }
637
638    /* \0 always starts an octal number, but we may drop through to here with a
639    larger first octal digit. The original code used just to take the least
640    significant 8 bits of octal numbers (I think this is what early Perls used
641    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642    than 3 octal digits. */
643
644    case '0':
645    c -= '0';
646    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647        c = c * 8 + *(++ptr) - '0';
648    if (!utf8 && c > 255) *errorcodeptr = ERR51;
649    break;
650
651    /* \x is complicated. \x{ddd} is a character number which can be greater
652    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653    treated as a data character. */
654
655    case 'x':
656    if (ptr[1] == '{')
657      {
658      const uschar *pt = ptr + 2;
659      int count = 0;
660
661      c = 0;
662      while ((digitab[*pt] & ctype_xdigit) != 0)
663        {
664        register int cc = *pt++;
665        if (c == 0 && cc == '0') continue;     /* Leading zeroes */
666        count++;
667
668#ifndef EBCDIC  /* ASCII coding */
669        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
670        c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671#else           /* EBCDIC coding */
672        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
673        c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674#endif
675        }
676
677      if (*pt == '}')
678        {
679        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680        ptr = pt;
681        break;
682        }
683
684      /* If the sequence of hex digits does not end with '}', then we don't
685      recognize this construct; fall through to the normal \x handling. */
686      }
687
688    /* Read just a single-byte hex-defined char */
689
690    c = 0;
691    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692      {
693      int cc;                               /* Some compilers don't like ++ */
694      cc = *(++ptr);                        /* in initializers */
695#ifndef EBCDIC  /* ASCII coding */
696      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
697      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698#else           /* EBCDIC coding */
699      if (cc <= 'z') cc += 64;              /* Convert to upper case */
700      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701#endif
702      }
703    break;
704
705    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706    This coding is ASCII-specific, but then the whole concept of \cx is
707    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708
709    case 'c':
710    c = *(++ptr);
711    if (c == 0)
712      {
713      *errorcodeptr = ERR2;
714      break;
715      }
716
717#ifndef EBCDIC  /* ASCII coding */
718    if (c >= 'a' && c <= 'z') c -= 32;
719    c ^= 0x40;
720#else           /* EBCDIC coding */
721    if (c >= 'a' && c <= 'z') c += 64;
722    c ^= 0xC0;
723#endif
724    break;
725
726    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727    other alphanumeric following \ is an error if PCRE_EXTRA was set;
728    otherwise, for Perl compatibility, it is a literal. This code looks a bit
729    odd, but there used to be some cases other than the default, and there may
730    be again in future, so I haven't "optimized" it. */
731
732    default:
733    if ((options & PCRE_EXTRA) != 0) switch(c)
734      {
735      default:
736      *errorcodeptr = ERR3;
737      break;
738      }
739    break;
740    }
741  }
742
743*ptrptr = ptr;
744return c;
745}
746
747
748
#ifdef SUPPORT_UCP
/*************************************************
*        Parse the name after \P or \p           *
*************************************************/

/* Called once \P or \p has been seen (this code exists only when Unicode
property support is compiled in). On entry the pattern pointer addresses the
P or p; on return it addresses the last character consumed.

The name is either a single character, or a sequence in braces that may start
with ^ to request negation. A braced name is limited to what fits in the local
buffer (30 characters plus terminator).

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean: set TRUE if ^ was given, else FALSE
  dptr           points to an int that receives the value field of the
                   matching property table entry
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching property table entry, or -1
                   with ERR46 (malformed sequence) or ERR47 (unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: optional ^, then the name, then the closing brace. Leaving
the loop without having seen '}' means the name was too long. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }

  if (ch != '}') goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the property name table. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* Name not in the table; *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
838
839
840
841
842/*************************************************
843*            Check for counted repeat            *
844*************************************************/
845
846/* This function is called when a '{' is encountered in a place where it might
847start a quantifier. It looks ahead to see if it really is a quantifier or not.
848It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849where the ddds are digits.
850
851Arguments:
852  p         pointer to the first char after '{'
853
854Returns:    TRUE or FALSE
855*/
856
857static BOOL
858is_counted_repeat(const uschar *p)
859{
860if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861while ((digitab[*p] & ctype_digit) != 0) p++;
862if (*p == '}') return TRUE;
863
864if (*p++ != ',') return FALSE;
865if (*p == '}') return TRUE;
866
867if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868while ((digitab[*p] & ctype_digit) != 0) p++;
869
870return (*p == '}');
871}
872
873
874
875/*************************************************
876*         Read repeat counts                     *
877*************************************************/
878
879/* Read an item of the form {n,m} and return the values. This is called only
880after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881so the syntax is guaranteed to be correct, but we need to check the values.
882
883Arguments:
884  p              pointer to first char after '{'
885  minp           pointer to int for min
886  maxp           pointer to int for max
887                 returned as -1 if no max
888  errorcodeptr   points to error code variable
889
890Returns:         pointer to '}' on success;
891                 current ptr on error, with errorcodeptr set non-zero
892*/
893
894static const uschar *
895read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896{
897int min = 0;
898int max = -1;
899
900/* Read the minimum value and do a paranoid check: a negative value indicates
901an integer overflow. */
902
903while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904if (min < 0 || min > 65535)
905  {
906  *errorcodeptr = ERR5;
907  return p;
908  }
909
910/* Read the maximum value if there is one, and again do a paranoid on its size.
911Also, max must not be less than min. */
912
913if (*p == '}') max = min; else
914  {
915  if (*(++p) != '}')
916    {
917    max = 0;
918    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919    if (max < 0 || max > 65535)
920      {
921      *errorcodeptr = ERR5;
922      return p;
923      }
924    if (max < min)
925      {
926      *errorcodeptr = ERR4;
927      return p;
928      }
929    }
930  }
931
932/* Fill in the required variables, and pass back the pointer to the terminating
933'}'. */
934
935*minp = min;
936*maxp = max;
937return p;
938}
939
940
941
942/*************************************************
943*       Find forward referenced subpattern       *
944*************************************************/
945
946/* This function scans along a pattern's text looking for capturing
947subpatterns, and counting them. If it finds a named pattern that matches the
948name it is given, it returns its number. Alternatively, if the name is NULL, it
949returns when it reaches a given numbered subpattern. This is used for forward
950references to subpatterns. We know that if (?P< is encountered, the name will
951be terminated by '>' because that is checked in the first pass.
952
953Arguments:
954  ptr          current position in the pattern
955  count        current count of capturing parens so far encountered
956  name         name to seek, or NULL if seeking a numbered subpattern
957  lorn         name length, or subpattern number if name is NULL
958  xmode        TRUE if we are in /x mode
959
960Returns:       the number of the named subpattern, or -1 if not found
961*/
962
963static int
964find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965  BOOL xmode)
966{
967const uschar *thisname;
968
969for (; *ptr != 0; ptr++)
970  {
971  int term;
972
973  /* Skip over backslashed characters and also entire \Q...\E */
974
975  if (*ptr == '\\')
976    {
977    if (*(++ptr) == 0) return -1;
978    if (*ptr == 'Q') for (;;)
979      {
980      while (*(++ptr) != 0 && *ptr != '\\');
981      if (*ptr == 0) return -1;
982      if (*(++ptr) == 'E') break;
983      }
984    continue;
985    }
986
987  /* Skip over character classes */
988
989  if (*ptr == '[')
990    {
991    while (*(++ptr) != ']')
992      {
993      if (*ptr == 0) return -1;
994      if (*ptr == '\\')
995        {
996        if (*(++ptr) == 0) return -1;
997        if (*ptr == 'Q') for (;;)
998          {
999          while (*(++ptr) != 0 && *ptr != '\\');
1000          if (*ptr == 0) return -1;
1001          if (*(++ptr) == 'E') break;
1002          }
1003        continue;
1004        }
1005      }
1006    continue;
1007    }
1008
1009  /* Skip comments in /x mode */
1010
1011  if (xmode && *ptr == '#')
1012    {
1013    while (*(++ptr) != 0 && *ptr != '\n');
1014    if (*ptr == 0) return -1;
1015    continue;
1016    }
1017
1018  /* An opening parens must now be a real metacharacter */
1019
1020  if (*ptr != '(') continue;
1021  if (ptr[1] != '?' && ptr[1] != '*')
1022    {
1023    count++;
1024    if (name == NULL && count == lorn) return count;
1025    continue;
1026    }
1027
1028  ptr += 2;
1029  if (*ptr == 'P') ptr++;                      /* Allow optional P */
1030
1031  /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034       *ptr != '\'')
1035    continue;
1036
1037  count++;
1038
1039  if (name == NULL && count == lorn) return count;
1040  term = *ptr++;
1041  if (term == '<') term = '>';
1042  thisname = ptr;
1043  while (*ptr != term) ptr++;
1044  if (name != NULL && lorn == ptr - thisname &&
1045      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046    return count;
1047  }
1048
1049return -1;
1050}
1051
1052
1053
1054/*************************************************
1055*      Find first significant op code            *
1056*************************************************/
1057
1058/* This is called by several functions that scan a compiled expression looking
1059for a fixed first character, or an anchoring op code etc. It skips over things
1060that do not influence this. For some calls, a change of option is important.
1061For some calls, it makes sense to skip negative forward and all backward
1062assertions, and also the \b assertion; for others it does not.
1063
1064Arguments:
1065  code         pointer to the start of the group
1066  options      pointer to external options
1067  optbit       the option bit whose changing is significant, or
1068                 zero if none are
1069  skipassert   TRUE if certain assertions are to be skipped
1070
1071Returns:       pointer to the first significant opcode
1072*/
1073
1074static const uschar*
1075first_significant_code(const uschar *code, int *options, int optbit,
1076  BOOL skipassert)
1077{
1078for (;;)
1079  {
1080  switch ((int)*code)
1081    {
1082    case OP_OPT:
1083    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084      *options = (int)code[1];
1085    code += 2;
1086    break;
1087
1088    case OP_ASSERT_NOT:
1089    case OP_ASSERTBACK:
1090    case OP_ASSERTBACK_NOT:
1091    if (!skipassert) return code;
1092    do code += GET(code, 1); while (*code == OP_ALT);
1093    code += _pcre_OP_lengths[*code];
1094    break;
1095
1096    case OP_WORD_BOUNDARY:
1097    case OP_NOT_WORD_BOUNDARY:
1098    if (!skipassert) return code;
1099    /* Fall through */
1100
1101    case OP_CALLOUT:
1102    case OP_CREF:
1103    case OP_RREF:
1104    case OP_DEF:
1105    code += _pcre_OP_lengths[*code];
1106    break;
1107
1108    default:
1109    return code;
1110    }
1111  }
1112/* Control never reaches here */
1113}
1114
1115
1116
1117
1118/*************************************************
1119*        Find the fixed length of a pattern      *
1120*************************************************/
1121
1122/* Scan a pattern and compute the fixed length of subject that will match it,
1123if the length is fixed. This is needed for dealing with backward assertions.
1124In UTF8 mode, the result is in characters rather than bytes.
1125
1126Arguments:
1127  code     points to the start of the pattern (the bracket)
1128  options  the compiling options
1129
1130Returns:   the fixed length, or -1 if there is no fixed length,
1131             or -2 if \C was encountered
1132*/
1133
1134static int
1135find_fixedlength(uschar *code, int options)
1136{
1137int length = -1;
1138
1139register int branchlength = 0;
1140register uschar *cc = code + 1 + LINK_SIZE;
1141
1142/* Scan along the opcodes for this branch. If we get to the end of the
1143branch, check the length against that of the other branches. */
1144
1145for (;;)
1146  {
1147  int d;
1148  register int op = *cc;
1149  switch (op)
1150    {
1151    case OP_CBRA:
1152    case OP_BRA:
1153    case OP_ONCE:
1154    case OP_COND:
1155    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156    if (d < 0) return d;
1157    branchlength += d;
1158    do cc += GET(cc, 1); while (*cc == OP_ALT);
1159    cc += 1 + LINK_SIZE;
1160    break;
1161
1162    /* Reached end of a branch; if it's a ket it is the end of a nested
1163    call. If it's ALT it is an alternation in a nested call. If it is
1164    END it's the end of the outer call. All can be handled by the same code. */
1165
1166    case OP_ALT:
1167    case OP_KET:
1168    case OP_KETRMAX:
1169    case OP_KETRMIN:
1170    case OP_END:
1171    if (length < 0) length = branchlength;
1172      else if (length != branchlength) return -1;
1173    if (*cc != OP_ALT) return length;
1174    cc += 1 + LINK_SIZE;
1175    branchlength = 0;
1176    break;
1177
1178    /* Skip over assertive subpatterns */
1179
1180    case OP_ASSERT:
1181    case OP_ASSERT_NOT:
1182    case OP_ASSERTBACK:
1183    case OP_ASSERTBACK_NOT:
1184    do cc += GET(cc, 1); while (*cc == OP_ALT);
1185    /* Fall through */
1186
1187    /* Skip over things that don't match chars */
1188
1189    case OP_REVERSE:
1190    case OP_CREF:
1191    case OP_RREF:
1192    case OP_DEF:
1193    case OP_OPT:
1194    case OP_CALLOUT:
1195    case OP_SOD:
1196    case OP_SOM:
1197    case OP_EOD:
1198    case OP_EODN:
1199    case OP_CIRC:
1200    case OP_DOLL:
1201    case OP_NOT_WORD_BOUNDARY:
1202    case OP_WORD_BOUNDARY:
1203    cc += _pcre_OP_lengths[*cc];
1204    break;
1205
1206    /* Handle literal characters */
1207
1208    case OP_CHAR:
1209    case OP_CHARNC:
1210    case OP_NOT:
1211    branchlength++;
1212    cc += 2;
1213#ifdef SUPPORT_UTF8
1214    if ((options & PCRE_UTF8) != 0)
1215      {
1216      while ((*cc & 0xc0) == 0x80) cc++;
1217      }
1218#endif
1219    break;
1220
1221    /* Handle exact repetitions. The count is already in characters, but we
1222    need to skip over a multibyte character in UTF8 mode.  */
1223
1224    case OP_EXACT:
1225    branchlength += GET2(cc,1);
1226    cc += 4;
1227#ifdef SUPPORT_UTF8
1228    if ((options & PCRE_UTF8) != 0)
1229      {
1230      while((*cc & 0x80) == 0x80) cc++;
1231      }
1232#endif
1233    break;
1234
1235    case OP_TYPEEXACT:
1236    branchlength += GET2(cc,1);
1237    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238    cc += 4;
1239    break;
1240
1241    /* Handle single-char matchers */
1242
1243    case OP_PROP:
1244    case OP_NOTPROP:
1245    cc += 2;
1246    /* Fall through */
1247
1248    case OP_NOT_DIGIT:
1249    case OP_DIGIT:
1250    case OP_NOT_WHITESPACE:
1251    case OP_WHITESPACE:
1252    case OP_NOT_WORDCHAR:
1253    case OP_WORDCHAR:
1254    case OP_ANY:
1255    branchlength++;
1256    cc++;
1257    break;
1258
1259    /* The single-byte matcher isn't allowed */
1260
1261    case OP_ANYBYTE:
1262    return -2;
1263
1264    /* Check a class for variable quantification */
1265
1266#ifdef SUPPORT_UTF8
1267    case OP_XCLASS:
1268    cc += GET(cc, 1) - 33;
1269    /* Fall through */
1270#endif
1271
1272    case OP_CLASS:
1273    case OP_NCLASS:
1274    cc += 33;
1275
1276    switch (*cc)
1277      {
1278      case OP_CRSTAR:
1279      case OP_CRMINSTAR:
1280      case OP_CRQUERY:
1281      case OP_CRMINQUERY:
1282      return -1;
1283
1284      case OP_CRRANGE:
1285      case OP_CRMINRANGE:
1286      if (GET2(cc,1) != GET2(cc,3)) return -1;
1287      branchlength += GET2(cc,1);
1288      cc += 5;
1289      break;
1290
1291      default:
1292      branchlength++;
1293      }
1294    break;
1295
1296    /* Anything else is variable length */
1297
1298    default:
1299    return -1;
1300    }
1301  }
1302/* Control never gets here */
1303}
1304
1305
1306
1307
1308/*************************************************
1309*    Scan compiled regex for numbered bracket    *
1310*************************************************/
1311
1312/* This little function scans through a compiled pattern until it finds a
1313capturing bracket with the given number.
1314
1315Arguments:
1316  code        points to start of expression
1317  utf8        TRUE in UTF-8 mode
1318  number      the required bracket number
1319
1320Returns:      pointer to the opcode for the bracket, or NULL if not found
1321*/
1322
1323static const uschar *
1324find_bracket(const uschar *code, BOOL utf8, int number)
1325{
1326for (;;)
1327  {
1328  register int c = *code;
1329  if (c == OP_END) return NULL;
1330
1331  /* XCLASS is used for classes that cannot be represented just by a bit
1332  map. This includes negated single high-valued characters. The length in
1333  the table is zero; the actual length is stored in the compiled code. */
1334
1335  if (c == OP_XCLASS) code += GET(code, 1);
1336
1337  /* Handle capturing bracket */
1338
1339  else if (c == OP_CBRA)
1340    {
1341    int n = GET2(code, 1+LINK_SIZE);
1342    if (n == number) return (uschar *)code;
1343    code += _pcre_OP_lengths[c];
1344    }
1345
1346  /* Otherwise, we can get the item's length from the table, except that for
1347  repeated character types, we have to test for \p and \P, which have an extra
1348  two bytes of parameters. */
1349
1350  else
1351    {
1352    switch(c)
1353      {
1354      case OP_TYPESTAR:
1355      case OP_TYPEMINSTAR:
1356      case OP_TYPEPLUS:
1357      case OP_TYPEMINPLUS:
1358      case OP_TYPEQUERY:
1359      case OP_TYPEMINQUERY:
1360      case OP_TYPEPOSSTAR:
1361      case OP_TYPEPOSPLUS:
1362      case OP_TYPEPOSQUERY:
1363      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364      break;
1365
1366      case OP_TYPEUPTO:
1367      case OP_TYPEMINUPTO:
1368      case OP_TYPEEXACT:
1369      case OP_TYPEPOSUPTO:
1370      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371      break;
1372      }
1373
1374    /* Add in the fixed length from the table */
1375
1376    code += _pcre_OP_lengths[c];
1377
1378  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379  a multi-byte character. The length in the table is a minimum, so we have to
1380  arrange to skip the extra bytes. */
1381
1382#ifdef SUPPORT_UTF8
1383    if (utf8) switch(c)
1384      {
1385      case OP_CHAR:
1386      case OP_CHARNC:
1387      case OP_EXACT:
1388      case OP_UPTO:
1389      case OP_MINUPTO:
1390      case OP_POSUPTO:
1391      case OP_STAR:
1392      case OP_MINSTAR:
1393      case OP_POSSTAR:
1394      case OP_PLUS:
1395      case OP_MINPLUS:
1396      case OP_POSPLUS:
1397      case OP_QUERY:
1398      case OP_MINQUERY:
1399      case OP_POSQUERY:
1400      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401      break;
1402      }
1403#endif
1404    }
1405  }
1406}
1407
1408
1409
1410/*************************************************
1411*   Scan compiled regex for recursion reference  *
1412*************************************************/
1413
1414/* This little function scans through a compiled pattern until it finds an
1415instance of OP_RECURSE.
1416
1417Arguments:
1418  code        points to start of expression
1419  utf8        TRUE in UTF-8 mode
1420
1421Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1422*/
1423
1424static const uschar *
1425find_recurse(const uschar *code, BOOL utf8)
1426{
1427for (;;)
1428  {
1429  register int c = *code;
1430  if (c == OP_END) return NULL;
1431  if (c == OP_RECURSE) return code;
1432
1433  /* XCLASS is used for classes that cannot be represented just by a bit
1434  map. This includes negated single high-valued characters. The length in
1435  the table is zero; the actual length is stored in the compiled code. */
1436
1437  if (c == OP_XCLASS) code += GET(code, 1);
1438
1439  /* Otherwise, we can get the item's length from the table, except that for
1440  repeated character types, we have to test for \p and \P, which have an extra
1441  two bytes of parameters. */
1442
1443  else
1444    {
1445    switch(c)
1446      {
1447      case OP_TYPESTAR:
1448      case OP_TYPEMINSTAR:
1449      case OP_TYPEPLUS:
1450      case OP_TYPEMINPLUS:
1451      case OP_TYPEQUERY:
1452      case OP_TYPEMINQUERY:
1453      case OP_TYPEPOSSTAR:
1454      case OP_TYPEPOSPLUS:
1455      case OP_TYPEPOSQUERY:
1456      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457      break;
1458
1459      case OP_TYPEPOSUPTO:
1460      case OP_TYPEUPTO:
1461      case OP_TYPEMINUPTO:
1462      case OP_TYPEEXACT:
1463      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464      break;
1465      }
1466
1467    /* Add in the fixed length from the table */
1468
1469    code += _pcre_OP_lengths[c];
1470
1471    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472    by a multi-byte character. The length in the table is a minimum, so we have
1473    to arrange to skip the extra bytes. */
1474
1475#ifdef SUPPORT_UTF8
1476    if (utf8) switch(c)
1477      {
1478      case OP_CHAR:
1479      case OP_CHARNC:
1480      case OP_EXACT:
1481      case OP_UPTO:
1482      case OP_MINUPTO:
1483      case OP_POSUPTO:
1484      case OP_STAR:
1485      case OP_MINSTAR:
1486      case OP_POSSTAR:
1487      case OP_PLUS:
1488      case OP_MINPLUS:
1489      case OP_POSPLUS:
1490      case OP_QUERY:
1491      case OP_MINQUERY:
1492      case OP_POSQUERY:
1493      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494      break;
1495      }
1496#endif
1497    }
1498  }
1499}
1500
1501
1502
1503/*************************************************
1504*    Scan compiled branch for non-emptiness      *
1505*************************************************/
1506
1507/* This function scans through a branch of a compiled pattern to see whether it
1508can match the empty string or not. It is called from could_be_empty()
1509below and from compile_branch() when checking for an unlimited repeat of a
1510group that can match nothing. Note that first_significant_code() skips over
1511backward and negative forward assertions when its final argument is TRUE. If we
1512hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513bracket whose current branch will already have been scanned.
1514
1515Arguments:
1516  code        points to start of search
1517  endcode     points to where to stop
1518  utf8        TRUE if in UTF8 mode
1519
1520Returns:      TRUE if what is matched could be empty
1521*/
1522
1523static BOOL
1524could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525{
1526register int c;
1527for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528     code < endcode;
1529     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530  {
1531  const uschar *ccode;
1532
1533  c = *code;
1534
1535  /* Skip over forward assertions; the other assertions are skipped by
1536  first_significant_code() with a TRUE final argument. */
1537
1538  if (c == OP_ASSERT)
1539    {
1540    do code += GET(code, 1); while (*code == OP_ALT);
1541    c = *code;
1542    continue;
1543    }
1544
1545  /* Groups with zero repeats can of course be empty; skip them. */
1546
1547  if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548    {
1549    code += _pcre_OP_lengths[c];
1550    do code += GET(code, 1); while (*code == OP_ALT);
1551    c = *code;
1552    continue;
1553    }
1554
1555  /* For other groups, scan the branches. */
1556
1557  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558    {
1559    BOOL empty_branch;
1560    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1561
1562    /* Scan a closed bracket */
1563
1564    empty_branch = FALSE;
1565    do
1566      {
1567      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568        empty_branch = TRUE;
1569      code += GET(code, 1);
1570      }
1571    while (*code == OP_ALT);
1572    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1573    c = *code;
1574    continue;
1575    }
1576
1577  /* Handle the other opcodes */
1578
1579  switch (c)
1580    {
1581    /* Check for quantifiers after a class. XCLASS is used for classes that
1582    cannot be represented just by a bit map. This includes negated single
1583    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584    actual length is stored in the compiled code, so we must update "code"
1585    here. */
1586
1587#ifdef SUPPORT_UTF8
1588    case OP_XCLASS:
1589    ccode = code += GET(code, 1);
1590    goto CHECK_CLASS_REPEAT;
1591#endif
1592
1593    case OP_CLASS:
1594    case OP_NCLASS:
1595    ccode = code + 33;
1596
1597#ifdef SUPPORT_UTF8
1598    CHECK_CLASS_REPEAT:
1599#endif
1600
1601    switch (*ccode)
1602      {
1603      case OP_CRSTAR:            /* These could be empty; continue */
1604      case OP_CRMINSTAR:
1605      case OP_CRQUERY:
1606      case OP_CRMINQUERY:
1607      break;
1608
1609      default:                   /* Non-repeat => class must match */
1610      case OP_CRPLUS:            /* These repeats aren't empty */
1611      case OP_CRMINPLUS:
1612      return FALSE;
1613
1614      case OP_CRRANGE:
1615      case OP_CRMINRANGE:
1616      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1617      break;
1618      }
1619    break;
1620
1621    /* Opcodes that must match a character */
1622
1623    case OP_PROP:
1624    case OP_NOTPROP:
1625    case OP_EXTUNI:
1626    case OP_NOT_DIGIT:
1627    case OP_DIGIT:
1628    case OP_NOT_WHITESPACE:
1629    case OP_WHITESPACE:
1630    case OP_NOT_WORDCHAR:
1631    case OP_WORDCHAR:
1632    case OP_ANY:
1633    case OP_ANYBYTE:
1634    case OP_CHAR:
1635    case OP_CHARNC:
1636    case OP_NOT:
1637    case OP_PLUS:
1638    case OP_MINPLUS:
1639    case OP_POSPLUS:
1640    case OP_EXACT:
1641    case OP_NOTPLUS:
1642    case OP_NOTMINPLUS:
1643    case OP_NOTPOSPLUS:
1644    case OP_NOTEXACT:
1645    case OP_TYPEPLUS:
1646    case OP_TYPEMINPLUS:
1647    case OP_TYPEPOSPLUS:
1648    case OP_TYPEEXACT:
1649    return FALSE;
1650
1651    /* These are going to continue, as they may be empty, but we have to
1652    fudge the length for the \p and \P cases. */
1653
1654    case OP_TYPESTAR:
1655    case OP_TYPEMINSTAR:
1656    case OP_TYPEPOSSTAR:
1657    case OP_TYPEQUERY:
1658    case OP_TYPEMINQUERY:
1659    case OP_TYPEPOSQUERY:
1660    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661    break;
1662
1663    /* Same for these */
1664
1665    case OP_TYPEUPTO:
1666    case OP_TYPEMINUPTO:
1667    case OP_TYPEPOSUPTO:
1668    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669    break;
1670
1671    /* End of branch */
1672
1673    case OP_KET:
1674    case OP_KETRMAX:
1675    case OP_KETRMIN:
1676    case OP_ALT:
1677    return TRUE;
1678
1679    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680    MINUPTO, and POSUPTO may be followed by a multibyte character */
1681
1682#ifdef SUPPORT_UTF8
1683    case OP_STAR:
1684    case OP_MINSTAR:
1685    case OP_POSSTAR:
1686    case OP_QUERY:
1687    case OP_MINQUERY:
1688    case OP_POSQUERY:
1689    case OP_UPTO:
1690    case OP_MINUPTO:
1691    case OP_POSUPTO:
1692    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693    break;
1694#endif
1695    }
1696  }
1697
1698return TRUE;
1699}
1700
1701
1702
1703/*************************************************
1704*    Scan compiled regex for non-emptiness       *
1705*************************************************/
1706
1707/* This function is called to check for left recursive calls. We want to check
1708the current branch of the current pattern to see if it could match the empty
1709string. If it could, we must look outwards for branches at other levels,
1710stopping when we pass beyond the bracket which is the subject of the recursion.
1711
1712Arguments:
1713  code        points to start of the recursion
1714  endcode     points to where to stop (current RECURSE item)
1715  bcptr       points to the chain of current (unclosed) branch starts
1716  utf8        TRUE if in UTF-8 mode
1717
1718Returns:      TRUE if what is matched could be empty
1719*/
1720
1721static BOOL
1722could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723  BOOL utf8)
1724{
1725while (bcptr != NULL && bcptr->current >= code)
1726  {
1727  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728  bcptr = bcptr->outer;
1729  }
1730return TRUE;
1731}
1732
1733
1734
1735/*************************************************
1736*           Check for POSIX class syntax         *
1737*************************************************/
1738
1739/* This function is called when the sequence "[:" or "[." or "[=" is
1740encountered in a character class. It checks whether this is followed by a
1741sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742reach an unescaped ']' without the special preceding character, return FALSE.
1743
1744Originally, this function only recognized a sequence of letters between the
1745terminators, but it seems that Perl recognizes any sequence of characters,
1746though of course unknown POSIX names are subsequently rejected. Perl gives an
1747"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748didn't consider this to be a POSIX class. Likewise for [:1234:].
1749
1750The problem in trying to be exactly like Perl is in the handling of escapes. We
1751have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753below handles the special case of \], but does not try to do any other escape
1754processing. This makes it different from Perl for cases such as [:l\ower:]
1755where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1757I think.
1758
1759Arguments:
1760  ptr      pointer to the initial [
1761  endptr   where to return the end pointer
1762
1763Returns:   TRUE or FALSE
1764*/
1765
1766static BOOL
1767check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768{
1769int terminator;          /* Don't combine these lines; the Solaris cc */
1770terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1771for (++ptr; *ptr != 0; ptr++)
1772  {
1773  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774    {
1775    if (*ptr == ']') return FALSE;
1776    if (*ptr == terminator && ptr[1] == ']')
1777      {
1778      *endptr = ptr;
1779      return TRUE;
1780      }
1781    }
1782  }
1783return FALSE;
1784}
1785
1786
1787
1788
1789/*************************************************
1790*          Check POSIX class name                *
1791*************************************************/
1792
1793/* This function is called to check the name given in a POSIX-style class entry
1794such as [:alnum:].
1795
1796Arguments:
1797  ptr        points to the first letter
1798  len        the length of the name
1799
1800Returns:     a value representing the name, or -1 if unknown
1801*/
1802
1803static int
1804check_posix_name(const uschar *ptr, int len)
1805{
1806const char *pn = posix_names;
1807register int yield = 0;
1808while (posix_name_lengths[yield] != 0)
1809  {
1810  if (len == posix_name_lengths[yield] &&
1811    strncmp((const char *)ptr, pn, len) == 0) return yield;
1812  pn += posix_name_lengths[yield] + 1;
1813  yield++;
1814  }
1815return -1;
1816}
1817
1818
1819/*************************************************
1820*    Adjust OP_RECURSE items in repeated group   *
1821*************************************************/
1822
1823/* OP_RECURSE items contain an offset from the start of the regex to the group
1824that is referenced. This means that groups can be replicated for fixed
1825repetition simply by copying (because the recursion is allowed to refer to
1826earlier groups that are outside the current group). However, when a group is
1827optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828it, after it has been compiled. This means that any OP_RECURSE items within it
1829that refer to the group itself or any contained groups have to have their
1830offsets adjusted. That one of the jobs of this function. Before it is called,
1831the partially compiled regex must be temporarily terminated with OP_END.
1832
1833This function has been extended with the possibility of forward references for
1834recursions and subroutine calls. It must also check the list of such references
1835for the group we are dealing with. If it finds that one of the recursions in
1836the current group is on this list, it adjusts the offset in the list, not the
1837value in the reference (which is a group number).
1838
1839Arguments:
1840  group      points to the start of the group
1841  adjust     the amount by which the group is to be moved
1842  utf8       TRUE in UTF-8 mode
1843  cd         contains pointers to tables etc.
1844  save_hwm   the hwm forward reference pointer at the start of the group
1845
1846Returns:     nothing
1847*/
1848
1849static void
1850adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851  uschar *save_hwm)
1852{
1853uschar *ptr = group;
1854
1855while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856  {
1857  int offset;
1858  uschar *hc;
1859
1860  /* See if this recursion is on the forward reference list. If so, adjust the
1861  reference. */
1862
1863  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864    {
1865    offset = GET(hc, 0);
1866    if (cd->start_code + offset == ptr + 1)
1867      {
1868      PUT(hc, 0, offset + adjust);
1869      break;
1870      }
1871    }
1872
1873  /* Otherwise, adjust the recursion offset if it's after the start of this
1874  group. */
1875
1876  if (hc >= cd->hwm)
1877    {
1878    offset = GET(ptr, 1);
1879    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880    }
1881
1882  ptr += 1 + LINK_SIZE;
1883  }
1884}
1885
1886
1887
1888/*************************************************
1889*        Insert an automatic callout point       *
1890*************************************************/
1891
1892/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893callout points before each pattern item.
1894
1895Arguments:
1896  code           current code pointer
1897  ptr            current pattern pointer
1898  cd             pointers to tables etc
1899
1900Returns:         new code pointer
1901*/
1902
1903static uschar *
1904auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905{
1906*code++ = OP_CALLOUT;
1907*code++ = 255;
1908PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1909PUT(code, LINK_SIZE, 0);                /* Default length */
1910return code + 2*LINK_SIZE;
1911}
1912
1913
1914
1915/*************************************************
1916*         Complete a callout item                *
1917*************************************************/
1918
1919/* A callout item contains the length of the next item in the pattern, which
1920we can't fill in till after we have reached the relevant point. This is used
1921for both automatic and manual callouts.
1922
1923Arguments:
1924  previous_callout   points to previous callout item
1925  ptr                current pattern pointer
1926  cd                 pointers to tables etc
1927
1928Returns:             nothing
1929*/
1930
1931static void
1932complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933{
1934int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935PUT(previous_callout, 2 + LINK_SIZE, length);
1936}
1937
1938
1939
1940#ifdef SUPPORT_UCP
1941/*************************************************
1942*           Get othercase range                  *
1943*************************************************/
1944
1945/* This function is passed the start and end of a class range, in UTF-8 mode
1946with UCP support. It searches up the characters, looking for internal ranges of
1947characters in the "other" case. Each call returns the next one, updating the
1948start address.
1949
1950Arguments:
1951  cptr        points to starting character value; updated
1952  d           end value
1953  ocptr       where to put start of othercase range
1954  odptr       where to put end of othercase range
1955
1956Yield:        TRUE when range returned; FALSE when no more
1957*/
1958
1959static BOOL
1960get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1961  unsigned int *odptr)
1962{
1963unsigned int c, othercase, next;
1964
1965for (c = *cptr; c <= d; c++)
1966  { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1967
1968if (c > d) return FALSE;
1969
1970*ocptr = othercase;
1971next = othercase + 1;
1972
1973for (++c; c <= d; c++)
1974  {
1975  if (_pcre_ucp_othercase(c) != next) break;
1976  next++;
1977  }
1978
1979*odptr = next - 1;
1980*cptr = c;
1981
1982return TRUE;
1983}
1984#endif  /* SUPPORT_UCP */
1985
1986
1987
1988/*************************************************
1989*     Check if auto-possessifying is possible    *
1990*************************************************/
1991
1992/* This function is called for unlimited repeats of certain items, to see
1993whether the next thing could possibly match the repeated item. If not, it makes
1994sense to automatically possessify the repeated item.
1995
1996Arguments:
1997  op_code       the repeated op code
1998  this          data for this item, depends on the opcode
1999  utf8          TRUE in UTF-8 mode
2000  utf8_char     used for utf8 character bytes, NULL if not relevant
2001  ptr           next character in pattern
2002  options       options bits
2003  cd            contains pointers to tables etc.
2004
2005Returns:        TRUE if possessifying is wanted
2006*/
2007
2008static BOOL
2009check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010  const uschar *ptr, int options, compile_data *cd)
2011{
2012int next;
2013
2014/* Skip whitespace and comments in extended mode */
2015
2016if ((options & PCRE_EXTENDED) != 0)
2017  {
2018  for (;;)
2019    {
2020    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021    if (*ptr == '#')
2022      {
2023      while (*(++ptr) != 0)
2024        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025      }
2026    else break;
2027    }
2028  }
2029
2030/* If the next item is one that we can handle, get its value. A non-negative
2031value is a character, a negative value is an escape value. */
2032
2033if (*ptr == '\\')
2034  {
2035  int temperrorcode = 0;
2036  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037  if (temperrorcode != 0) return FALSE;
2038  ptr++;    /* Point after the escape sequence */
2039  }
2040
2041else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042  {
2043#ifdef SUPPORT_UTF8
2044  if (utf8) { GETCHARINC(next, ptr); } else
2045#endif
2046  next = *ptr++;
2047  }
2048
2049else return FALSE;
2050
2051/* Skip whitespace and comments in extended mode */
2052
2053if ((options & PCRE_EXTENDED) != 0)
2054  {
2055  for (;;)
2056    {
2057    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058    if (*ptr == '#')
2059      {
2060      while (*(++ptr) != 0)
2061        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062      }
2063    else break;
2064    }
2065  }
2066
2067/* If the next thing is itself optional, we have to give up. */
2068
2069if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070  return FALSE;
2071
2072/* Now compare the next item with the previous opcode. If the previous is a
2073positive single character match, "item" either contains the character or, if
2074"item" is greater than 127 in utf8 mode, the character's bytes are in
2075utf8_char. */
2076
2077
2078/* Handle cases when the next item is a character. */
2079
2080if (next >= 0) switch(op_code)
2081  {
2082  case OP_CHAR:
2083#ifdef SUPPORT_UTF8
2084  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085#endif
2086  return item != next;
2087
2088  /* For CHARNC (caseless character) we must check the other case. If we have
2089  Unicode property support, we can use it to test the other case of
2090  high-valued characters. */
2091
2092  case OP_CHARNC:
2093#ifdef SUPPORT_UTF8
2094  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095#endif
2096  if (item == next) return FALSE;
2097#ifdef SUPPORT_UTF8
2098  if (utf8)
2099    {
2100    unsigned int othercase;
2101    if (next < 128) othercase = cd->fcc[next]; else
2102#ifdef SUPPORT_UCP
2103    othercase = _pcre_ucp_othercase((unsigned int)next);
2104#else
2105    othercase = NOTACHAR;
2106#endif
2107    return (unsigned int)item != othercase;
2108    }
2109  else
2110#endif  /* SUPPORT_UTF8 */
2111  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2112
2113  /* For OP_NOT, "item" must be a single-byte character. */
2114
2115  case OP_NOT:
2116  if (next < 0) return FALSE;  /* Not a character */
2117  if (item == next) return TRUE;
2118  if ((options & PCRE_CASELESS) == 0) return FALSE;
2119#ifdef SUPPORT_UTF8
2120  if (utf8)
2121    {
2122    unsigned int othercase;
2123    if (next < 128) othercase = cd->fcc[next]; else
2124#ifdef SUPPORT_UCP
2125    othercase = _pcre_ucp_othercase(next);
2126#else
2127    othercase = NOTACHAR;
2128#endif
2129    return (unsigned int)item == othercase;
2130    }
2131  else
2132#endif  /* SUPPORT_UTF8 */
2133  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2134
2135  case OP_DIGIT:
2136  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2137
2138  case OP_NOT_DIGIT:
2139  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2140
2141  case OP_WHITESPACE:
2142  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2143
2144  case OP_NOT_WHITESPACE:
2145  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2146
2147  case OP_WORDCHAR:
2148  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2149
2150  case OP_NOT_WORDCHAR:
2151  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2152
2153  case OP_HSPACE:
2154  case OP_NOT_HSPACE:
2155  switch(next)
2156    {
2157    case 0x09:
2158    case 0x20:
2159    case 0xa0:
2160    case 0x1680:
2161    case 0x180e:
2162    case 0x2000:
2163    case 0x2001:
2164    case 0x2002:
2165    case 0x2003:
2166    case 0x2004:
2167    case 0x2005:
2168    case 0x2006:
2169    case 0x2007:
2170    case 0x2008:
2171    case 0x2009:
2172    case 0x200A:
2173    case 0x202f:
2174    case 0x205f:
2175    case 0x3000:
2176    return op_code != OP_HSPACE;
2177    default:
2178    return op_code == OP_HSPACE;
2179    }
2180
2181  case OP_VSPACE:
2182  case OP_NOT_VSPACE:
2183  switch(next)
2184    {
2185    case 0x0a:
2186    case 0x0b:
2187    case 0x0c:
2188    case 0x0d:
2189    case 0x85:
2190    case 0x2028:
2191    case 0x2029:
2192    return op_code != OP_VSPACE;
2193    default:
2194    return op_code == OP_VSPACE;
2195    }
2196
2197  default:
2198  return FALSE;
2199  }
2200
2201
2202/* Handle the case when the next item is \d, \s, etc. */
2203
2204switch(op_code)
2205  {
2206  case OP_CHAR:
2207  case OP_CHARNC:
2208#ifdef SUPPORT_UTF8
2209  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2210#endif
2211  switch(-next)
2212    {
2213    case ESC_d:
2214    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2215
2216    case ESC_D:
2217    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2218
2219    case ESC_s:
2220    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2221
2222    case ESC_S:
2223    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2224
2225    case ESC_w:
2226    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2227
2228    case ESC_W:
2229    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2230
2231    case ESC_h:
2232    case ESC_H:
2233    switch(item)
2234      {
2235      case 0x09:
2236      case 0x20:
2237      case 0xa0:
2238      case 0x1680:
2239      case 0x180e:
2240      case 0x2000:
2241      case 0x2001:
2242      case 0x2002:
2243      case 0x2003:
2244      case 0x2004:
2245      case 0x2005:
2246      case 0x2006:
2247      case 0x2007:
2248      case 0x2008:
2249      case 0x2009:
2250      case 0x200A:
2251      case 0x202f:
2252      case 0x205f:
2253      case 0x3000:
2254      return -next != ESC_h;
2255      default:
2256      return -next == ESC_h;
2257      }
2258
2259    case ESC_v:
2260    case ESC_V:
2261    switch(item)
2262      {
2263      case 0x0a:
2264      case 0x0b:
2265      case 0x0c:
2266      case 0x0d:
2267      case 0x85:
2268      case 0x2028:
2269      case 0x2029:
2270      return -next != ESC_v;
2271      default:
2272      return -next == ESC_v;
2273      }
2274
2275    default:
2276    return FALSE;
2277    }
2278
2279  case OP_DIGIT:
2280  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2281         next == -ESC_h || next == -ESC_v;
2282
2283  case OP_NOT_DIGIT:
2284  return next == -ESC_d;
2285
2286  case OP_WHITESPACE:
2287  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2288
2289  case OP_NOT_WHITESPACE:
2290  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2291
2292  case OP_HSPACE:
2293  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2294
2295  case OP_NOT_HSPACE:
2296  return next == -ESC_h;
2297
2298  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2299  case OP_VSPACE:
2300  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2301
2302  case OP_NOT_VSPACE:
2303  return next == -ESC_v;
2304
2305  case OP_WORDCHAR:
2306  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2307
2308  case OP_NOT_WORDCHAR:
2309  return next == -ESC_w || next == -ESC_d;
2310
2311  default:
2312  return FALSE;
2313  }
2314
2315/* Control does not reach here */
2316}
2317
2318
2319
2320/*************************************************
2321*           Compile one branch                   *
2322*************************************************/
2323
2324/* Scan the pattern, compiling it into a vector. If the options are
2325changed during the branch, the pointer is used to change the external options
2326bits. This function is used during the pre-compile phase when we are trying
2327to find out the amount of memory needed, as well as during the real compile
2328phase. The value of lengthptr distinguishes the two phases.
2329
2330Arguments:
2331  optionsptr     pointer to the option bits
2332  codeptr        points to the pointer to the current code point
2333  ptrptr         points to the current pattern pointer
2334  errorcodeptr   points to error code variable
2335  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2336  reqbyteptr     set to the last literal character required, else < 0
2337  bcptr          points to current branch chain
2338  cd             contains pointers to tables etc.
2339  lengthptr      NULL during the real compile phase
2340                 points to length accumulator during pre-compile phase
2341
2342Returns:         TRUE on success
2343                 FALSE, with *errorcodeptr set non-zero on error
2344*/
2345
2346static BOOL
2347compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2348  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2349  compile_data *cd, int *lengthptr)
2350{
2351int repeat_type, op_type;
2352int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2353int bravalue = 0;
2354int greedy_default, greedy_non_default;
2355int firstbyte, reqbyte;
2356int zeroreqbyte, zerofirstbyte;
2357int req_caseopt, reqvary, tempreqvary;
2358int options = *optionsptr;
2359int after_manual_callout = 0;
2360int length_prevgroup = 0;
2361register int c;
2362register uschar *code = *codeptr;
2363uschar *last_code = code;
2364uschar *orig_code = code;
2365uschar *tempcode;
2366BOOL inescq = FALSE;
2367BOOL groupsetfirstbyte = FALSE;
2368const uschar *ptr = *ptrptr;
2369const uschar *tempptr;
2370uschar *previous = NULL;
2371uschar *previous_callout = NULL;
2372uschar *save_hwm = NULL;
2373uschar classbits[32];
2374
2375#ifdef SUPPORT_UTF8
2376BOOL class_utf8;
2377BOOL utf8 = (options & PCRE_UTF8) != 0;
2378uschar *class_utf8data;
2379uschar *class_utf8data_base;
2380uschar utf8_char[6];
2381#else
2382BOOL utf8 = FALSE;
2383uschar *utf8_char = NULL;
2384#endif
2385
2386#ifdef DEBUG
2387if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2388#endif
2389
2390/* Set up the default and non-default settings for greediness */
2391
2392greedy_default = ((options & PCRE_UNGREEDY) != 0);
2393greedy_non_default = greedy_default ^ 1;
2394
2395/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2396matching encountered yet". It gets changed to REQ_NONE if we hit something that
2397matches a non-fixed char first char; reqbyte just remains unset if we never
2398find one.
2399
2400When we hit a repeat whose minimum is zero, we may have to adjust these values
2401to take the zero repeat into account. This is implemented by setting them to
2402zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2403item types that can be repeated set these backoff variables appropriately. */
2404
2405firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2406
2407/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2408according to the current setting of the caseless flag. REQ_CASELESS is a bit
2409value > 255. It is added into the firstbyte or reqbyte variables to record the
2410case status of the value. This is used only for ASCII characters. */
2411
2412req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2413
2414/* Switch on next character until the end of the branch */
2415
2416for (;; ptr++)
2417  {
2418  BOOL negate_class;
2419  BOOL should_flip_negation;
2420  BOOL possessive_quantifier;
2421  BOOL is_quantifier;
2422  BOOL is_recurse;
2423  BOOL reset_bracount;
2424  int class_charcount;
2425  int class_lastchar;
2426  int newoptions;
2427  int recno;
2428  int refsign;
2429  int skipbytes;
2430  int subreqbyte;
2431  int subfirstbyte;
2432  int terminator;
2433  int mclength;
2434  uschar mcbuffer[8];
2435
2436  /* Get next byte in the pattern */
2437
2438  c = *ptr;
2439
2440  /* If we are in the pre-compile phase, accumulate the length used for the
2441  previous cycle of this loop. */
2442
2443  if (lengthptr != NULL)
2444    {
2445#ifdef DEBUG
2446    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2447#endif
2448    if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2449      {
2450      *errorcodeptr = ERR52;
2451      goto FAILED;
2452      }
2453
2454    /* There is at least one situation where code goes backwards: this is the
2455    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2456    the class is simply eliminated. However, it is created first, so we have to
2457    allow memory for it. Therefore, don't ever reduce the length at this point.
2458    */
2459
2460    if (code < last_code) code = last_code;
2461
2462    /* Paranoid check for integer overflow */
2463
2464    if (OFLOW_MAX - *lengthptr < code - last_code)
2465      {
2466      *errorcodeptr = ERR20;
2467      goto FAILED;
2468      }
2469
2470    *lengthptr += code - last_code;
2471    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2472
2473    /* If "previous" is set and it is not at the start of the work space, move
2474    it back to there, in order to avoid filling up the work space. Otherwise,
2475    if "previous" is NULL, reset the current code pointer to the start. */
2476
2477    if (previous != NULL)
2478      {
2479      if (previous > orig_code)
2480        {
2481        memmove(orig_code, previous, code - previous);
2482        code -= previous - orig_code;
2483        previous = orig_code;
2484        }
2485      }
2486    else code = orig_code;
2487
2488    /* Remember where this code item starts so we can pick up the length
2489    next time round. */
2490
2491    last_code = code;
2492    }
2493
2494  /* In the real compile phase, just check the workspace used by the forward
2495  reference list. */
2496
2497  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2498    {
2499    *errorcodeptr = ERR52;
2500    goto FAILED;
2501    }
2502
2503  /* If in \Q...\E, check for the end; if not, we have a literal */
2504
2505  if (inescq && c != 0)
2506    {
2507    if (c == '\\' && ptr[1] == 'E')
2508      {
2509      inescq = FALSE;
2510      ptr++;
2511      continue;
2512      }
2513    else
2514      {
2515      if (previous_callout != NULL)
2516        {
2517        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2518          complete_callout(previous_callout, ptr, cd);
2519        previous_callout = NULL;
2520        }
2521      if ((options & PCRE_AUTO_CALLOUT) != 0)
2522        {
2523        previous_callout = code;
2524        code = auto_callout(code, ptr, cd);
2525        }
2526      goto NORMAL_CHAR;
2527      }
2528    }
2529
2530  /* Fill in length of a previous callout, except when the next thing is
2531  a quantifier. */
2532
2533  is_quantifier = c == '*' || c == '+' || c == '?' ||
2534    (c == '{' && is_counted_repeat(ptr+1));
2535
2536  if (!is_quantifier && previous_callout != NULL &&
2537       after_manual_callout-- <= 0)
2538    {
2539    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2540      complete_callout(previous_callout, ptr, cd);
2541    previous_callout = NULL;
2542    }
2543
2544  /* In extended mode, skip white space and comments */
2545
2546  if ((options & PCRE_EXTENDED) != 0)
2547    {
2548    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2549    if (c == '#')
2550      {
2551      while (*(++ptr) != 0)
2552        {
2553        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2554        }
2555      if (*ptr != 0) continue;
2556
2557      /* Else fall through to handle end of string */
2558      c = 0;
2559      }
2560    }
2561
2562  /* No auto callout for quantifiers. */
2563
2564  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2565    {
2566    previous_callout = code;
2567    code = auto_callout(code, ptr, cd);
2568    }
2569
2570  switch(c)
2571    {
2572    /* ===================================================================*/
2573    case 0:                        /* The branch terminates at string end */
2574    case '|':                      /* or | or ) */
2575    case ')':
2576    *firstbyteptr = firstbyte;
2577    *reqbyteptr = reqbyte;
2578    *codeptr = code;
2579    *ptrptr = ptr;
2580    if (lengthptr != NULL)
2581      {
2582      if (OFLOW_MAX - *lengthptr < code - last_code)
2583        {
2584        *errorcodeptr = ERR20;
2585        goto FAILED;
2586        }
2587      *lengthptr += code - last_code;   /* To include callout length */
2588      DPRINTF((">> end branch\n"));
2589      }
2590    return TRUE;
2591
2592
2593    /* ===================================================================*/
2594    /* Handle single-character metacharacters. In multiline mode, ^ disables
2595    the setting of any following char as a first character. */
2596
2597    case '^':
2598    if ((options & PCRE_MULTILINE) != 0)
2599      {
2600      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2601      }
2602    previous = NULL;
2603    *code++ = OP_CIRC;
2604    break;
2605
2606    case '$':
2607    previous = NULL;
2608    *code++ = OP_DOLL;
2609    break;
2610
2611    /* There can never be a first char if '.' is first, whatever happens about
2612    repeats. The value of reqbyte doesn't change either. */
2613
2614    case '.':
2615    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2616    zerofirstbyte = firstbyte;
2617    zeroreqbyte = reqbyte;
2618    previous = code;
2619    *code++ = OP_ANY;
2620    break;
2621
2622
2623    /* ===================================================================*/
2624    /* Character classes. If the included characters are all < 256, we build a
2625    32-byte bitmap of the permitted characters, except in the special case
2626    where there is only one such character. For negated classes, we build the
2627    map as usual, then invert it at the end. However, we use a different opcode
2628    so that data characters > 255 can be handled correctly.
2629
2630    If the class contains characters outside the 0-255 range, a different
2631    opcode is compiled. It may optionally have a bit map for characters < 256,
2632    but those above are are explicitly listed afterwards. A flag byte tells
2633    whether the bitmap is present, and whether this is a negated class or not.
2634    */
2635
2636    case '[':
2637    previous = code;
2638
2639    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2640    they are encountered at the top level, so we'll do that too. */
2641
2642    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2643        check_posix_syntax(ptr, &tempptr))
2644      {
2645      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2646      goto FAILED;
2647      }
2648
2649    /* If the first character is '^', set the negation flag and skip it. Also,
2650    if the first few characters (either before or after ^) are \Q\E or \E we
2651    skip them too. This makes for compatibility with Perl. */
2652
2653    negate_class = FALSE;
2654    for (;;)
2655      {
2656      c = *(++ptr);
2657      if (c == '\\')
2658        {
2659        if (ptr[1] == 'E') ptr++;
2660          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2661            else break;
2662        }
2663      else if (!negate_class && c == '^')
2664        negate_class = TRUE;
2665      else break;
2666      }
2667
2668    /* If a class contains a negative special such as \S, we need to flip the
2669    negation flag at the end, so that support for characters > 255 works
2670    correctly (they are all included in the class). */
2671
2672    should_flip_negation = FALSE;
2673
2674    /* Keep a count of chars with values < 256 so that we can optimize the case
2675    of just a single character (as long as it's < 256). However, For higher
2676    valued UTF-8 characters, we don't yet do any optimization. */
2677
2678    class_charcount = 0;
2679    class_lastchar = -1;
2680
2681    /* Initialize the 32-char bit map to all zeros. We build the map in a
2682    temporary bit of memory, in case the class contains only 1 character (less
2683    than 256), because in that case the compiled code doesn't use the bit map.
2684    */
2685
2686    memset(classbits, 0, 32 * sizeof(uschar));
2687
2688#ifdef SUPPORT_UTF8
2689    class_utf8 = FALSE;                       /* No chars >= 256 */
2690    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2691    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2692#endif
2693
2694    /* Process characters until ] is reached. By writing this as a "do" it
2695    means that an initial ] is taken as a data character. At the start of the
2696    loop, c contains the first byte of the character. */
2697
2698    if (c != 0) do
2699      {
2700      const uschar *oldptr;
2701
2702#ifdef SUPPORT_UTF8
2703      if (utf8 && c > 127)
2704        {                           /* Braces are required because the */
2705        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2706        }
2707
2708      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2709      data and reset the pointer. This is so that very large classes that
2710      contain a zillion UTF-8 characters no longer overwrite the work space
2711      (which is on the stack). */
2712
2713      if (lengthptr != NULL)
2714        {
2715        *lengthptr += class_utf8data - class_utf8data_base;
2716        class_utf8data = class_utf8data_base;
2717        }
2718
2719#endif
2720
2721      /* Inside \Q...\E everything is literal except \E */
2722
2723      if (inescq)
2724        {
2725        if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2726          {
2727          inescq = FALSE;                   /* Reset literal state */
2728          ptr++;                            /* Skip the 'E' */
2729          continue;                         /* Carry on with next */
2730          }
2731        goto CHECK_RANGE;                   /* Could be range if \E follows */
2732        }
2733
2734      /* Handle POSIX class names. Perl allows a negation extension of the
2735      form [:^name:]. A square bracket that doesn't match the syntax is
2736      treated as a literal. We also recognize the POSIX constructions
2737      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2738      5.6 and 5.8 do. */
2739
2740      if (c == '[' &&
2741          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2742          check_posix_syntax(ptr, &tempptr))
2743        {
2744        BOOL local_negate = FALSE;
2745        int posix_class, taboffset, tabopt;
2746        register const uschar *cbits = cd->cbits;
2747        uschar pbits[32];
2748
2749        if (ptr[1] != ':')
2750          {
2751          *errorcodeptr = ERR31;
2752          goto FAILED;
2753          }
2754
2755        ptr += 2;
2756        if (*ptr == '^')
2757          {
2758          local_negate = TRUE;
2759          should_flip_negation = TRUE;  /* Note negative special */
2760          ptr++;
2761          }
2762
2763        posix_class = check_posix_name(ptr, tempptr - ptr);
2764        if (posix_class < 0)
2765          {
2766          *errorcodeptr = ERR30;
2767          goto FAILED;
2768          }
2769
2770        /* If matching is caseless, upper and lower are converted to
2771        alpha. This relies on the fact that the class table starts with
2772        alpha, lower, upper as the first 3 entries. */
2773
2774        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2775          posix_class = 0;
2776
2777        /* We build the bit map for the POSIX class in a chunk of local store
2778        because we may be adding and subtracting from it, and we don't want to
2779        subtract bits that may be in the main map already. At the end we or the
2780        result into the bit map that is being built. */
2781
2782        posix_class *= 3;
2783
2784        /* Copy in the first table (always present) */
2785
2786        memcpy(pbits, cbits + posix_class_maps[posix_class],
2787          32 * sizeof(uschar));
2788
2789        /* If there is a second table, add or remove it as required. */
2790
2791        taboffset = posix_class_maps[posix_class + 1];
2792        tabopt = posix_class_maps[posix_class + 2];
2793
2794        if (taboffset >= 0)
2795          {
2796          if (tabopt >= 0)
2797            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2798          else
2799            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2800          }
2801
2802        /* Not see if we need to remove any special characters. An option
2803        value of 1 removes vertical space and 2 removes underscore. */
2804
2805        if (tabopt < 0) tabopt = -tabopt;
2806        if (tabopt == 1) pbits[1] &= ~0x3c;
2807          else if (tabopt == 2) pbits[11] &= 0x7f;
2808
2809        /* Add the POSIX table or its complement into the main table that is
2810        being built and we are done. */
2811
2812        if (local_negate)
2813          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2814        else
2815          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2816
2817        ptr = tempptr + 1;
2818        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2819        continue;    /* End of POSIX syntax handling */
2820        }
2821
2822      /* Backslash may introduce a single character, or it may introduce one
2823      of the specials, which just set a flag. The sequence \b is a special
2824      case. Inside a class (and only there) it is treated as backspace.
2825      Elsewhere it marks a word boundary. Other escapes have preset maps ready
2826      to 'or' into the one we are building. We assume they have more than one
2827      character in them, so set class_charcount bigger than one. */
2828
2829      if (c == '\\')
2830        {
2831        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2832        if (*errorcodeptr != 0) goto FAILED;
2833
2834        if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2835        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2836        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2837        else if (-c == ESC_Q)            /* Handle start of quoted string */
2838          {
2839          if (ptr[1] == '\\' && ptr[2] == 'E')
2840            {
2841            ptr += 2; /* avoid empty string */
2842            }
2843          else inescq = TRUE;
2844          continue;
2845          }
2846        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2847
2848        if (c < 0)
2849          {
2850          register const uschar *cbits = cd->cbits;
2851          class_charcount += 2;     /* Greater than 1 is what matters */
2852
2853          /* Save time by not doing this in the pre-compile phase. */
2854
2855          if (lengthptr == NULL) switch (-c)
2856            {
2857            case ESC_d:
2858            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2859            continue;
2860
2861            case ESC_D:
2862            should_flip_negation = TRUE;
2863            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2864            continue;
2865
2866            case ESC_w:
2867            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2868            continue;
2869
2870            case ESC_W:
2871            should_flip_negation = TRUE;
2872            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2873            continue;
2874
2875            case ESC_s:
2876            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2877            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2878            continue;
2879
2880            case ESC_S:
2881            should_flip_negation = TRUE;
2882            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2883            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2884            continue;
2885
2886            default:    /* Not recognized; fall through */
2887            break;      /* Need "default" setting to stop compiler warning. */
2888            }
2889
2890          /* In the pre-compile phase, just do the recognition. */
2891
2892          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2893                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2894
2895          /* We need to deal with \H, \h, \V, and \v in both phases because
2896          they use extra memory. */
2897
2898          if (-c == ESC_h)
2899            {
2900            SETBIT(classbits, 0x09); /* HT */
2901            SETBIT(classbits, 0x20); /* SPACE */
2902            SETBIT(classbits, 0xa0); /* NBSP */
2903#ifdef SUPPORT_UTF8
2904            if (utf8)
2905              {
2906              class_utf8 = TRUE;
2907              *class_utf8data++ = XCL_SINGLE;
2908              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2909              *class_utf8data++ = XCL_SINGLE;
2910              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2911              *class_utf8data++ = XCL_RANGE;
2912              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2913              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2914              *class_utf8data++ = XCL_SINGLE;
2915              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2916              *class_utf8data++ = XCL_SINGLE;
2917              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2918              *class_utf8data++ = XCL_SINGLE;
2919              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2920              }
2921#endif
2922            continue;
2923            }
2924
2925          if (-c == ESC_H)
2926            {
2927            for (c = 0; c < 32; c++)
2928              {
2929              int x = 0xff;
2930              switch (c)
2931                {
2932                case 0x09/8: x ^= 1 << (0x09%8); break;
2933                case 0x20/8: x ^= 1 << (0x20%8); break;
2934                case 0xa0/8: x ^= 1 << (0xa0%8); break;
2935                default: break;
2936                }
2937              classbits[c] |= x;
2938              }
2939
2940#ifdef SUPPORT_UTF8
2941            if (utf8)
2942              {
2943              class_utf8 = TRUE;
2944              *class_utf8data++ = XCL_RANGE;
2945              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2946              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2947              *class_utf8data++ = XCL_RANGE;
2948              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2949              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2950              *class_utf8data++ = XCL_RANGE;
2951              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2952              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2953              *class_utf8data++ = XCL_RANGE;
2954              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2955              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2956              *class_utf8data++ = XCL_RANGE;
2957              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2958              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2959              *class_utf8data++ = XCL_RANGE;
2960              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2961              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2962              *class_utf8data++ = XCL_RANGE;
2963              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2964              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2965              }
2966#endif
2967            continue;
2968            }
2969
2970          if (-c == ESC_v)
2971            {
2972            SETBIT(classbits, 0x0a); /* LF */
2973            SETBIT(classbits, 0x0b); /* VT */
2974            SETBIT(classbits, 0x0c); /* FF */
2975            SETBIT(classbits, 0x0d); /* CR */
2976            SETBIT(classbits, 0x85); /* NEL */
2977#ifdef SUPPORT_UTF8
2978            if (utf8)
2979              {
2980              class_utf8 = TRUE;
2981              *class_utf8data++ = XCL_RANGE;
2982              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2983              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2984              }
2985#endif
2986            continue;
2987            }
2988
2989          if (-c == ESC_V)
2990            {
2991            for (c = 0; c < 32; c++)
2992              {
2993              int x = 0xff;
2994              switch (c)
2995                {
2996                case 0x0a/8: x ^= 1 << (0x0a%8);
2997                             x ^= 1 << (0x0b%8);
2998                             x ^= 1 << (0x0c%8);
2999                             x ^= 1 << (0x0d%8);
3000                             break;
3001                case 0x85/8: x ^= 1 << (0x85%8); break;
3002                default: break;
3003                }
3004              classbits[c] |= x;
3005              }
3006
3007#ifdef SUPPORT_UTF8
3008            if (utf8)
3009              {
3010              class_utf8 = TRUE;
3011              *class_utf8data++ = XCL_RANGE;
3012              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3013              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3014              *class_utf8data++ = XCL_RANGE;
3015              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3016              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3017              }
3018#endif
3019            continue;
3020            }
3021
3022          /* We need to deal with \P and \p in both phases. */
3023
3024#ifdef SUPPORT_UCP
3025          if (-c == ESC_p || -c == ESC_P)
3026            {
3027            BOOL negated;
3028            int pdata;
3029            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3030            if (ptype < 0) goto FAILED;
3031            class_utf8 = TRUE;
3032            *class_utf8data++ = ((-c == ESC_p) != negated)?
3033              XCL_PROP : XCL_NOTPROP;
3034            *class_utf8data++ = ptype;
3035            *class_utf8data++ = pdata;
3036            class_charcount -= 2;   /* Not a < 256 character */
3037            continue;
3038            }
3039#endif
3040          /* Unrecognized escapes are faulted if PCRE is running in its
3041          strict mode. By default, for compatibility with Perl, they are
3042          treated as literals. */
3043
3044          if ((options & PCRE_EXTRA) != 0)
3045            {
3046            *errorcodeptr = ERR7;
3047            goto FAILED;
3048            }
3049
3050          class_charcount -= 2;  /* Undo the default count from above */
3051          c = *ptr;              /* Get the final character and fall through */
3052          }
3053
3054        /* Fall through if we have a single character (c >= 0). This may be
3055        greater than 256 in UTF-8 mode. */
3056
3057        }   /* End of backslash handling */
3058
3059      /* A single character may be followed by '-' to form a range. However,
3060      Perl does not permit ']' to be the end of the range. A '-' character
3061      at the end is treated as a literal. Perl ignores orphaned \E sequences
3062      entirely. The code for handling \Q and \E is messy. */
3063
3064      CHECK_RANGE:
3065      while (ptr[1] == '\\' && ptr[2] == 'E')
3066        {
3067        inescq = FALSE;
3068        ptr += 2;
3069        }
3070
3071      oldptr = ptr;
3072
3073      /* Remember \r or \n */
3074
3075      if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3076
3077      /* Check for range */
3078
3079      if (!inescq && ptr[1] == '-')
3080        {
3081        int d;
3082        ptr += 2;
3083        while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3084
3085        /* If we hit \Q (not followed by \E) at this point, go into escaped
3086        mode. */
3087
3088        while (*ptr == '\\' && ptr[1] == 'Q')
3089          {
3090          ptr += 2;
3091          if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3092          inescq = TRUE;
3093          break;
3094          }
3095
3096        if (*ptr == 0 || (!inescq && *ptr == ']'))
3097          {
3098          ptr = oldptr;
3099          goto LONE_SINGLE_CHARACTER;
3100          }
3101
3102#ifdef SUPPORT_UTF8
3103        if (utf8)
3104          {                           /* Braces are required because the */
3105          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3106          }
3107        else
3108#endif
3109        d = *ptr;  /* Not UTF-8 mode */
3110
3111        /* The second part of a range can be a single-character escape, but
3112        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3113        in such circumstances. */
3114
3115        if (!inescq && d == '\\')
3116          {
3117          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3118          if (*errorcodeptr != 0) goto FAILED;
3119
3120          /* \b is backspace; \X is literal X; \R is literal R; any other
3121          special means the '-' was literal */
3122
3123          if (d < 0)
3124            {
3125            if (d == -ESC_b) d = '\b';
3126            else if (d == -ESC_X) d = 'X';
3127            else if (d == -ESC_R) d = 'R'; else
3128              {
3129              ptr = oldptr;
3130              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3131              }
3132            }
3133          }
3134
3135        /* Check that the two values are in the correct order. Optimize
3136        one-character ranges */
3137
3138        if (d < c)
3139          {
3140          *errorcodeptr = ERR8;
3141          goto FAILED;
3142          }
3143
3144        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3145
3146        /* Remember \r or \n */
3147
3148        if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3149
3150        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3151        matching, we have to use an XCLASS with extra data items. Caseless
3152        matching for characters > 127 is available only if UCP support is
3153        available. */
3154
3155#ifdef SUPPORT_UTF8
3156        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3157          {
3158          class_utf8 = TRUE;
3159
3160          /* With UCP support, we can find the other case equivalents of
3161          the relevant characters. There may be several ranges. Optimize how
3162          they fit with the basic range. */
3163
3164#ifdef SUPPORT_UCP
3165          if ((options & PCRE_CASELESS) != 0)
3166            {
3167            unsigned int occ, ocd;
3168            unsigned int cc = c;
3169            unsigned int origd = d;
3170            while (get_othercase_range(&cc, origd, &occ, &ocd))
3171              {
3172              if (occ >= (unsigned int)c &&
3173                  ocd <= (unsigned int)d)
3174                continue;                          /* Skip embedded ranges */
3175
3176              if (occ < (unsigned int)c &&
3177                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3178                {                                  /* if there is overlap,   */
3179                c = occ;                           /* noting that if occ < c */
3180                continue;                          /* we can't have ocd > d  */
3181                }                                  /* because a subrange is  */
3182              if (ocd > (unsigned int)d &&
3183                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3184                {                                  /* the basic range.       */
3185                d = ocd;
3186                continue;
3187                }
3188
3189              if (occ == ocd)
3190                {
3191                *class_utf8data++ = XCL_SINGLE;
3192                }
3193              else
3194                {
3195                *class_utf8data++ = XCL_RANGE;
3196                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3197                }
3198              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3199              }
3200            }
3201#endif  /* SUPPORT_UCP */
3202
3203          /* Now record the original range, possibly modified for UCP caseless
3204          overlapping ranges. */
3205
3206          *class_utf8data++ = XCL_RANGE;
3207          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3208          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3209
3210          /* With UCP support, we are done. Without UCP support, there is no
3211          caseless matching for UTF-8 characters > 127; we can use the bit map
3212          for the smaller ones. */
3213
3214#ifdef SUPPORT_UCP
3215          continue;    /* With next character in the class */
3216#else
3217          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3218
3219          /* Adjust upper limit and fall through to set up the map */
3220
3221          d = 127;
3222
3223#endif  /* SUPPORT_UCP */
3224          }
3225#endif  /* SUPPORT_UTF8 */
3226
3227        /* We use the bit map for all cases when not in UTF-8 mode; else
3228        ranges that lie entirely within 0-127 when there is UCP support; else
3229        for partial ranges without UCP support. */
3230
3231        class_charcount += d - c + 1;
3232        class_lastchar = d;
3233
3234        /* We can save a bit of time by skipping this in the pre-compile. */
3235
3236        if (lengthptr == NULL) for (; c <= d; c++)
3237          {
3238          classbits[c/8] |= (1 << (c&7));
3239          if ((options & PCRE_CASELESS) != 0)
3240            {
3241            int uc = cd->fcc[c];           /* flip case */
3242            classbits[uc/8] |= (1 << (uc&7));
3243            }
3244          }
3245
3246        continue;   /* Go get the next char in the class */
3247        }
3248
3249      /* Handle a lone single character - we can get here for a normal
3250      non-escape char, or after \ that introduces a single character or for an
3251      apparent range that isn't. */
3252
3253      LONE_SINGLE_CHARACTER:
3254
3255      /* Handle a character that cannot go in the bit map */
3256
3257#ifdef SUPPORT_UTF8
3258      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3259        {
3260        class_utf8 = TRUE;
3261        *class_utf8data++ = XCL_SINGLE;
3262        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3263
3264#ifdef SUPPORT_UCP
3265        if ((options & PCRE_CASELESS) != 0)
3266          {
3267          unsigned int othercase;
3268          if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3269            {
3270            *class_utf8data++ = XCL_SINGLE;
3271            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3272            }
3273          }
3274#endif  /* SUPPORT_UCP */
3275
3276        }
3277      else
3278#endif  /* SUPPORT_UTF8 */
3279
3280      /* Handle a single-byte character */
3281        {
3282        classbits[c/8] |= (1 << (c&7));
3283        if ((options & PCRE_CASELESS) != 0)
3284          {
3285          c = cd->fcc[c];   /* flip case */
3286          classbits[c/8] |= (1 << (c&7));
3287          }
3288        class_charcount++;
3289        class_lastchar = c;
3290        }
3291      }
3292
3293    /* Loop until ']' reached. This "while" is the end of the "do" above. */
3294
3295    while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3296
3297    if (c == 0)                          /* Missing terminating ']' */
3298      {
3299      *errorcodeptr = ERR6;
3300      goto FAILED;
3301      }
3302
3303
3304/* This code has been disabled because it would mean that \s counts as
3305an explicit \r or \n reference, and that's not really what is wanted. Now
3306we set the flag only if there is a literal "\r" or "\n" in the class. */
3307
3308#if 0
3309    /* Remember whether \r or \n are in this class */
3310
3311    if (negate_class)
3312      {
3313      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3314      }
3315    else
3316      {
3317      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3318      }
3319#endif
3320
3321
3322    /* If class_charcount is 1, we saw precisely one character whose value is
3323    less than 256. As long as there were no characters >= 128 and there was no
3324    use of \p or \P, in other words, no use of any XCLASS features, we can
3325    optimize.
3326
3327    In UTF-8 mode, we can optimize the negative case only if there were no
3328    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3329    operate on single-bytes only. This is an historical hangover. Maybe one day
3330    we can tidy these opcodes to handle multi-byte characters.
3331
3332    The optimization throws away the bit map. We turn the item into a
3333    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3334    that OP_NOT does not support multibyte characters. In the positive case, it
3335    can cause firstbyte to be set. Otherwise, there can be no first char if
3336    this item is first, whatever repeat count may follow. In the case of
3337    reqbyte, save the previous value for reinstating. */
3338
3339#ifdef SUPPORT_UTF8
3340    if (class_charcount == 1 && !class_utf8 &&
3341      (!utf8 || !negate_class || class_lastchar < 128))
3342#else
3343    if (class_charcount == 1)
3344#endif
3345      {
3346      zeroreqbyte = reqbyte;
3347
3348      /* The OP_NOT opcode works on one-byte characters only. */
3349
3350      if (negate_class)
3351        {
3352        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3353        zerofirstbyte = firstbyte;
3354        *code++ = OP_NOT;
3355        *code++ = class_lastchar;
3356        break;
3357        }
3358
3359      /* For a single, positive character, get the value into mcbuffer, and
3360      then we can handle this with the normal one-character code. */
3361
3362#ifdef SUPPORT_UTF8
3363      if (utf8 && class_lastchar > 127)
3364        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3365      else
3366#endif
3367        {
3368        mcbuffer[0] = class_lastchar;
3369        mclength = 1;
3370        }
3371      goto ONE_CHAR;
3372      }       /* End of 1-char optimization */
3373
3374    /* The general case - not the one-char optimization. If this is the first
3375    thing in the branch, there can be no first char setting, whatever the
3376    repeat count. Any reqbyte setting must remain unchanged after any kind of
3377    repeat. */
3378
3379    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3380    zerofirstbyte = firstbyte;
3381    zeroreqbyte = reqbyte;
3382
3383    /* If there are characters with values > 255, we have to compile an
3384    extended class, with its own opcode, unless there was a negated special
3385    such as \S in the class, because in that case all characters > 255 are in
3386    the class, so any that were explicitly given as well can be ignored. If
3387    (when there are explicit characters > 255 that must be listed) there are no
3388    characters < 256, we can omit the bitmap in the actual compiled code. */
3389
3390#ifdef SUPPORT_UTF8
3391    if (class_utf8 && !should_flip_negation)
3392      {
3393      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3394      *code++ = OP_XCLASS;
3395      code += LINK_SIZE;
3396      *code = negate_class? XCL_NOT : 0;
3397
3398      /* If the map is required, move up the extra data to make room for it;
3399      otherwise just move the code pointer to the end of the extra data. */
3400
3401      if (class_charcount > 0)
3402        {
3403        *code++ |= XCL_MAP;
3404        memmove(code + 32, code, class_utf8data - code);
3405        memcpy(code, classbits, 32);
3406        code = class_utf8data + 32;
3407        }
3408      else code = class_utf8data;
3409
3410      /* Now fill in the complete length of the item */
3411
3412      PUT(previous, 1, code - previous);
3413      break;   /* End of class handling */
3414      }
3415#endif
3416
3417    /* If there are no characters > 255, set the opcode to OP_CLASS or
3418    OP_NCLASS, depending on whether the whole class was negated and whether
3419    there were negative specials such as \S in the class. Then copy the 32-byte
3420    map into the code vector, negating it if necessary. */
3421
3422    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3423    if (negate_class)
3424      {
3425      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3426        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3427      }
3428    else
3429      {
3430      memcpy(code, classbits, 32);
3431      }
3432    code += 32;
3433    break;
3434
3435
3436    /* ===================================================================*/
3437    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3438    has been tested above. */
3439
3440    case '{':
3441    if (!is_quantifier) goto NORMAL_CHAR;
3442    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3443    if (*errorcodeptr != 0) goto FAILED;
3444    goto REPEAT;
3445
3446    case '*':
3447    repeat_min = 0;
3448    repeat_max = -1;
3449    goto REPEAT;
3450
3451    case '+':
3452    repeat_min = 1;
3453    repeat_max = -1;
3454    goto REPEAT;
3455
3456    case '?':
3457    repeat_min = 0;
3458    repeat_max = 1;
3459
3460    REPEAT:
3461    if (previous == NULL)
3462      {
3463      *errorcodeptr = ERR9;
3464      goto FAILED;
3465      }
3466
3467    if (repeat_min == 0)
3468      {
3469      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3470      reqbyte = zeroreqbyte;        /* Ditto */
3471      }
3472
3473    /* Remember whether this is a variable length repeat */
3474
3475    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3476
3477    op_type = 0;                    /* Default single-char op codes */
3478    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3479
3480    /* Save start of previous item, in case we have to move it up to make space
3481    for an inserted OP_ONCE for the additional '+' extension. */
3482
3483    tempcode = previous;
3484
3485    /* If the next character is '+', we have a possessive quantifier. This
3486    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3487    If the next character is '?' this is a minimizing repeat, by default,
3488    but if PCRE_UNGREEDY is set, it works the other way round. We change the
3489    repeat type to the non-default. */
3490
3491    if (ptr[1] == '+')
3492      {
3493      repeat_type = 0;                  /* Force greedy */
3494      possessive_quantifier = TRUE;
3495      ptr++;
3496      }
3497    else if (ptr[1] == '?')
3498      {
3499      repeat_type = greedy_non_default;
3500      ptr++;
3501      }
3502    else repeat_type = greedy_default;
3503
3504    /* If previous was a character match, abolish the item and generate a
3505    repeat item instead. If a char item has a minimum of more than one, ensure
3506    that it is set in reqbyte - it might not be if a sequence such as x{3} is
3507    the first thing in a branch because the x will have gone into firstbyte
3508    instead.  */
3509
3510    if (*previous == OP_CHAR || *previous == OP_CHARNC)
3511      {
3512      /* Deal with UTF-8 characters that take up more than one byte. It's
3513      easier to write this out separately than try to macrify it. Use c to
3514      hold the length of the character in bytes, plus 0x80 to flag that it's a
3515      length rather than a small character. */
3516
3517#ifdef SUPPORT_UTF8
3518      if (utf8 && (code[-1] & 0x80) != 0)
3519        {
3520        uschar *lastchar = code - 1;
3521        while((*lastchar & 0xc0) == 0x80) lastchar--;
3522        c = code - lastchar;            /* Length of UTF-8 character */
3523        memcpy(utf8_char, lastchar, c); /* Save the char */
3524        c |= 0x80;                      /* Flag c as a length */
3525        }
3526      else
3527#endif
3528
3529      /* Handle the case of a single byte - either with no UTF8 support, or
3530      with UTF-8 disabled, or for a UTF-8 character < 128. */
3531
3532        {
3533        c = code[-1];
3534        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3535        }
3536
3537      /* If the repetition is unlimited, it pays to see if the next thing on
3538      the line is something that cannot possibly match this character. If so,
3539      automatically possessifying this item gains some performance in the case
3540      where the match fails. */
3541
3542      if (!possessive_quantifier &&
3543          repeat_max < 0 &&
3544          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3545            options, cd))
3546        {
3547        repeat_type = 0;    /* Force greedy */
3548        possessive_quantifier = TRUE;
3549        }
3550
3551      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3552      }
3553
3554    /* If previous was a single negated character ([^a] or similar), we use
3555    one of the special opcodes, replacing it. The code is shared with single-
3556    character repeats by setting op_type to add a suitable offset into
3557    repeat_type. We can also test for auto-possessification. OP_NOT is
3558    currently used only for single-byte chars. */
3559
3560    else if (*previous == OP_NOT)
3561      {
3562      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3563      c = previous[1];
3564      if (!possessive_quantifier &&
3565          repeat_max < 0 &&
3566          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3567        {
3568        repeat_type = 0;    /* Force greedy */
3569        possessive_quantifier = TRUE;
3570        }
3571      goto OUTPUT_SINGLE_REPEAT;
3572      }
3573
3574    /* If previous was a character type match (\d or similar), abolish it and
3575    create a suitable repeat item. The code is shared with single-character
3576    repeats by setting op_type to add a suitable offset into repeat_type. Note
3577    that the Unicode property types will be present only when SUPPORT_UCP is
3578    defined, but we don't wrap the little bits of code here because it just
3579    makes it horribly messy. */
3580
3581    else if (*previous < OP_EODN)
3582      {
3583      uschar *oldcode;
3584      int prop_type, prop_value;
3585      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3586      c = *previous;
3587
3588      if (!possessive_quantifier &&
3589          repeat_max < 0 &&
3590          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3591        {
3592        repeat_type = 0;    /* Force greedy */
3593        possessive_quantifier = TRUE;
3594        }
3595
3596      OUTPUT_SINGLE_REPEAT:
3597      if (*previous == OP_PROP || *previous == OP_NOTPROP)
3598        {
3599        prop_type = previous[1];
3600        prop_value = previous[2];
3601        }
3602      else prop_type = prop_value = -1;
3603
3604      oldcode = code;
3605      code = previous;                  /* Usually overwrite previous item */
3606
3607      /* If the maximum is zero then the minimum must also be zero; Perl allows
3608      this case, so we do too - by simply omitting the item altogether. */
3609
3610      if (repeat_max == 0) goto END_REPEAT;
3611
3612      /* All real repeats make it impossible to handle partial matching (maybe
3613      one day we will be able to remove this restriction). */
3614
3615      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3616
3617      /* Combine the op_type with the repeat_type */
3618
3619      repeat_type += op_type;
3620
3621      /* A minimum of zero is handled either as the special case * or ?, or as
3622      an UPTO, with the maximum given. */
3623
3624      if (repeat_min == 0)
3625        {
3626        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3627          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3628        else
3629          {
3630          *code++ = OP_UPTO + repeat_type;
3631          PUT2INC(code, 0, repeat_max);
3632          }
3633        }
3634
3635      /* A repeat minimum of 1 is optimized into some special cases. If the
3636      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3637      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3638      one less than the maximum. */
3639
3640      else if (repeat_min == 1)
3641        {
3642        if (repeat_max == -1)
3643          *code++ = OP_PLUS + repeat_type;
3644        else
3645          {
3646          code = oldcode;                 /* leave previous item in place */
3647          if (repeat_max == 1) goto END_REPEAT;
3648          *code++ = OP_UPTO + repeat_type;
3649          PUT2INC(code, 0, repeat_max - 1);
3650          }
3651        }
3652
3653      /* The case {n,n} is just an EXACT, while the general case {n,m} is
3654      handled as an EXACT followed by an UPTO. */
3655
3656      else
3657        {
3658        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3659        PUT2INC(code, 0, repeat_min);
3660
3661        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3662        we have to insert the character for the previous code. For a repeated
3663        Unicode property match, there are two extra bytes that define the
3664        required property. In UTF-8 mode, long characters have their length in
3665        c, with the 0x80 bit as a flag. */
3666
3667        if (repeat_max < 0)
3668          {
3669#ifdef SUPPORT_UTF8
3670          if (utf8 && c >= 128)
3671            {
3672            memcpy(code, utf8_char, c & 7);
3673            code += c & 7;
3674            }
3675          else
3676#endif
3677            {
3678            *code++ = c;
3679            if (prop_type >= 0)
3680              {
3681              *code++ = prop_type;
3682              *code++ = prop_value;
3683              }
3684            }
3685          *code++ = OP_STAR + repeat_type;
3686          }
3687
3688        /* Else insert an UPTO if the max is greater than the min, again
3689        preceded by the character, for the previously inserted code. If the
3690        UPTO is just for 1 instance, we can use QUERY instead. */
3691
3692        else if (repeat_max != repeat_min)
3693          {
3694#ifdef SUPPORT_UTF8
3695          if (utf8 && c >= 128)
3696            {
3697            memcpy(code, utf8_char, c & 7);
3698            code += c & 7;
3699            }
3700          else
3701#endif
3702          *code++ = c;
3703          if (prop_type >= 0)
3704            {
3705            *code++ = prop_type;
3706            *code++ = prop_value;
3707            }
3708          repeat_max -= repeat_min;
3709
3710          if (repeat_max == 1)
3711            {
3712            *code++ = OP_QUERY + repeat_type;
3713            }
3714          else
3715            {
3716            *code++ = OP_UPTO + repeat_type;
3717            PUT2INC(code, 0, repeat_max);
3718            }
3719          }
3720        }
3721
3722      /* The character or character type itself comes last in all cases. */
3723
3724#ifdef SUPPORT_UTF8
3725      if (utf8 && c >= 128)
3726        {
3727        memcpy(code, utf8_char, c & 7);
3728        code += c & 7;
3729        }
3730      else
3731#endif
3732      *code++ = c;
3733
3734      /* For a repeated Unicode property match, there are two extra bytes that
3735      define the required property. */
3736
3737#ifdef SUPPORT_UCP
3738      if (prop_type >= 0)
3739        {
3740        *code++ = prop_type;
3741        *code++ = prop_value;
3742        }
3743#endif
3744      }
3745
3746    /* If previous was a character class or a back reference, we put the repeat
3747    stuff after it, but just skip the item if the repeat was {0,0}. */
3748
3749    else if (*previous == OP_CLASS ||
3750             *previous == OP_NCLASS ||
3751#ifdef SUPPORT_UTF8
3752             *previous == OP_XCLASS ||
3753#endif
3754             *previous == OP_REF)
3755      {
3756      if (repeat_max == 0)
3757        {
3758        code = previous;
3759        goto END_REPEAT;
3760        }
3761
3762      /* All real repeats make it impossible to handle partial matching (maybe
3763      one day we will be able to remove this restriction). */
3764
3765      if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3766
3767      if (repeat_min == 0 && repeat_max == -1)
3768        *code++ = OP_CRSTAR + repeat_type;
3769      else if (repeat_min == 1 && repeat_max == -1)
3770        *code++ = OP_CRPLUS + repeat_type;
3771      else if (repeat_min == 0 && repeat_max == 1)
3772        *code++ = OP_CRQUERY + repeat_type;
3773      else
3774        {
3775        *code++ = OP_CRRANGE + repeat_type;
3776        PUT2INC(code, 0, repeat_min);
3777        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3778        PUT2INC(code, 0, repeat_max);
3779        }
3780      }
3781
3782    /* If previous was a bracket group, we may have to replicate it in certain
3783    cases. */
3784
3785    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3786             *previous == OP_ONCE || *previous == OP_COND)
3787      {
3788      register int i;
3789      int ketoffset = 0;
3790      int len = code - previous;
3791      uschar *bralink = NULL;
3792
3793      /* Repeating a DEFINE group is pointless */
3794
3795      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3796        {
3797        *errorcodeptr = ERR55;
3798        goto FAILED;
3799        }
3800
3801      /* If the maximum repeat count is unlimited, find the end of the bracket
3802      by scanning through from the start, and compute the offset back to it
3803      from the current code pointer. There may be an OP_OPT setting following
3804      the final KET, so we can't find the end just by going back from the code
3805      pointer. */
3806
3807      if (repeat_max == -1)
3808        {
3809        register uschar *ket = previous;
3810        do ket += GET(ket, 1); while (*ket != OP_KET);
3811        ketoffset = code - ket;
3812        }
3813
3814      /* The case of a zero minimum is special because of the need to stick
3815      OP_BRAZERO in front of it, and because the group appears once in the
3816      data, whereas in other cases it appears the minimum number of times. For
3817      this reason, it is simplest to treat this case separately, as otherwise
3818      the code gets far too messy. There are several special subcases when the
3819      minimum is zero. */
3820
3821      if (repeat_min == 0)
3822        {
3823        /* If the maximum is also zero, we just omit the group from the output
3824        altogether. */
3825
3826        if (repeat_max == 0)
3827          {
3828          code = previous;
3829          goto END_REPEAT;
3830          }
3831
3832        /* If the maximum is 1 or unlimited, we just have to stick in the
3833        BRAZERO and do no more at this point. However, we do need to adjust
3834        any OP_RECURSE calls inside the group that refer to the group itself or
3835        any internal or forward referenced group, because the offset is from
3836        the start of the whole regex. Temporarily terminate the pattern while
3837        doing this. */
3838
3839        if (repeat_max <= 1)
3840          {
3841          *code = OP_END;
3842          adjust_recurse(previous, 1, utf8, cd, save_hwm);
3843          memmove(previous+1, previous, len);
3844          code++;
3845          *previous++ = OP_BRAZERO + repeat_type;
3846          }
3847
3848        /* If the maximum is greater than 1 and limited, we have to replicate
3849        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3850        The first one has to be handled carefully because it's the original
3851        copy, which has to be moved up. The remainder can be handled by code
3852        that is common with the non-zero minimum case below. We have to
3853        adjust the value of repeat_max, since one less copy is required. Once
3854        again, we may have to adjust any OP_RECURSE calls inside the group. */
3855
3856        else
3857          {
3858          int offset;
3859          *code = OP_END;
3860          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3861          memmove(previous + 2 + LINK_SIZE, previous, len);
3862          code += 2 + LINK_SIZE;
3863          *previous++ = OP_BRAZERO + repeat_type;
3864          *previous++ = OP_BRA;
3865
3866          /* We chain together the bracket offset fields that have to be
3867          filled in later when the ends of the brackets are reached. */
3868
3869          offset = (bralink == NULL)? 0 : previous - bralink;
3870          bralink = previous;
3871          PUTINC(previous, 0, offset);
3872          }
3873
3874        repeat_max--;
3875        }
3876
3877      /* If the minimum is greater than zero, replicate the group as many
3878      times as necessary, and adjust the maximum to the number of subsequent
3879      copies that we need. If we set a first char from the group, and didn't
3880      set a required char, copy the latter from the former. If there are any
3881      forward reference subroutine calls in the group, there will be entries on
3882      the workspace list; replicate these with an appropriate increment. */
3883
3884      else
3885        {
3886        if (repeat_min > 1)
3887          {
3888          /* In the pre-compile phase, we don't actually do the replication. We
3889          just adjust the length as if we had. Do some paranoid checks for
3890          potential integer overflow. */
3891
3892          if (lengthptr != NULL)
3893            {
3894            int delta = (repeat_min - 1)*length_prevgroup;
3895            if ((double)(repeat_min - 1)*(double)length_prevgroup >
3896                                                            (double)INT_MAX ||
3897                OFLOW_MAX - *lengthptr < delta)
3898              {
3899              *errorcodeptr = ERR20;
3900              goto FAILED;
3901              }
3902            *lengthptr += delta;
3903            }
3904
3905          /* This is compiling for real */
3906
3907          else
3908            {
3909            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3910            for (i = 1; i < repeat_min; i++)
3911              {
3912              uschar *hc;
3913              uschar *this_hwm = cd->hwm;
3914              memcpy(code, previous, len);
3915              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3916                {
3917                PUT(cd->hwm, 0, GET(hc, 0) + len);
3918                cd->hwm += LINK_SIZE;
3919                }
3920              save_hwm = this_hwm;
3921              code += len;
3922              }
3923            }
3924          }
3925
3926        if (repeat_max > 0) repeat_max -= repeat_min;
3927        }
3928
3929      /* This code is common to both the zero and non-zero minimum cases. If
3930      the maximum is limited, it replicates the group in a nested fashion,
3931      remembering the bracket starts on a stack. In the case of a zero minimum,
3932      the first one was set up above. In all cases the repeat_max now specifies
3933      the number of additional copies needed. Again, we must remember to
3934      replicate entries on the forward reference list. */
3935
3936      if (repeat_max >= 0)
3937        {
3938        /* In the pre-compile phase, we don't actually do the replication. We
3939        just adjust the length as if we had. For each repetition we must add 1
3940        to the length for BRAZERO and for all but the last repetition we must
3941        add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3942        paranoid checks to avoid integer overflow. */
3943
3944        if (lengthptr != NULL && repeat_max > 0)
3945          {
3946          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3947                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3948          if ((double)repeat_max *
3949                (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3950                  > (double)INT_MAX ||
3951              OFLOW_MAX - *lengthptr < delta)
3952            {
3953            *errorcodeptr = ERR20;
3954            goto FAILED;
3955            }
3956          *lengthptr += delta;
3957          }
3958
3959        /* This is compiling for real */
3960
3961        else for (i = repeat_max - 1; i >= 0; i--)
3962          {
3963          uschar *hc;
3964          uschar *this_hwm = cd->hwm;
3965
3966          *code++ = OP_BRAZERO + repeat_type;
3967
3968          /* All but the final copy start a new nesting, maintaining the
3969          chain of brackets outstanding. */
3970
3971          if (i != 0)
3972            {
3973            int offset;
3974            *code++ = OP_BRA;
3975            offset = (bralink == NULL)? 0 : code - bralink;
3976            bralink = code;
3977            PUTINC(code, 0, offset);
3978            }
3979
3980          memcpy(code, previous, len);
3981          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3982            {
3983            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3984            cd->hwm += LINK_SIZE;
3985            }
3986          save_hwm = this_hwm;
3987          code += len;
3988          }
3989
3990        /* Now chain through the pending brackets, and fill in their length
3991        fields (which are holding the chain links pro tem). */
3992
3993        while (bralink != NULL)
3994          {
3995          int oldlinkoffset;
3996          int offset = code - bralink + 1;
3997          uschar *bra = code - offset;
3998          oldlinkoffset = GET(bra, 1);
3999          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4000          *code++ = OP_KET;
4001          PUTINC(code, 0, offset);
4002          PUT(bra, 1, offset);
4003          }
4004        }
4005
4006      /* If the maximum is unlimited, set a repeater in the final copy. We
4007      can't just offset backwards from the current code point, because we
4008      don't know if there's been an options resetting after the ket. The
4009      correct offset was computed above.
4010
4011      Then, when we are doing the actual compile phase, check to see whether
4012      this group is a non-atomic one that could match an empty string. If so,
4013      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4014      that runtime checking can be done. [This check is also applied to
4015      atomic groups at runtime, but in a different way.] */
4016
4017      else
4018        {
4019        uschar *ketcode = code - ketoffset;
4020        uschar *bracode = ketcode - GET(ketcode, 1);
4021        *ketcode = OP_KETRMAX + repeat_type;
4022        if (lengthptr == NULL && *bracode != OP_ONCE)
4023          {
4024          uschar *scode = bracode;
4025          do
4026            {
4027            if (could_be_empty_branch(scode, ketcode, utf8))
4028              {
4029              *bracode += OP_SBRA - OP_BRA;
4030              break;
4031              }
4032            scode += GET(scode, 1);
4033            }
4034          while (*scode == OP_ALT);
4035          }
4036        }
4037      }
4038
4039    /* Else there's some kind of shambles */
4040
4041    else
4042      {
4043      *errorcodeptr = ERR11;
4044      goto FAILED;
4045      }
4046
4047    /* If the character following a repeat is '+', or if certain optimization
4048    tests above succeeded, possessive_quantifier is TRUE. For some of the
4049    simpler opcodes, there is a special alternative opcode for this. For
4050    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4051    The '+' notation is just syntactic sugar, taken from Sun's Java package,
4052    but the special opcodes can optimize it a bit. The repeated item starts at
4053    tempcode, not at previous, which might be the first part of a string whose
4054    (former) last char we repeated.
4055
4056    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4057    an 'upto' may follow. We skip over an 'exact' item, and then test the
4058    length of what remains before proceeding. */
4059
4060    if (possessive_quantifier)
4061      {
4062      int len;
4063      if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4064          *tempcode == OP_NOTEXACT)
4065        tempcode += _pcre_OP_lengths[*tempcode] +
4066          ((*tempcode == OP_TYPEEXACT &&
4067             (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4068      len = code - tempcode;
4069      if (len > 0) switch (*tempcode)
4070        {
4071        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4072        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4073        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4074        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4075
4076        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4077        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4078        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4079        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4080
4081        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4082        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4083        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4084        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4085
4086        default:
4087        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4088        code += 1 + LINK_SIZE;
4089        len += 1 + LINK_SIZE;
4090        tempcode[0] = OP_ONCE;
4091        *code++ = OP_KET;
4092        PUTINC(code, 0, len);
4093        PUT(tempcode, 1, len);
4094        break;
4095        }
4096      }
4097
4098    /* In all cases we no longer have a previous item. We also set the
4099    "follows varying string" flag for subsequently encountered reqbytes if
4100    it isn't already set and we have just passed a varying length item. */
4101
4102    END_REPEAT:
4103    previous = NULL;
4104    cd->req_varyopt |= reqvary;
4105    break;
4106
4107
4108    /* ===================================================================*/
4109    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4110    lookbehind or option setting or condition or all the other extended
4111    parenthesis forms.  */
4112
4113    case '(':
4114    newoptions = options;
4115    skipbytes = 0;
4116    bravalue = OP_CBRA;
4117    save_hwm = cd->hwm;
4118    reset_bracount = FALSE;
4119
4120    /* First deal with various "verbs" that can be introduced by '*'. */
4121
4122    if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4123      {
4124      int i, namelen;
4125      const char *vn = verbnames;
4126      const uschar *name = ++ptr;
4127      previous = NULL;
4128      while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4129      if (*ptr == ':')
4130        {
4131        *errorcodeptr = ERR59;   /* Not supported */
4132        goto FAILED;
4133        }
4134      if (*ptr != ')')
4135        {
4136        *errorcodeptr = ERR60;
4137        goto FAILED;
4138        }
4139      namelen = ptr - name;
4140      for (i = 0; i < verbcount; i++)
4141        {
4142        if (namelen == verbs[i].len &&
4143            strncmp((char *)name, vn, namelen) == 0)
4144          {
4145          *code = verbs[i].op;
4146          if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4147          break;
4148          }
4149        vn += verbs[i].len + 1;
4150        }
4151      if (i < verbcount) continue;
4152      *errorcodeptr = ERR60;
4153      goto FAILED;
4154      }
4155
4156    /* Deal with the extended parentheses; all are introduced by '?', and the
4157    appearance of any of them means that this is not a capturing group. */
4158
4159    else if (*ptr == '?')
4160      {
4161      int i, set, unset, namelen;
4162      int *optset;
4163      const uschar *name;
4164      uschar *slot;
4165
4166      switch (*(++ptr))
4167        {
4168        case '#':                 /* Comment; skip to ket */
4169        ptr++;
4170        while (*ptr != 0 && *ptr != ')') ptr++;
4171        if (*ptr == 0)
4172          {
4173          *errorcodeptr = ERR18;
4174          goto FAILED;
4175          }
4176        continue;
4177
4178
4179        /* ------------------------------------------------------------ */
4180        case '|':                 /* Reset capture count for each branch */
4181        reset_bracount = TRUE;
4182        /* Fall through */
4183
4184        /* ------------------------------------------------------------ */
4185        case ':':                 /* Non-capturing bracket */
4186        bravalue = OP_BRA;
4187        ptr++;
4188        break;
4189
4190
4191        /* ------------------------------------------------------------ */
4192        case '(':
4193        bravalue = OP_COND;       /* Conditional group */
4194
4195        /* A condition can be an assertion, a number (referring to a numbered
4196        group), a name (referring to a named group), or 'R', referring to
4197        recursion. R<digits> and R&name are also permitted for recursion tests.
4198
4199        There are several syntaxes for testing a named group: (?(name)) is used
4200        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4201
4202        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4203        be the recursive thing or the name 'R' (and similarly for 'R' followed
4204        by digits), and (b) a number could be a name that consists of digits.
4205        In both cases, we look for a name first; if not found, we try the other
4206        cases. */
4207
4208        /* For conditions that are assertions, check the syntax, and then exit
4209        the switch. This will take control down to where bracketed groups,
4210        including assertions, are processed. */
4211
4212        if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4213          break;
4214
4215        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4216        below), and all need to skip 3 bytes at the start of the group. */
4217
4218        code[1+LINK_SIZE] = OP_CREF;
4219        skipbytes = 3;
4220        refsign = -1;
4221
4222        /* Check for a test for recursion in a named group. */
4223
4224        if (ptr[1] == 'R' && ptr[2] == '&')
4225          {
4226          terminator = -1;
4227          ptr += 2;
4228          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4229          }
4230
4231        /* Check for a test for a named group's having been set, using the Perl
4232        syntax (?(<name>) or (?('name') */
4233
4234        else if (ptr[1] == '<')
4235          {
4236          terminator = '>';
4237          ptr++;
4238          }
4239        else if (ptr[1] == '\'')
4240          {
4241          terminator = '\'';
4242          ptr++;
4243          }
4244        else
4245          {
4246          terminator = 0;
4247          if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4248          }
4249
4250        /* We now expect to read a name; anything else is an error */
4251
4252        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4253          {
4254          ptr += 1;  /* To get the right offset */
4255          *errorcodeptr = ERR28;
4256          goto FAILED;
4257          }
4258
4259        /* Read the name, but also get it as a number if it's all digits */
4260
4261        recno = 0;
4262        name = ++ptr;
4263        while ((cd->ctypes[*ptr] & ctype_word) != 0)
4264          {
4265          if (recno >= 0)
4266            recno = ((digitab[*ptr] & ctype_digit) != 0)?
4267              recno * 10 + *ptr - '0' : -1;
4268          ptr++;
4269          }
4270        namelen = ptr - name;
4271
4272        if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4273          {
4274          ptr--;      /* Error offset */
4275          *errorcodeptr = ERR26;
4276          goto FAILED;
4277          }
4278
4279        /* Do no further checking in the pre-compile phase. */
4280
4281        if (lengthptr != NULL) break;
4282
4283        /* In the real compile we do the work of looking for the actual
4284        reference. If the string started with "+" or "-" we require the rest to
4285        be digits, in which case recno will be set. */
4286
4287        if (refsign > 0)
4288          {
4289          if (recno <= 0)
4290            {
4291            *errorcodeptr = ERR58;
4292            goto FAILED;
4293            }
4294          recno = (refsign == '-')?
4295            cd->bracount - recno + 1 : recno +cd->bracount;
4296          if (recno <= 0 || recno > cd->final_bracount)
4297            {
4298            *errorcodeptr = ERR15;
4299            goto FAILED;
4300            }
4301          PUT2(code, 2+LINK_SIZE, recno);
4302          break;
4303          }
4304
4305        /* Otherwise (did not start with "+" or "-"), start by looking for the
4306        name. */
4307
4308        slot = cd->name_table;
4309        for (i = 0; i < cd->names_found; i++)
4310          {
4311          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4312          slot += cd->name_entry_size;
4313          }
4314
4315        /* Found a previous named subpattern */
4316
4317        if (i < cd->names_found)
4318          {
4319          recno = GET2(slot, 0);
4320          PUT2(code, 2+LINK_SIZE, recno);
4321          }
4322
4323        /* Search the pattern for a forward reference */
4324
4325        else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4326                        (options & PCRE_EXTENDED) != 0)) > 0)
4327          {
4328          PUT2(code, 2+LINK_SIZE, i);
4329          }
4330
4331        /* If terminator == 0 it means that the name followed directly after
4332        the opening parenthesis [e.g. (?(abc)...] and in this case there are
4333        some further alternatives to try. For the cases where terminator != 0
4334        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4335        now checked all the possibilities, so give an error. */
4336
4337        else if (terminator != 0)
4338          {
4339          *errorcodeptr = ERR15;
4340          goto FAILED;
4341          }
4342
4343        /* Check for (?(R) for recursion. Allow digits after R to specify a
4344        specific group number. */
4345
4346        else if (*name == 'R')
4347          {
4348          recno = 0;
4349          for (i = 1; i < namelen; i++)
4350            {
4351            if ((digitab[name[i]] & ctype_digit) == 0)
4352              {
4353              *errorcodeptr = ERR15;
4354              goto FAILED;
4355              }
4356            recno = recno * 10 + name[i] - '0';
4357            }
4358          if (recno == 0) recno = RREF_ANY;
4359          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4360          PUT2(code, 2+LINK_SIZE, recno);
4361          }
4362
4363        /* Similarly, check for the (?(DEFINE) "condition", which is always
4364        false. */
4365
4366        else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4367          {
4368          code[1+LINK_SIZE] = OP_DEF;
4369          skipbytes = 1;
4370          }
4371
4372        /* Check for the "name" actually being a subpattern number. We are
4373        in the second pass here, so final_bracount is set. */
4374
4375        else if (recno > 0 && recno <= cd->final_bracount)
4376          {
4377          PUT2(code, 2+LINK_SIZE, recno);
4378          }
4379
4380        /* Either an unidentified subpattern, or a reference to (?(0) */
4381
4382        else
4383          {
4384          *errorcodeptr = (recno == 0)? ERR35: ERR15;
4385          goto FAILED;
4386          }
4387        break;
4388
4389
4390        /* ------------------------------------------------------------ */
4391        case '=':                 /* Positive lookahead */
4392        bravalue = OP_ASSERT;
4393        ptr++;
4394        break;
4395
4396
4397        /* ------------------------------------------------------------ */
4398        case '!':                 /* Negative lookahead */
4399        ptr++;
4400        if (*ptr == ')')          /* Optimize (?!) */
4401          {
4402          *code++ = OP_FAIL;
4403          previous = NULL;
4404          continue;
4405          }
4406        bravalue = OP_ASSERT_NOT;
4407        break;
4408
4409
4410        /* ------------------------------------------------------------ */
4411        case '<':                 /* Lookbehind or named define */
4412        switch (ptr[1])
4413          {
4414          case '=':               /* Positive lookbehind */
4415          bravalue = OP_ASSERTBACK;
4416          ptr += 2;
4417          break;
4418
4419          case '!':               /* Negative lookbehind */
4420          bravalue = OP_ASSERTBACK_NOT;
4421          ptr += 2;
4422          break;
4423
4424          default:                /* Could be name define, else bad */
4425          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4426          ptr++;                  /* Correct offset for error */
4427          *errorcodeptr = ERR24;
4428          goto FAILED;
4429          }
4430        break;
4431
4432
4433        /* ------------------------------------------------------------ */
4434        case '>':                 /* One-time brackets */
4435        bravalue = OP_ONCE;
4436        ptr++;
4437        break;
4438
4439
4440        /* ------------------------------------------------------------ */
4441        case 'C':                 /* Callout - may be followed by digits; */
4442        previous_callout = code;  /* Save for later completion */
4443        after_manual_callout = 1; /* Skip one item before completing */
4444        *code++ = OP_CALLOUT;
4445          {
4446          int n = 0;
4447          while ((digitab[*(++ptr)] & ctype_digit) != 0)
4448            n = n * 10 + *ptr - '0';
4449          if (*ptr != ')')
4450            {
4451            *errorcodeptr = ERR39;
4452            goto FAILED;
4453            }
4454          if (n > 255)
4455            {
4456            *errorcodeptr = ERR38;
4457            goto FAILED;
4458            }
4459          *code++ = n;
4460          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4461          PUT(code, LINK_SIZE, 0);                    /* Default length */
4462          code += 2 * LINK_SIZE;
4463          }
4464        previous = NULL;
4465        continue;
4466
4467
4468        /* ------------------------------------------------------------ */
4469        case 'P':                 /* Python-style named subpattern handling */
4470        if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4471          {
4472          is_recurse = *ptr == '>';
4473          terminator = ')';
4474          goto NAMED_REF_OR_RECURSE;
4475          }
4476        else if (*ptr != '<')    /* Test for Python-style definition */
4477          {
4478          *errorcodeptr = ERR41;
4479          goto FAILED;
4480          }
4481        /* Fall through to handle (?P< as (?< is handled */
4482
4483
4484        /* ------------------------------------------------------------ */
4485        DEFINE_NAME:    /* Come here from (?< handling */
4486        case '\'':
4487          {
4488          terminator = (*ptr == '<')? '>' : '\'';
4489          name = ++ptr;
4490
4491          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4492          namelen = ptr - name;
4493
4494          /* In the pre-compile phase, just do a syntax check. */
4495
4496          if (lengthptr != NULL)
4497            {
4498            if (*ptr != terminator)
4499              {
4500              *errorcodeptr = ERR42;
4501              goto FAILED;
4502              }
4503            if (cd->names_found >= MAX_NAME_COUNT)
4504              {
4505              *errorcodeptr = ERR49;
4506              goto FAILED;
4507              }
4508            if (namelen + 3 > cd->name_entry_size)
4509              {
4510              cd->name_entry_size = namelen + 3;
4511              if (namelen > MAX_NAME_SIZE)
4512                {
4513                *errorcodeptr = ERR48;
4514                goto FAILED;
4515                }
4516              }
4517            }
4518
4519          /* In the real compile, create the entry in the table */
4520
4521          else
4522            {
4523            slot = cd->name_table;
4524            for (i = 0; i < cd->names_found; i++)
4525              {
4526              int crc = memcmp(name, slot+2, namelen);
4527              if (crc == 0)
4528                {
4529                if (slot[2+namelen] == 0)
4530                  {
4531                  if ((options & PCRE_DUPNAMES) == 0)
4532                    {
4533                    *errorcodeptr = ERR43;
4534                    goto FAILED;
4535                    }
4536                  }
4537                else crc = -1;      /* Current name is substring */
4538                }
4539              if (crc < 0)
4540                {
4541                memmove(slot + cd->name_entry_size, slot,
4542                  (cd->names_found - i) * cd->name_entry_size);
4543                break;
4544                }
4545              slot += cd->name_entry_size;
4546              }
4547
4548            PUT2(slot, 0, cd->bracount + 1);
4549            memcpy(slot + 2, name, namelen);
4550            slot[2+namelen] = 0;
4551            }
4552          }
4553
4554        /* In both cases, count the number of names we've encountered. */
4555
4556        ptr++;                    /* Move past > or ' */
4557        cd->names_found++;
4558        goto NUMBERED_GROUP;
4559
4560
4561        /* ------------------------------------------------------------ */
4562        case '&':                 /* Perl recursion/subroutine syntax */
4563        terminator = ')';
4564        is_recurse = TRUE;
4565        /* Fall through */
4566
4567        /* We come here from the Python syntax above that handles both
4568        references (?P=name) and recursion (?P>name), as well as falling
4569        through from the Perl recursion syntax (?&name). We also come here from
4570        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4571        .NET syntax. */
4572
4573        NAMED_REF_OR_RECURSE:
4574        name = ++ptr;
4575        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4576        namelen = ptr - name;
4577
4578        /* In the pre-compile phase, do a syntax check and set a dummy
4579        reference number. */
4580
4581        if (lengthptr != NULL)
4582          {
4583          if (namelen == 0)
4584            {
4585            *errorcodeptr = ERR62;
4586            goto FAILED;
4587            }
4588          if (*ptr != terminator)
4589            {
4590            *errorcodeptr = ERR42;
4591            goto FAILED;
4592            }
4593          if (namelen > MAX_NAME_SIZE)
4594            {
4595            *errorcodeptr = ERR48;
4596            goto FAILED;
4597            }
4598          recno = 0;
4599          }
4600
4601        /* In the real compile, seek the name in the table. We check the name
4602        first, and then check that we have reached the end of the name in the
4603        table. That way, if the name is longer than any in the table,
4604        the comparison will fail without reading beyond the table entry. */
4605
4606        else
4607          {
4608          slot = cd->name_table;
4609          for (i = 0; i < cd->names_found; i++)
4610            {
4611            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4612                slot[2+namelen] == 0)
4613              break;
4614            slot += cd->name_entry_size;
4615            }
4616
4617          if (i < cd->names_found)         /* Back reference */
4618            {
4619            recno = GET2(slot, 0);
4620            }
4621          else if ((recno =                /* Forward back reference */
4622                    find_parens(ptr, cd->bracount, name, namelen,
4623                      (options & PCRE_EXTENDED) != 0)) <= 0)
4624            {
4625            *errorcodeptr = ERR15;
4626            goto FAILED;
4627            }
4628          }
4629
4630        /* In both phases, we can now go to the code that handles numerical
4631        recursion or backreferences. */
4632
4633        if (is_recurse) goto HANDLE_RECURSION;
4634          else goto HANDLE_REFERENCE;
4635
4636
4637        /* ------------------------------------------------------------ */
4638        case 'R':                 /* Recursion */
4639        ptr++;                    /* Same as (?0)      */
4640        /* Fall through */
4641
4642
4643        /* ------------------------------------------------------------ */
4644        case '-': case '+':
4645        case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4646        case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4647          {
4648          const uschar *called;
4649
4650          if ((refsign = *ptr) == '+')
4651            {
4652            ptr++;
4653            if ((digitab[*ptr] & ctype_digit) == 0)
4654              {
4655              *errorcodeptr = ERR63;
4656              goto FAILED;
4657              }
4658            }
4659          else if (refsign == '-')
4660            {
4661            if ((digitab[ptr[1]] & ctype_digit) == 0)
4662              goto OTHER_CHAR_AFTER_QUERY;
4663            ptr++;
4664            }
4665
4666          recno = 0;
4667          while((digitab[*ptr] & ctype_digit) != 0)
4668            recno = recno * 10 + *ptr++ - '0';
4669
4670          if (*ptr != ')')
4671            {
4672            *errorcodeptr = ERR29;
4673            goto FAILED;
4674            }
4675
4676          if (refsign == '-')
4677            {
4678            if (recno == 0)
4679              {
4680              *errorcodeptr = ERR58;
4681              goto FAILED;
4682              }
4683            recno = cd->bracount - recno + 1;
4684            if (recno <= 0)
4685              {
4686              *errorcodeptr = ERR15;
4687              goto FAILED;
4688              }
4689            }
4690          else if (refsign == '+')
4691            {
4692            if (recno == 0)
4693              {
4694              *errorcodeptr = ERR58;
4695              goto FAILED;
4696              }
4697            recno += cd->bracount;
4698            }
4699
4700          /* Come here from code above that handles a named recursion */
4701
4702          HANDLE_RECURSION:
4703
4704          previous = code;
4705          called = cd->start_code;
4706
4707          /* When we are actually compiling, find the bracket that is being
4708          referenced. Temporarily end the regex in case it doesn't exist before
4709          this point. If we end up with a forward reference, first check that
4710          the bracket does occur later so we can give the error (and position)
4711          now. Then remember this forward reference in the workspace so it can
4712          be filled in at the end. */
4713
4714          if (lengthptr == NULL)
4715            {
4716            *code = OP_END;
4717            if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4718
4719            /* Forward reference */
4720
4721            if (called == NULL)
4722              {
4723              if (find_parens(ptr, cd->bracount, NULL, recno,
4724                   (options & PCRE_EXTENDED) != 0) < 0)
4725                {
4726                *errorcodeptr = ERR15;
4727                goto FAILED;
4728                }
4729              called = cd->start_code + recno;
4730              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4731              }
4732
4733            /* If not a forward reference, and the subpattern is still open,
4734            this is a recursive call. We check to see if this is a left
4735            recursion that could loop for ever, and diagnose that case. */
4736
4737            else if (GET(called, 1) == 0 &&
4738                     could_be_empty(called, code, bcptr, utf8))
4739              {
4740              *errorcodeptr = ERR40;
4741              goto FAILED;
4742              }
4743            }
4744
4745          /* Insert the recursion/subroutine item, automatically wrapped inside
4746          "once" brackets. Set up a "previous group" length so that a
4747          subsequent quantifier will work. */
4748
4749          *code = OP_ONCE;
4750          PUT(code, 1, 2 + 2*LINK_SIZE);
4751          code += 1 + LINK_SIZE;
4752
4753          *code = OP_RECURSE;
4754          PUT(code, 1, called - cd->start_code);
4755          code += 1 + LINK_SIZE;
4756
4757          *code = OP_KET;
4758          PUT(code, 1, 2 + 2*LINK_SIZE);
4759          code += 1 + LINK_SIZE;
4760
4761          length_prevgroup = 3 + 3*LINK_SIZE;
4762          }
4763
4764        /* Can't determine a first byte now */
4765
4766        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4767        continue;
4768
4769
4770        /* ------------------------------------------------------------ */
4771        default:              /* Other characters: check option setting */
4772        OTHER_CHAR_AFTER_QUERY:
4773        set = unset = 0;
4774        optset = &set;
4775
4776        while (*ptr != ')' && *ptr != ':')
4777          {
4778          switch (*ptr++)
4779            {
4780            case '-': optset = &unset; break;
4781
4782            case 'J':    /* Record that it changed in the external options */
4783            *optset |= PCRE_DUPNAMES;
4784            cd->external_flags |= PCRE_JCHANGED;
4785            break;
4786
4787            case 'i': *optset |= PCRE_CASELESS; break;
4788            case 'm': *optset |= PCRE_MULTILINE; break;
4789            case 's': *optset |= PCRE_DOTALL; break;
4790            case 'x': *optset |= PCRE_EXTENDED; break;
4791            case 'U': *optset |= PCRE_UNGREEDY; break;
4792            case 'X': *optset |= PCRE_EXTRA; break;
4793
4794            default:  *errorcodeptr = ERR12;
4795                      ptr--;    /* Correct the offset */
4796                      goto FAILED;
4797            }
4798          }
4799
4800        /* Set up the changed option bits, but don't change anything yet. */
4801
4802        newoptions = (options | set) & (~unset);
4803
4804        /* If the options ended with ')' this is not the start of a nested
4805        group with option changes, so the options change at this level. If this
4806        item is right at the start of the pattern, the options can be
4807        abstracted and made external in the pre-compile phase, and ignored in
4808        the compile phase. This can be helpful when matching -- for instance in
4809        caseless checking of required bytes.
4810
4811        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4812        definitely *not* at the start of the pattern because something has been
4813        compiled. In the pre-compile phase, however, the code pointer can have
4814        that value after the start, because it gets reset as code is discarded
4815        during the pre-compile. However, this can happen only at top level - if
4816        we are within parentheses, the starting BRA will still be present. At
4817        any parenthesis level, the length value can be used to test if anything
4818        has been compiled at that level. Thus, a test for both these conditions
4819        is necessary to ensure we correctly detect the start of the pattern in
4820        both phases.
4821
4822        If we are not at the pattern start, compile code to change the ims
4823        options if this setting actually changes any of them. We also pass the
4824        new setting back so that it can be put at the start of any following
4825        branches, and when this group ends (if we are in a group), a resetting
4826        item can be compiled. */
4827
4828        if (*ptr == ')')
4829          {
4830          if (code == cd->start_code + 1 + LINK_SIZE &&
4831               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4832            {
4833            cd->external_options = newoptions;
4834            options = newoptions;
4835            }
4836         else
4837            {
4838            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4839              {
4840              *code++ = OP_OPT;
4841              *code++ = newoptions & PCRE_IMS;
4842              }
4843
4844            /* Change options at this level, and pass them back for use
4845            in subsequent branches. Reset the greedy defaults and the case
4846            value for firstbyte and reqbyte. */
4847
4848            *optionsptr = options = newoptions;
4849            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4850            greedy_non_default = greedy_default ^ 1;
4851            req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4852            }
4853
4854          previous = NULL;       /* This item can't be repeated */
4855          continue;              /* It is complete */
4856          }
4857
4858        /* If the options ended with ':' we are heading into a nested group
4859        with possible change of options. Such groups are non-capturing and are
4860        not assertions of any kind. All we need to do is skip over the ':';
4861        the newoptions value is handled below. */
4862
4863        bravalue = OP_BRA;
4864        ptr++;
4865        }     /* End of switch for character following (? */
4866      }       /* End of (? handling */
4867
4868    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4869    all unadorned brackets become non-capturing and behave like (?:...)
4870    brackets. */
4871
4872    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4873      {
4874      bravalue = OP_BRA;
4875      }
4876
4877    /* Else we have a capturing group. */
4878
4879    else
4880      {
4881      NUMBERED_GROUP:
4882      cd->bracount += 1;
4883      PUT2(code, 1+LINK_SIZE, cd->bracount);
4884      skipbytes = 2;
4885      }
4886
4887    /* Process nested bracketed regex. Assertions may not be repeated, but
4888    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4889    non-register variable in order to be able to pass its address because some
4890    compilers complain otherwise. Pass in a new setting for the ims options if
4891    they have changed. */
4892
4893    previous = (bravalue >= OP_ONCE)? code : NULL;
4894    *code = bravalue;
4895    tempcode = code;
4896    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4897    length_prevgroup = 0;              /* Initialize for pre-compile phase */
4898
4899    if (!compile_regex(
4900         newoptions,                   /* The complete new option state */
4901         options & PCRE_IMS,           /* The previous ims option state */
4902         &tempcode,                    /* Where to put code (updated) */
4903         &ptr,                         /* Input pointer (updated) */
4904         errorcodeptr,                 /* Where to put an error message */
4905         (bravalue == OP_ASSERTBACK ||
4906          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4907         reset_bracount,               /* True if (?| group */
4908         skipbytes,                    /* Skip over bracket number */
4909         &subfirstbyte,                /* For possible first char */
4910         &subreqbyte,                  /* For possible last char */
4911         bcptr,                        /* Current branch chain */
4912         cd,                           /* Tables block */
4913         (lengthptr == NULL)? NULL :   /* Actual compile phase */
4914           &length_prevgroup           /* Pre-compile phase */
4915         ))
4916      goto FAILED;
4917
4918    /* At the end of compiling, code is still pointing to the start of the
4919    group, while tempcode has been updated to point past the end of the group
4920    and any option resetting that may follow it. The pattern pointer (ptr)
4921    is on the bracket. */
4922
4923    /* If this is a conditional bracket, check that there are no more than
4924    two branches in the group, or just one if it's a DEFINE group. We do this
4925    in the real compile phase, not in the pre-pass, where the whole group may
4926    not be available. */
4927
4928    if (bravalue == OP_COND && lengthptr == NULL)
4929      {
4930      uschar *tc = code;
4931      int condcount = 0;
4932
4933      do {
4934         condcount++;
4935         tc += GET(tc,1);
4936         }
4937      while (*tc != OP_KET);
4938
4939      /* A DEFINE group is never obeyed inline (the "condition" is always
4940      false). It must have only one branch. */
4941
4942      if (code[LINK_SIZE+1] == OP_DEF)
4943        {
4944        if (condcount > 1)
4945          {
4946          *errorcodeptr = ERR54;
4947          goto FAILED;
4948          }
4949        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4950        }
4951
4952      /* A "normal" conditional group. If there is just one branch, we must not
4953      make use of its firstbyte or reqbyte, because this is equivalent to an
4954      empty second branch. */
4955
4956      else
4957        {
4958        if (condcount > 2)
4959          {
4960          *errorcodeptr = ERR27;
4961          goto FAILED;
4962          }
4963        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4964        }
4965      }
4966
4967    /* Error if hit end of pattern */
4968
4969    if (*ptr != ')')
4970      {
4971      *errorcodeptr = ERR14;
4972      goto FAILED;
4973      }
4974
4975    /* In the pre-compile phase, update the length by the length of the group,
4976    less the brackets at either end. Then reduce the compiled code to just a
4977    set of non-capturing brackets so that it doesn't use much memory if it is
4978    duplicated by a quantifier.*/
4979
4980    if (lengthptr != NULL)
4981      {
4982      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4983        {
4984        *errorcodeptr = ERR20;
4985        goto FAILED;
4986        }
4987      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4988      *code++ = OP_BRA;
4989      PUTINC(code, 0, 1 + LINK_SIZE);
4990      *code++ = OP_KET;
4991      PUTINC(code, 0, 1 + LINK_SIZE);
4992      break;    /* No need to waste time with special character handling */
4993      }
4994
4995    /* Otherwise update the main code pointer to the end of the group. */
4996
4997    code = tempcode;
4998
4999    /* For a DEFINE group, required and first character settings are not
5000    relevant. */
5001
5002    if (bravalue == OP_DEF) break;
5003
5004    /* Handle updating of the required and first characters for other types of
5005    group. Update for normal brackets of all kinds, and conditions with two
5006    branches (see code above). If the bracket is followed by a quantifier with
5007    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5008    zerofirstbyte outside the main loop so that they can be accessed for the
5009    back off. */
5010
5011    zeroreqbyte = reqbyte;
5012    zerofirstbyte = firstbyte;
5013    groupsetfirstbyte = FALSE;
5014
5015    if (bravalue >= OP_ONCE)
5016      {
5017      /* If we have not yet set a firstbyte in this branch, take it from the
5018      subpattern, remembering that it was set here so that a repeat of more
5019      than one can replicate it as reqbyte if necessary. If the subpattern has
5020      no firstbyte, set "none" for the whole branch. In both cases, a zero
5021      repeat forces firstbyte to "none". */
5022
5023      if (firstbyte == REQ_UNSET)
5024        {
5025        if (subfirstbyte >= 0)
5026          {
5027          firstbyte = subfirstbyte;
5028          groupsetfirstbyte = TRUE;
5029          }
5030        else firstbyte = REQ_NONE;
5031        zerofirstbyte = REQ_NONE;
5032        }
5033
5034      /* If firstbyte was previously set, convert the subpattern's firstbyte
5035      into reqbyte if there wasn't one, using the vary flag that was in
5036      existence beforehand. */
5037
5038      else if (subfirstbyte >= 0 && subreqbyte < 0)
5039        subreqbyte = subfirstbyte | tempreqvary;
5040
5041      /* If the subpattern set a required byte (or set a first byte that isn't
5042      really the first byte - see above), set it. */
5043
5044      if (subreqbyte >= 0) reqbyte = subreqbyte;
5045      }
5046
5047    /* For a forward assertion, we take the reqbyte, if set. This can be
5048    helpful if the pattern that follows the assertion doesn't set a different
5049    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5050    for an assertion, however, because it leads to an incorrect effect for patterns
5051    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5052    of a firstbyte. This is overcome by a scan at the end if there's no
5053    firstbyte, looking for an asserted first char. */
5054
5055    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5056    break;     /* End of processing '(' */
5057
5058
5059    /* ===================================================================*/
5060    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5061    are arranged to be the negation of the corresponding OP_values. For the
5062    back references, the values are ESC_REF plus the reference number. Only
5063    back references and those types that consume a character may be repeated.
5064    We can test for values between ESC_b and ESC_Z for the latter; this may
5065    have to change if any new ones are ever created. */
5066
5067    case '\\':
5068    tempptr = ptr;
5069    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5070    if (*errorcodeptr != 0) goto FAILED;
5071
5072    if (c < 0)
5073      {
5074      if (-c == ESC_Q)            /* Handle start of quoted string */
5075        {
5076        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5077          else inescq = TRUE;
5078        continue;
5079        }
5080
5081      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5082
5083      /* For metasequences that actually match a character, we disable the
5084      setting of a first character if it hasn't already been set. */
5085
5086      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5087        firstbyte = REQ_NONE;
5088
5089      /* Set values to reset to if this is followed by a zero repeat. */
5090
5091      zerofirstbyte = firstbyte;
5092      zeroreqbyte = reqbyte;
5093
5094      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5095      We also support \k{name} (.NET syntax) */
5096
5097      if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5098        {
5099        is_recurse = FALSE;
5100        terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5101        goto NAMED_REF_OR_RECURSE;
5102        }
5103
5104      /* Back references are handled specially; must disable firstbyte if
5105      not set to cope with cases like (?=(\w+))\1: which would otherwise set
5106      ':' later. */
5107
5108      if (-c >= ESC_REF)
5109        {
5110        recno = -c - ESC_REF;
5111
5112        HANDLE_REFERENCE:    /* Come here from named backref handling */
5113        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5114        previous = code;
5115        *code++ = OP_REF;
5116        PUT2INC(code, 0, recno);
5117        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5118        if (recno > cd->top_backref) cd->top_backref = recno;
5119        }
5120
5121      /* So are Unicode property matches, if supported. */
5122
5123#ifdef SUPPORT_UCP
5124      else if (-c == ESC_P || -c == ESC_p)
5125        {
5126        BOOL negated;
5127        int pdata;
5128        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5129        if (ptype < 0) goto FAILED;
5130        previous = code;
5131        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5132        *code++ = ptype;
5133        *code++ = pdata;
5134        }
5135#else
5136
5137      /* If Unicode properties are not supported, \X, \P, and \p are not
5138      allowed. */
5139
5140      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5141        {
5142        *errorcodeptr = ERR45;
5143        goto FAILED;
5144        }
5145#endif
5146
5147      /* For the rest (including \X when Unicode properties are supported), we
5148      can obtain the OP value by negating the escape value. */
5149
5150      else
5151        {
5152        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5153        *code++ = -c;
5154        }
5155      continue;
5156      }
5157
5158    /* We have a data character whose value is in c. In UTF-8 mode it may have
5159    a value > 127. We set its representation in the length/buffer, and then
5160    handle it as a data character. */
5161
5162#ifdef SUPPORT_UTF8
5163    if (utf8 && c > 127)
5164      mclength = _pcre_ord2utf8(c, mcbuffer);
5165    else
5166#endif
5167
5168     {
5169     mcbuffer[0] = c;
5170     mclength = 1;
5171     }
5172    goto ONE_CHAR;
5173
5174
5175    /* ===================================================================*/
5176    /* Handle a literal character. It is guaranteed not to be whitespace or #
5177    when the extended flag is set. If we are in UTF-8 mode, it may be a
5178    multi-byte literal character. */
5179
5180    default:
5181    NORMAL_CHAR:
5182    mclength = 1;
5183    mcbuffer[0] = c;
5184
5185#ifdef SUPPORT_UTF8
5186    if (utf8 && c >= 0xc0)
5187      {
5188      while ((ptr[1] & 0xc0) == 0x80)
5189        mcbuffer[mclength++] = *(++ptr);
5190      }
5191#endif
5192
5193    /* At this point we have the character's bytes in mcbuffer, and the length
5194    in mclength. When not in UTF-8 mode, the length is always 1. */
5195
5196    ONE_CHAR:
5197    previous = code;
5198    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5199    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5200
5201    /* Remember if \r or \n were seen */
5202
5203    if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5204      cd->external_flags |= PCRE_HASCRORLF;
5205
5206    /* Set the first and required bytes appropriately. If no previous first
5207    byte, set it from this character, but revert to none on a zero repeat.
5208    Otherwise, leave the firstbyte value alone, and don't change it on a zero
5209    repeat. */
5210
5211    if (firstbyte == REQ_UNSET)
5212      {
5213      zerofirstbyte = REQ_NONE;
5214      zeroreqbyte = reqbyte;
5215
5216      /* If the character is more than one byte long, we can set firstbyte
5217      only if it is not to be matched caselessly. */
5218
5219      if (mclength == 1 || req_caseopt == 0)
5220        {
5221        firstbyte = mcbuffer[0] | req_caseopt;
5222        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5223        }
5224      else firstbyte = reqbyte = REQ_NONE;
5225      }
5226
5227    /* firstbyte was previously set; we can set reqbyte only if the length is
5228    1 or the matching is caseful. */
5229
5230    else
5231      {
5232      zerofirstbyte = firstbyte;
5233      zeroreqbyte = reqbyte;
5234      if (mclength == 1 || req_caseopt == 0)
5235        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5236      }
5237
5238    break;            /* End of literal character handling */
5239    }
5240  }                   /* end of big loop */
5241
5242
5243/* Control never reaches here by falling through, only by a goto for all the
5244error states. Pass back the position in the pattern so that it can be displayed
5245to the user for diagnosing the error. */
5246
5247FAILED:
5248*ptrptr = ptr;
5249return FALSE;
5250}
5251
5252
5253
5254
5255/*************************************************
5256*     Compile sequence of alternatives           *
5257*************************************************/
5258
5259/* On entry, ptr is pointing past the bracket character, but on return it
5260points to the closing bracket, or vertical bar, or end of string. The code
5261variable is pointing at the byte into which the BRA operator has been stored.
5262If the ims options are changed at the start (for a (?ims: group) or during any
5263branch, we need to insert an OP_OPT item at the start of every following branch
5264to ensure they get set correctly at run time, and also pass the new options
5265into every subsequent branch compile.
5266
5267This function is used during the pre-compile phase when we are trying to find
5268out the amount of memory needed, as well as during the real compile phase. The
5269value of lengthptr distinguishes the two phases.
5270
5271Arguments:
5272  options        option bits, including any changes for this subpattern
5273  oldims         previous settings of ims option bits
5274  codeptr        -> the address of the current code pointer
5275  ptrptr         -> the address of the current pattern pointer
5276  errorcodeptr   -> pointer to error code variable
5277  lookbehind     TRUE if this is a lookbehind assertion
5278  reset_bracount TRUE to reset the count for each branch
5279  skipbytes      skip this many bytes at start (for brackets and OP_COND)
5280  firstbyteptr   place to put the first required character, or a negative number
5281  reqbyteptr     place to put the last required character, or a negative number
5282  bcptr          pointer to the chain of currently open branches
5283  cd             points to the data block with tables pointers etc.
5284  lengthptr      NULL during the real compile phase
5285                 points to length accumulator during pre-compile phase
5286
5287Returns:         TRUE on success
5288*/
5289
5290static BOOL
5291compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5292  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5293  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5294  int *lengthptr)
5295{
5296const uschar *ptr = *ptrptr;
5297uschar *code = *codeptr;
5298uschar *last_branch = code;
5299uschar *start_bracket = code;
5300uschar *reverse_count = NULL;
5301int firstbyte, reqbyte;
5302int branchfirstbyte, branchreqbyte;
5303int length;
5304int orig_bracount;
5305int max_bracount;
5306branch_chain bc;
5307
5308bc.outer = bcptr;
5309bc.current = code;
5310
5311firstbyte = reqbyte = REQ_UNSET;
5312
5313/* Accumulate the length for use in the pre-compile phase. Start with the
5314length of the BRA and KET and any extra bytes that are required at the
5315beginning. We accumulate in a local variable to save frequent testing of
5316lengthptr for NULL. We cannot do this by looking at the value of code at the
5317start and end of each alternative, because compiled items are discarded during
5318the pre-compile phase so that the work space is not exceeded. */
5319
5320length = 2 + 2*LINK_SIZE + skipbytes;
5321
5322/* WARNING: If the above line is changed for any reason, you must also change
5323the code that abstracts option settings at the start of the pattern and makes
5324them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5325pre-compile phase to find out whether anything has yet been compiled or not. */
5326
5327/* Offset is set zero to mark that this bracket is still open */
5328
5329PUT(code, 1, 0);
5330code += 1 + LINK_SIZE + skipbytes;
5331
5332/* Loop for each alternative branch */
5333
5334orig_bracount = max_bracount = cd->bracount;
5335for (;;)
5336  {
5337  /* For a (?| group, reset the capturing bracket count so that each branch
5338  uses the same numbers. */
5339
5340  if (reset_bracount) cd->bracount = orig_bracount;
5341
5342  /* Handle a change of ims options at the start of the branch */
5343
5344  if ((options & PCRE_IMS) != oldims)
5345    {
5346    *code++ = OP_OPT;
5347    *code++ = options & PCRE_IMS;
5348    length += 2;
5349    }
5350
5351  /* Set up dummy OP_REVERSE if lookbehind assertion */
5352
5353  if (lookbehind)
5354    {
5355    *code++ = OP_REVERSE;
5356    reverse_count = code;
5357    PUTINC(code, 0, 0);
5358    length += 1 + LINK_SIZE;
5359    }
5360
5361  /* Now compile the branch; in the pre-compile phase its length gets added
5362  into the length. */
5363
5364  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5365        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5366    {
5367    *ptrptr = ptr;
5368    return FALSE;
5369    }
5370
5371  /* Keep the highest bracket count in case (?| was used and some branch
5372  has fewer than the rest. */
5373
5374  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5375
5376  /* In the real compile phase, there is some post-processing to be done. */
5377
5378  if (lengthptr == NULL)
5379    {
5380    /* If this is the first branch, the firstbyte and reqbyte values for the
5381    branch become the values for the regex. */
5382
5383    if (*last_branch != OP_ALT)
5384      {
5385      firstbyte = branchfirstbyte;
5386      reqbyte = branchreqbyte;
5387      }
5388
5389    /* If this is not the first branch, the first char and reqbyte have to
5390    match the values from all the previous branches, except that if the
5391    previous value for reqbyte didn't have REQ_VARY set, it can still match,
5392    and we set REQ_VARY for the regex. */
5393
5394    else
5395      {
5396      /* If we previously had a firstbyte, but it doesn't match the new branch,
5397      we have to abandon the firstbyte for the regex, but if there was
5398      previously no reqbyte, it takes on the value of the old firstbyte. */
5399
5400      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5401        {
5402        if (reqbyte < 0) reqbyte = firstbyte;
5403        firstbyte = REQ_NONE;
5404        }
5405
5406      /* If we (now or from before) have no firstbyte, a firstbyte from the
5407      branch becomes a reqbyte if there isn't a branch reqbyte. */
5408
5409      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5410          branchreqbyte = branchfirstbyte;
5411
5412      /* Now ensure that the reqbytes match */
5413
5414      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5415        reqbyte = REQ_NONE;
5416      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5417      }
5418
5419    /* If lookbehind, check that this branch matches a fixed-length string, and
5420    put the length into the OP_REVERSE item. Temporarily mark the end of the
5421    branch with OP_END. */
5422
5423    if (lookbehind)
5424      {
5425      int fixed_length;
5426      *code = OP_END;
5427      fixed_length = find_fixedlength(last_branch, options);
5428      DPRINTF(("fixed length = %d\n", fixed_length));
5429      if (fixed_length < 0)
5430        {
5431        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5432        *ptrptr = ptr;
5433        return FALSE;
5434        }
5435      PUT(reverse_count, 0, fixed_length);
5436      }
5437    }
5438
5439  /* Reached end of expression, either ')' or end of pattern. In the real
5440  compile phase, go back through the alternative branches and reverse the chain
5441  of offsets, with the field in the BRA item now becoming an offset to the
5442  first alternative. If there are no alternatives, it points to the end of the
5443  group. The length in the terminating ket is always the length of the whole
5444  bracketed item. If any of the ims options were changed inside the group,
5445  compile a resetting op-code following, except at the very end of the pattern.
5446  Return leaving the pointer at the terminating char. */
5447
5448  if (*ptr != '|')
5449    {
5450    if (lengthptr == NULL)
5451      {
5452      int branch_length = code - last_branch;
5453      do
5454        {
5455        int prev_length = GET(last_branch, 1);
5456        PUT(last_branch, 1, branch_length);
5457        branch_length = prev_length;
5458        last_branch -= branch_length;
5459        }
5460      while (branch_length > 0);
5461      }
5462
5463    /* Fill in the ket */
5464
5465    *code = OP_KET;
5466    PUT(code, 1, code - start_bracket);
5467    code += 1 + LINK_SIZE;
5468
5469    /* Resetting option if needed */
5470
5471    if ((options & PCRE_IMS) != oldims && *ptr == ')')
5472      {
5473      *code++ = OP_OPT;
5474      *code++ = oldims;
5475      length += 2;
5476      }
5477
5478    /* Retain the highest bracket number, in case resetting was used. */
5479
5480    cd->bracount = max_bracount;
5481
5482    /* Set values to pass back */
5483
5484    *codeptr = code;
5485    *ptrptr = ptr;
5486    *firstbyteptr = firstbyte;
5487    *reqbyteptr = reqbyte;
5488    if (lengthptr != NULL)
5489      {
5490      if (OFLOW_MAX - *lengthptr < length)
5491        {
5492        *errorcodeptr = ERR20;
5493        return FALSE;
5494        }
5495      *lengthptr += length;
5496      }
5497    return TRUE;
5498    }
5499
5500  /* Another branch follows. In the pre-compile phase, we can move the code
5501  pointer back to where it was for the start of the first branch. (That is,
5502  pretend that each branch is the only one.)
5503
5504  In the real compile phase, insert an ALT node. Its length field points back
5505  to the previous branch while the bracket remains open. At the end the chain
5506  is reversed. It's done like this so that the start of the bracket has a
5507  zero offset until it is closed, making it possible to detect recursion. */
5508
5509  if (lengthptr != NULL)
5510    {
5511    code = *codeptr + 1 + LINK_SIZE + skipbytes;
5512    length += 1 + LINK_SIZE;
5513    }
5514  else
5515    {
5516    *code = OP_ALT;
5517    PUT(code, 1, code - last_branch);
5518    bc.current = last_branch = code;
5519    code += 1 + LINK_SIZE;
5520    }
5521
5522  ptr++;
5523  }
5524/* Control never reaches here */
5525}
5526
5527
5528
5529
5530/*************************************************
5531*          Check for anchored expression         *
5532*************************************************/
5533
5534/* Try to find out if this is an anchored regular expression. Consider each
5535alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5536all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5537it's anchored. However, if this is a multiline pattern, then only OP_SOD
5538counts, since OP_CIRC can match in the middle.
5539
5540We can also consider a regex to be anchored if OP_SOM starts all its branches.
5541This is the code for \G, which means "match at start of match position, taking
5542into account the match offset".
5543
5544A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5545because that will try the rest of the pattern at all possible matching points,
5546so there is no point trying again.... er ....
5547
5548.... except when the .* appears inside capturing parentheses, and there is a
5549subsequent back reference to those parentheses. We haven't enough information
5550to catch that case precisely.
5551
5552At first, the best we could do was to detect when .* was in capturing brackets
5553and the highest back reference was greater than or equal to that level.
5554However, by keeping a bitmap of the first 31 back references, we can catch some
5555of the more common cases more precisely.
5556
5557Arguments:
5558  code           points to start of expression (the bracket)
5559  options        points to the options setting
5560  bracket_map    a bitmap of which brackets we are inside while testing; this
5561                  handles up to substring 31; after that we just have to take
5562                  the less precise approach
5563  backref_map    the back reference bitmap
5564
5565Returns:     TRUE or FALSE
5566*/
5567
5568static BOOL
5569is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5570  unsigned int backref_map)
5571{
5572do {
5573   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5574     options, PCRE_MULTILINE, FALSE);
5575   register int op = *scode;
5576
5577   /* Non-capturing brackets */
5578
5579   if (op == OP_BRA)
5580     {
5581     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5582     }
5583
5584   /* Capturing brackets */
5585
5586   else if (op == OP_CBRA)
5587     {
5588     int n = GET2(scode, 1+LINK_SIZE);
5589     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5590     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5591     }
5592
5593   /* Other brackets */
5594
5595   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5596     {
5597     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5598     }
5599
5600   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5601   are or may be referenced. */
5602
5603   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5604             op == OP_TYPEPOSSTAR) &&
5605            (*options & PCRE_DOTALL) != 0)
5606     {
5607     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5608     }
5609
5610   /* Check for explicit anchoring */
5611
5612   else if (op != OP_SOD && op != OP_SOM &&
5613           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5614     return FALSE;
5615   code += GET(code, 1);
5616   }
5617while (*code == OP_ALT);   /* Loop for each alternative */
5618return TRUE;
5619}
5620
5621
5622
5623/*************************************************
5624*         Check for starting with ^ or .*        *
5625*************************************************/
5626
5627/* This is called to find out if every branch starts with ^ or .* so that
5628"first char" processing can be done to speed things up in multiline
5629matching and for non-DOTALL patterns that start with .* (which must start at
5630the beginning or after \n). As in the case of is_anchored() (see above), we
5631have to take account of back references to capturing brackets that contain .*
5632because in that case we can't make the assumption.
5633
5634Arguments:
5635  code           points to start of expression (the bracket)
5636  bracket_map    a bitmap of which brackets we are inside while testing; this
5637                  handles up to substring 31; after that we just have to take
5638                  the less precise approach
5639  backref_map    the back reference bitmap
5640
5641Returns:         TRUE or FALSE
5642*/
5643
5644static BOOL
5645is_startline(const uschar *code, unsigned int bracket_map,
5646  unsigned int backref_map)
5647{
5648do {
5649   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5650     NULL, 0, FALSE);
5651   register int op = *scode;
5652
5653   /* Non-capturing brackets */
5654
5655   if (op == OP_BRA)
5656     {
5657     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5658     }
5659
5660   /* Capturing brackets */
5661
5662   else if (op == OP_CBRA)
5663     {
5664     int n = GET2(scode, 1+LINK_SIZE);
5665     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5666     if (!is_startline(scode, new_map, backref_map)) return FALSE;
5667     }
5668
5669   /* Other brackets */
5670
5671   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5672     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5673
5674   /* .* means "start at start or after \n" if it isn't in brackets that
5675   may be referenced. */
5676
5677   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5678     {
5679     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5680     }
5681
5682   /* Check for explicit circumflex */
5683
5684   else if (op != OP_CIRC) return FALSE;
5685
5686   /* Move on to the next alternative */
5687
5688   code += GET(code, 1);
5689   }
5690while (*code == OP_ALT);  /* Loop for each alternative */
5691return TRUE;
5692}
5693
5694
5695
5696/*************************************************
5697*       Check for asserted fixed first char      *
5698*************************************************/
5699
5700/* During compilation, the "first char" settings from forward assertions are
5701discarded, because they can cause conflicts with actual literals that follow.
5702However, if we end up without a first char setting for an unanchored pattern,
5703it is worth scanning the regex to see if there is an initial asserted first
5704char. If all branches start with the same asserted char, or with a bracket all
5705of whose alternatives start with the same asserted char (recurse ad lib), then
5706we return that char, otherwise -1.
5707
5708Arguments:
5709  code       points to start of expression (the bracket)
5710  options    pointer to the options (used to check casing changes)
5711  inassert   TRUE if in an assertion
5712
5713Returns:     -1 or the fixed first char
5714*/
5715
5716static int
5717find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5718{
5719register int c = -1;
5720do {
5721   int d;
5722   const uschar *scode =
5723     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5724   register int op = *scode;
5725
5726   switch(op)
5727     {
5728     default:
5729     return -1;
5730
5731     case OP_BRA:
5732     case OP_CBRA:
5733     case OP_ASSERT:
5734     case OP_ONCE:
5735     case OP_COND:
5736     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5737       return -1;
5738     if (c < 0) c = d; else if (c != d) return -1;
5739     break;
5740
5741     case OP_EXACT:       /* Fall through */
5742     scode += 2;
5743
5744     case OP_CHAR:
5745     case OP_CHARNC:
5746     case OP_PLUS:
5747     case OP_MINPLUS:
5748     case OP_POSPLUS:
5749     if (!inassert) return -1;
5750     if (c < 0)
5751       {
5752       c = scode[1];
5753       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5754       }
5755     else if (c != scode[1]) return -1;
5756     break;
5757     }
5758
5759   code += GET(code, 1);
5760   }
5761while (*code == OP_ALT);
5762return c;
5763}
5764
5765
5766
5767/*************************************************
5768*        Compile a Regular Expression            *
5769*************************************************/
5770
5771/* This function takes a string and returns a pointer to a block of store
5772holding a compiled version of the expression. The original API for this
5773function had no error code return variable; it is retained for backwards
5774compatibility. The new function is given a new name.
5775
5776Arguments:
5777  pattern       the regular expression
5778  options       various option bits
5779  errorcodeptr  pointer to error code variable (pcre_compile2() only)
5780                  can be NULL if you don't want a code value
5781  errorptr      pointer to pointer to error text
5782  erroroffset   ptr offset in pattern where error was detected
5783  tables        pointer to character tables or NULL
5784
5785Returns:        pointer to compiled data block, or NULL on error,
5786                with errorptr and erroroffset set
5787*/
5788
5789PCRE_EXP_DEFN pcre *
5790pcre_compile(const char *pattern, int options, const char **errorptr,
5791  int *erroroffset, const unsigned char *tables)
5792{
5793return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5794}
5795
5796
5797PCRE_EXP_DEFN pcre *
5798pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5799  const char **errorptr, int *erroroffset, const unsigned char *tables)
5800{
5801real_pcre *re;
5802int length = 1;  /* For final END opcode */
5803int firstbyte, reqbyte, newline;
5804int errorcode = 0;
5805int skipatstart = 0;
5806#ifdef SUPPORT_UTF8
5807BOOL utf8;
5808#endif
5809size_t size;
5810uschar *code;
5811const uschar *codestart;
5812const uschar *ptr;
5813compile_data compile_block;
5814compile_data *cd = &compile_block;
5815
5816/* This space is used for "compiling" into during the first phase, when we are
5817computing the amount of memory that is needed. Compiled items are thrown away
5818as soon as possible, so that a fairly large buffer should be sufficient for
5819this purpose. The same space is used in the second phase for remembering where
5820to fill in forward references to subpatterns. */
5821
5822uschar cworkspace[COMPILE_WORK_SIZE];
5823
5824/* Set this early so that early errors get offset 0. */
5825
5826ptr = (const uschar *)pattern;
5827
5828/* We can't pass back an error message if errorptr is NULL; I guess the best we
5829can do is just return NULL, but we can set a code value if there is a code
5830pointer. */
5831
5832if (errorptr == NULL)
5833  {
5834  if (errorcodeptr != NULL) *errorcodeptr = 99;
5835  return NULL;
5836  }
5837
5838*errorptr = NULL;
5839if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5840
5841/* However, we can give a message for this error */
5842
5843if (erroroffset == NULL)
5844  {
5845  errorcode = ERR16;
5846  goto PCRE_EARLY_ERROR_RETURN2;
5847  }
5848
5849*erroroffset = 0;
5850
5851/* Can't support UTF8 unless PCRE has been compiled to include the code. */
5852
5853#ifdef SUPPORT_UTF8
5854utf8 = (options & PCRE_UTF8) != 0;
5855if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5856     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5857  {
5858  errorcode = ERR44;
5859  goto PCRE_EARLY_ERROR_RETURN2;
5860  }
5861#else
5862if ((options & PCRE_UTF8) != 0)
5863  {
5864  errorcode = ERR32;
5865  goto PCRE_EARLY_ERROR_RETURN;
5866  }
5867#endif
5868
5869if ((options & ~PUBLIC_OPTIONS) != 0)
5870  {
5871  errorcode = ERR17;
5872  goto PCRE_EARLY_ERROR_RETURN;
5873  }
5874
5875/* Set up pointers to the individual character tables */
5876
5877if (tables == NULL) tables = _pcre_default_tables;
5878cd->lcc = tables + lcc_offset;
5879cd->fcc = tables + fcc_offset;
5880cd->cbits = tables + cbits_offset;
5881cd->ctypes = tables + ctypes_offset;
5882
5883/* Check for global one-time settings at the start of the pattern, and remember
5884the offset for later. */
5885
5886while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5887  {
5888  int newnl = 0;
5889  int newbsr = 0;
5890
5891  if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5892    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5893  else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
5894    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5895  else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
5896    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5897  else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5898    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5899  else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
5900    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5901
5902  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5903    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5904  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5905    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5906
5907  if (newnl != 0)
5908    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5909  else if (newbsr != 0)
5910    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5911  else break;
5912  }
5913
5914/* Check validity of \R options. */
5915
5916switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5917  {
5918  case 0:
5919  case PCRE_BSR_ANYCRLF:
5920  case PCRE_BSR_UNICODE:
5921  break;
5922  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5923  }
5924
5925/* Handle different types of newline. The three bits give seven cases. The
5926current code allows for fixed one- or two-byte sequences, plus "any" and
5927"anycrlf". */
5928
5929switch (options & PCRE_NEWLINE_BITS)
5930  {
5931  case 0: newline = NEWLINE; break;   /* Build-time default */
5932  case PCRE_NEWLINE_CR: newline = '\r'; break;
5933  case PCRE_NEWLINE_LF: newline = '\n'; break;
5934  case PCRE_NEWLINE_CR+
5935       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5936  case PCRE_NEWLINE_ANY: newline = -1; break;
5937  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5938  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5939  }
5940
5941if (newline == -2)
5942  {
5943  cd->nltype = NLTYPE_ANYCRLF;
5944  }
5945else if (newline < 0)
5946  {
5947  cd->nltype = NLTYPE_ANY;
5948  }
5949else
5950  {
5951  cd->nltype = NLTYPE_FIXED;
5952  if (newline > 255)
5953    {
5954    cd->nllen = 2;
5955    cd->nl[0] = (newline >> 8) & 255;
5956    cd->nl[1] = newline & 255;
5957    }
5958  else
5959    {
5960    cd->nllen = 1;
5961    cd->nl[0] = newline;
5962    }
5963  }
5964
5965/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5966references to help in deciding whether (.*) can be treated as anchored or not.
5967*/
5968
5969cd->top_backref = 0;
5970cd->backref_map = 0;
5971
5972/* Reflect pattern for debugging output */
5973
5974DPRINTF(("------------------------------------------------------------------\n"));
5975DPRINTF(("%s\n", pattern));
5976
5977/* Pretend to compile the pattern while actually just accumulating the length
5978of memory required. This behaviour is triggered by passing a non-NULL final
5979argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5980to compile parts of the pattern into; the compiled code is discarded when it is
5981no longer needed, so hopefully this workspace will never overflow, though there
5982is a test for its doing so. */
5983
5984cd->bracount = cd->final_bracount = 0;
5985cd->names_found = 0;
5986cd->name_entry_size = 0;
5987cd->name_table = NULL;
5988cd->start_workspace = cworkspace;
5989cd->start_code = cworkspace;
5990cd->hwm = cworkspace;
5991cd->start_pattern = (const uschar *)pattern;
5992cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5993cd->req_varyopt = 0;
5994cd->external_options = options;
5995cd->external_flags = 0;
5996
5997/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5998don't need to look at the result of the function here. The initial options have
5999been put into the cd block so that they can be changed if an option setting is
6000found within the regex right at the beginning. Bringing initial option settings
6001outside can help speed up starting point checks. */
6002
6003ptr += skipatstart;
6004code = cworkspace;
6005*code = OP_BRA;
6006(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6007  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6008  &length);
6009if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6010
6011DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6012  cd->hwm - cworkspace));
6013
6014if (length > MAX_PATTERN_SIZE)
6015  {
6016  errorcode = ERR20;
6017  goto PCRE_EARLY_ERROR_RETURN;
6018  }
6019
6020/* Compute the size of data block needed and get it, either from malloc or
6021externally provided function. Integer overflow should no longer be possible
6022because nowadays we limit the maximum value of cd->names_found and
6023cd->name_entry_size. */
6024
6025size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6026re = (real_pcre *)(pcre_malloc)(size);
6027
6028if (re == NULL)
6029  {
6030  errorcode = ERR21;
6031  goto PCRE_EARLY_ERROR_RETURN;
6032  }
6033
6034/* Put in the magic number, and save the sizes, initial options, internal
6035flags, and character table pointer. NULL is used for the default character
6036tables. The nullpad field is at the end; it's there to help in the case when a
6037regex compiled on a system with 4-byte pointers is run on another with 8-byte
6038pointers. */
6039
6040re->magic_number = MAGIC_NUMBER;
6041re->size = size;
6042re->options = cd->external_options;
6043re->flags = cd->external_flags;
6044re->dummy1 = 0;
6045re->first_byte = 0;
6046re->req_byte = 0;
6047re->name_table_offset = sizeof(real_pcre);
6048re->name_entry_size = cd->name_entry_size;
6049re->name_count = cd->names_found;
6050re->ref_count = 0;
6051re->tables = (tables == _pcre_default_tables)? NULL : tables;
6052re->nullpad = NULL;
6053
6054/* The starting points of the name/number translation table and of the code are
6055passed around in the compile data block. The start/end pattern and initial
6056options are already set from the pre-compile phase, as is the name_entry_size
6057field. Reset the bracket count and the names_found field. Also reset the hwm
6058field; this time it's used for remembering forward references to subpatterns.
6059*/
6060
6061cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6062cd->bracount = 0;
6063cd->names_found = 0;
6064cd->name_table = (uschar *)re + re->name_table_offset;
6065codestart = cd->name_table + re->name_entry_size * re->name_count;
6066cd->start_code = codestart;
6067cd->hwm = cworkspace;
6068cd->req_varyopt = 0;
6069cd->had_accept = FALSE;
6070
6071/* Set up a starting, non-extracting bracket, then compile the expression. On
6072error, errorcode will be set non-zero, so we don't need to look at the result
6073of the function here. */
6074
6075ptr = (const uschar *)pattern + skipatstart;
6076code = (uschar *)codestart;
6077*code = OP_BRA;
6078(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6079  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6080re->top_bracket = cd->bracount;
6081re->top_backref = cd->top_backref;
6082re->flags = cd->external_flags;
6083
6084if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6085
6086/* If not reached end of pattern on success, there's an excess bracket. */
6087
6088if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6089
6090/* Fill in the terminating state and check for disastrous overflow, but
6091if debugging, leave the test till after things are printed out. */
6092
6093*code++ = OP_END;
6094
6095#ifndef DEBUG
6096if (code - codestart > length) errorcode = ERR23;
6097#endif
6098
6099/* Fill in any forward references that are required. */
6100
6101while (errorcode == 0 && cd->hwm > cworkspace)
6102  {
6103  int offset, recno;
6104  const uschar *groupptr;
6105  cd->hwm -= LINK_SIZE;
6106  offset = GET(cd->hwm, 0);
6107  recno = GET(codestart, offset);
6108  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6109  if (groupptr == NULL) errorcode = ERR53;
6110    else PUT(((uschar *)codestart), offset, groupptr - codestart);
6111  }
6112
6113/* Give an error if there's back reference to a non-existent capturing
6114subpattern. */
6115
6116if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6117
6118/* Failed to compile, or error while post-processing */
6119
6120if (errorcode != 0)
6121  {
6122  (pcre_free)(re);
6123  PCRE_EARLY_ERROR_RETURN:
6124  *erroroffset = ptr - (const uschar *)pattern;
6125  PCRE_EARLY_ERROR_RETURN2:
6126  *errorptr = find_error_text(errorcode);
6127  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6128  return NULL;
6129  }
6130
6131/* If the anchored option was not passed, set the flag if we can determine that
6132the pattern is anchored by virtue of ^ characters or \A or anything else (such
6133as starting with .* when DOTALL is set).
6134
6135Otherwise, if we know what the first byte has to be, save it, because that
6136speeds up unanchored matches no end. If not, see if we can set the
6137PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6138start with ^. and also when all branches start with .* for non-DOTALL matches.
6139*/
6140
6141if ((re->options & PCRE_ANCHORED) == 0)
6142  {
6143  int temp_options = re->options;   /* May get changed during these scans */
6144  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6145    re->options |= PCRE_ANCHORED;
6146  else
6147    {
6148    if (firstbyte < 0)
6149      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6150    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6151      {
6152      int ch = firstbyte & 255;
6153      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6154         cd->fcc[ch] == ch)? ch : firstbyte;
6155      re->flags |= PCRE_FIRSTSET;
6156      }
6157    else if (is_startline(codestart, 0, cd->backref_map))
6158      re->flags |= PCRE_STARTLINE;
6159    }
6160  }
6161
6162/* For an anchored pattern, we use the "required byte" only if it follows a
6163variable length item in the regex. Remove the caseless flag for non-caseable
6164bytes. */
6165
6166if (reqbyte >= 0 &&
6167     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6168  {
6169  int ch = reqbyte & 255;
6170  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6171    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6172  re->flags |= PCRE_REQCHSET;
6173  }
6174
6175/* Print out the compiled data if debugging is enabled. This is never the
6176case when building a production library. */
6177
6178#ifdef DEBUG
6179
6180printf("Length = %d top_bracket = %d top_backref = %d\n",
6181  length, re->top_bracket, re->top_backref);
6182
6183printf("Options=%08x\n", re->options);
6184
6185if ((re->flags & PCRE_FIRSTSET) != 0)
6186  {
6187  int ch = re->first_byte & 255;
6188  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6189    "" : " (caseless)";
6190  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6191    else printf("First char = \\x%02x%s\n", ch, caseless);
6192  }
6193
6194if ((re->flags & PCRE_REQCHSET) != 0)
6195  {
6196  int ch = re->req_byte & 255;
6197  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6198    "" : " (caseless)";
6199  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6200    else printf("Req char = \\x%02x%s\n", ch, caseless);
6201  }
6202
6203pcre_printint(re, stdout, TRUE);
6204
6205/* This check is done here in the debugging case so that the code that
6206was compiled can be seen. */
6207
6208if (code - codestart > length)
6209  {
6210  (pcre_free)(re);
6211  *errorptr = find_error_text(ERR23);
6212  *erroroffset = ptr - (uschar *)pattern;
6213  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6214  return NULL;
6215  }
6216#endif   /* DEBUG */
6217
6218return (pcre *)re;
6219}
6220
6221/* End of pcre_compile.c */
Note: See TracBrowser for help on using the repository browser.