1 | /************************************************* |
---|
2 | * Perl-Compatible Regular Expressions * |
---|
3 | *************************************************/ |
---|
4 | |
---|
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
---|
6 | and semantics are as close as possible to those of the Perl 5 language. |
---|
7 | |
---|
8 | Written by Philip Hazel |
---|
9 | Copyright (c) 1997-2008 University of Cambridge |
---|
10 | |
---|
11 | ----------------------------------------------------------------------------- |
---|
12 | Redistribution and use in source and binary forms, with or without |
---|
13 | modification, are permitted provided that the following conditions are met: |
---|
14 | |
---|
15 | * Redistributions of source code must retain the above copyright notice, |
---|
16 | this list of conditions and the following disclaimer. |
---|
17 | |
---|
18 | * Redistributions in binary form must reproduce the above copyright |
---|
19 | notice, this list of conditions and the following disclaimer in the |
---|
20 | documentation and/or other materials provided with the distribution. |
---|
21 | |
---|
22 | * Neither the name of the University of Cambridge nor the names of its |
---|
23 | contributors may be used to endorse or promote products derived from |
---|
24 | this software without specific prior written permission. |
---|
25 | |
---|
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
---|
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
---|
36 | POSSIBILITY OF SUCH DAMAGE. |
---|
37 | ----------------------------------------------------------------------------- |
---|
38 | */ |
---|
39 | |
---|
40 | |
---|
41 | /* This module contains the external function pcre_dfa_exec(), which is an |
---|
42 | alternative matching function that uses a sort of DFA algorithm (not a true |
---|
43 | FSM). This is NOT Perl-compatible, but it has advantages in certain |
---|
44 | applications. */ |
---|
45 | |
---|
46 | |
---|
47 | #ifdef HAVE_CONFIG_H |
---|
48 | #include "config.h" |
---|
49 | #endif |
---|
50 | |
---|
51 | #define NLBLOCK md /* Block containing newline information */ |
---|
52 | #define PSSTART start_subject /* Field containing processed string start */ |
---|
53 | #define PSEND end_subject /* Field containing processed string end */ |
---|
54 | |
---|
55 | #include "pcre_internal.h" |
---|
56 | |
---|
57 | |
---|
/* Run of spaces used to indent debugging output. It is always printed with
"%.*s" and a precision of rlevel*2-2, so the length of this string caps the
deepest indentation that can be shown. A one-character string would flatten
every nesting level to at most a single space, so keep it comfortably long
(32 spaces here, enough for 17 levels). */

#define SP "                                "
---|
61 | |
---|
62 | |
---|
63 | |
---|
64 | /************************************************* |
---|
65 | * Code parameters and static tables * |
---|
66 | *************************************************/ |
---|
67 | |
---|
/* Offsets added to the OP_TYPESTAR family of opcodes to turn them into other,
synthetic opcodes under special conditions. The synthetic values are never
stored in compiled code, so they need not fit in a byte; they start well above
the normal opcodes. Each block is separated from the next by a fixed gap, which
should be enough to hold one block of type-repeat opcodes. */

#define OP_EXTRA_BASE    300   /* First synthetic block starts here */
#define OP_EXTRA_GAP     20    /* Distance between successive blocks */

#define OP_PROP_EXTRA    (OP_EXTRA_BASE + 0*OP_EXTRA_GAP)
#define OP_EXTUNI_EXTRA  (OP_EXTRA_BASE + 1*OP_EXTRA_GAP)
#define OP_ANYNL_EXTRA   (OP_EXTRA_BASE + 2*OP_EXTRA_GAP)
#define OP_HSPACE_EXTRA  (OP_EXTRA_BASE + 3*OP_EXTRA_GAP)
#define OP_VSPACE_EXTRA  (OP_EXTRA_BASE + 4*OP_EXTRA_GAP)
---|
78 | |
---|
79 | |
---|
80 | /* This table identifies those opcodes that are followed immediately by a |
---|
81 | character that is to be tested in some way. This makes is possible to |
---|
82 | centralize the loading of these characters. In the case of Type * etc, the |
---|
83 | "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
---|
84 | small value. ***NOTE*** If the start of this table is modified, the two tables |
---|
85 | that follow must also be modified. */ |
---|
86 | |
---|
87 | static const uschar coptable[] = { |
---|
88 | 0, /* End */ |
---|
89 | 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ |
---|
90 | 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ |
---|
91 | 0, 0, 0, /* Any, AllAny, Anybyte */ |
---|
92 | 0, 0, 0, /* NOTPROP, PROP, EXTUNI */ |
---|
93 | 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ |
---|
94 | 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ |
---|
95 | 1, /* Char */ |
---|
96 | 1, /* Charnc */ |
---|
97 | 1, /* not */ |
---|
98 | /* Positive single-char repeats */ |
---|
99 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ |
---|
100 | 3, 3, 3, /* upto, minupto, exact */ |
---|
101 | 1, 1, 1, 3, /* *+, ++, ?+, upto+ */ |
---|
102 | /* Negative single-char repeats - only for chars < 256 */ |
---|
103 | 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ |
---|
104 | 3, 3, 3, /* NOT upto, minupto, exact */ |
---|
105 | 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */ |
---|
106 | /* Positive type repeats */ |
---|
107 | 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ |
---|
108 | 3, 3, 3, /* Type upto, minupto, exact */ |
---|
109 | 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */ |
---|
110 | /* Character class & ref repeats */ |
---|
111 | 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ |
---|
112 | 0, 0, /* CRRANGE, CRMINRANGE */ |
---|
113 | 0, /* CLASS */ |
---|
114 | 0, /* NCLASS */ |
---|
115 | 0, /* XCLASS - variable length */ |
---|
116 | 0, /* REF */ |
---|
117 | 0, /* RECURSE */ |
---|
118 | 0, /* CALLOUT */ |
---|
119 | 0, /* Alt */ |
---|
120 | 0, /* Ket */ |
---|
121 | 0, /* KetRmax */ |
---|
122 | 0, /* KetRmin */ |
---|
123 | 0, /* Assert */ |
---|
124 | 0, /* Assert not */ |
---|
125 | 0, /* Assert behind */ |
---|
126 | 0, /* Assert behind not */ |
---|
127 | 0, /* Reverse */ |
---|
128 | 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ |
---|
129 | 0, 0, 0, /* SBRA, SCBRA, SCOND */ |
---|
130 | 0, /* CREF */ |
---|
131 | 0, /* RREF */ |
---|
132 | 0, /* DEF */ |
---|
133 | 0, 0, /* BRAZERO, BRAMINZERO */ |
---|
134 | 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ |
---|
135 | 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ |
---|
136 | }; |
---|
137 | |
---|
138 | /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, |
---|
139 | and \w */ |
---|
140 | |
---|
141 | static const uschar toptable1[] = { |
---|
142 | 0, 0, 0, 0, 0, 0, |
---|
143 | ctype_digit, ctype_digit, |
---|
144 | ctype_space, ctype_space, |
---|
145 | ctype_word, ctype_word, |
---|
146 | 0, 0 /* OP_ANY, OP_ALLANY */ |
---|
147 | }; |
---|
148 | |
---|
149 | static const uschar toptable2[] = { |
---|
150 | 0, 0, 0, 0, 0, 0, |
---|
151 | ctype_digit, 0, |
---|
152 | ctype_space, 0, |
---|
153 | ctype_word, 0, |
---|
154 | 1, 1 /* OP_ANY, OP_ALLANY */ |
---|
155 | }; |
---|
156 | |
---|
157 | |
---|
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;     /* Offset to opcode from the start of the compiled code; a
                     negative value means "hold off going to the (negated)
                     state until 'data' characters have been skipped" */
  int count;      /* Count for repeats */
  int ims;        /* ims flag bits in force for this state */
  int data;       /* Some use extra data, e.g. the number of characters still
                     to be skipped for a held-off (negative offset) state */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int on
purpose: sizeof yields an unsigned type, and without the cast any arithmetic
that mixes this macro with a (signed) workspace count would silently be done in
unsigned arithmetic, giving nonsense for negative counts. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
---|
171 | |
---|
172 | |
---|
173 | #ifdef DEBUG |
---|
174 | /************************************************* |
---|
175 | * Print character string * |
---|
176 | *************************************************/ |
---|
177 | |
---|
/* Character string printing function for debugging. Bytes for which isprint()
is true are written unchanged; every other byte is written as a \xhh escape
with two lower-case hexadecimal digits.

Arguments:
  p           points to string (only read, never modified)
  length      number of bytes; nothing is written if this is <= 0
  f           where to print

Returns:     nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;    /* Always 0..255, so it is a valid argument for isprint() */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
---|
200 | #endif |
---|
201 | |
---|
202 | |
---|
203 | |
---|
204 | /************************************************* |
---|
205 | * Execute a Regular Expression - DFA engine * |
---|
206 | *************************************************/ |
---|
207 | |
---|
208 | /* This internal function applies a compiled pattern to a subject string, |
---|
209 | starting at a given point, using a DFA engine. This function is called from the |
---|
210 | external one, possibly multiple times if the pattern is not anchored. The |
---|
211 | function calls itself recursively for some kinds of subpattern. |
---|
212 | |
---|
213 | Arguments: |
---|
214 | md the match_data block with fixed information |
---|
215 | this_start_code the opening bracket of this subexpression's code |
---|
216 | current_subject where we currently are in the subject string |
---|
217 | start_offset start offset in the subject string |
---|
218 | offsets vector to contain the matching string offsets |
---|
219 | offsetcount size of same |
---|
220 | workspace vector of workspace |
---|
221 | wscount size of same |
---|
222 | ims the current ims flags |
---|
223 | rlevel function call recursion level |
---|
224 | recursing regex recursive call level |
---|
225 | |
---|
226 | Returns: > 0 => number of match offset pairs placed in offsets |
---|
227 | = 0 => offsets overflowed; longest matches are present |
---|
228 | -1 => failed to match |
---|
229 | < -1 => some kind of unexpected problem |
---|
230 | |
---|
231 | The following macros are used for adding states to the two state vectors (one |
---|
232 | for the current character, one for the following character). */ |
---|
233 | |
---|
/* Append a state to the active list: x is the opcode offset and y the repeat
count; the ims flags are taken implicitly from the variable "ims" in the
enclosing scope. The data field of the new entry is not written. If the list
already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE (active_count has been incremented by then). */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
---|
244 | |
---|
/* Append a state to the active list, as ADD_ACTIVE does, but also store z in
the data field of the new entry. x is the opcode offset and y the repeat count;
the ims flags come implicitly from the variable "ims" in the enclosing scope.
If the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE (active_count has been incremented by then). */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
---|
256 | |
---|
/* Append a state to the new list (the states to be considered for the
following character): x is the opcode offset and y the repeat count; the ims
flags are taken implicitly from the variable "ims" in the enclosing scope. The
data field of the new entry is not written. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE (new_count has
been incremented by then). */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
---|
267 | |
---|
/* Append a state to the new list, as ADD_NEW does, but also store z in the
data field of the new entry. x is the opcode offset and y the repeat count; the
ims flags come implicitly from the variable "ims" in the enclosing scope. If
the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE (new_count has been incremented by then). */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
---|
279 | |
---|
280 | /* And now, here is the code */ |
---|
281 | |
---|
282 | static int |
---|
283 | internal_dfa_exec( |
---|
284 | dfa_match_data *md, |
---|
285 | const uschar *this_start_code, |
---|
286 | const uschar *current_subject, |
---|
287 | int start_offset, |
---|
288 | int *offsets, |
---|
289 | int offsetcount, |
---|
290 | int *workspace, |
---|
291 | int wscount, |
---|
292 | int ims, |
---|
293 | int rlevel, |
---|
294 | int recursing) |
---|
295 | { |
---|
296 | stateblock *active_states, *new_states, *temp_states; |
---|
297 | stateblock *next_active_state, *next_new_state; |
---|
298 | |
---|
299 | const uschar *ctypes, *lcc, *fcc; |
---|
300 | const uschar *ptr; |
---|
301 | const uschar *end_code, *first_op; |
---|
302 | |
---|
303 | int active_count, new_count, match_count; |
---|
304 | |
---|
305 | /* Some fields in the md block are frequently referenced, so we load them into |
---|
306 | independent variables in the hope that this will perform better. */ |
---|
307 | |
---|
308 | const uschar *start_subject = md->start_subject; |
---|
309 | const uschar *end_subject = md->end_subject; |
---|
310 | const uschar *start_code = md->start_code; |
---|
311 | |
---|
312 | #ifdef SUPPORT_UTF8 |
---|
313 | BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
---|
314 | #else |
---|
315 | BOOL utf8 = FALSE; |
---|
316 | #endif |
---|
317 | |
---|
318 | rlevel++; |
---|
319 | offsetcount &= (-2); |
---|
320 | |
---|
321 | wscount -= 2; |
---|
322 | wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / |
---|
323 | (2 * INTS_PER_STATEBLOCK); |
---|
324 | |
---|
325 | DPRINTF(("\n%.*s---------------------\n" |
---|
326 | "%.*sCall to internal_dfa_exec f=%d r=%d\n", |
---|
327 | rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing)); |
---|
328 | |
---|
329 | ctypes = md->tables + ctypes_offset; |
---|
330 | lcc = md->tables + lcc_offset; |
---|
331 | fcc = md->tables + fcc_offset; |
---|
332 | |
---|
333 | match_count = PCRE_ERROR_NOMATCH; /* A negative number */ |
---|
334 | |
---|
335 | active_states = (stateblock *)(workspace + 2); |
---|
336 | next_new_state = new_states = active_states + wscount; |
---|
337 | new_count = 0; |
---|
338 | |
---|
339 | first_op = this_start_code + 1 + LINK_SIZE + |
---|
340 | ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); |
---|
341 | |
---|
342 | /* The first thing in any (sub) pattern is a bracket of some sort. Push all |
---|
343 | the alternative states onto the list, and find out where the end is. This |
---|
344 | makes it possible to use this function recursively, when we want to stop at a |
---|
345 | matching internal ket rather than at the end. |
---|
346 | |
---|
347 | If the first opcode in the first alternative is OP_REVERSE, we are dealing with |
---|
348 | a backward assertion. In that case, we have to find out the maximum amount to |
---|
349 | move back, and set up each alternative appropriately. */ |
---|
350 | |
---|
351 | if (*first_op == OP_REVERSE) |
---|
352 | { |
---|
353 | int max_back = 0; |
---|
354 | int gone_back; |
---|
355 | |
---|
356 | end_code = this_start_code; |
---|
357 | do |
---|
358 | { |
---|
359 | int back = GET(end_code, 2+LINK_SIZE); |
---|
360 | if (back > max_back) max_back = back; |
---|
361 | end_code += GET(end_code, 1); |
---|
362 | } |
---|
363 | while (*end_code == OP_ALT); |
---|
364 | |
---|
365 | /* If we can't go back the amount required for the longest lookbehind |
---|
366 | pattern, go back as far as we can; some alternatives may still be viable. */ |
---|
367 | |
---|
368 | #ifdef SUPPORT_UTF8 |
---|
369 | /* In character mode we have to step back character by character */ |
---|
370 | |
---|
371 | if (utf8) |
---|
372 | { |
---|
373 | for (gone_back = 0; gone_back < max_back; gone_back++) |
---|
374 | { |
---|
375 | if (current_subject <= start_subject) break; |
---|
376 | current_subject--; |
---|
377 | while (current_subject > start_subject && |
---|
378 | (*current_subject & 0xc0) == 0x80) |
---|
379 | current_subject--; |
---|
380 | } |
---|
381 | } |
---|
382 | else |
---|
383 | #endif |
---|
384 | |
---|
385 | /* In byte-mode we can do this quickly. */ |
---|
386 | |
---|
387 | { |
---|
388 | gone_back = (current_subject - max_back < start_subject)? |
---|
389 | current_subject - start_subject : max_back; |
---|
390 | current_subject -= gone_back; |
---|
391 | } |
---|
392 | |
---|
393 | /* Now we can process the individual branches. */ |
---|
394 | |
---|
395 | end_code = this_start_code; |
---|
396 | do |
---|
397 | { |
---|
398 | int back = GET(end_code, 2+LINK_SIZE); |
---|
399 | if (back <= gone_back) |
---|
400 | { |
---|
401 | int bstate = end_code - start_code + 2 + 2*LINK_SIZE; |
---|
402 | ADD_NEW_DATA(-bstate, 0, gone_back - back); |
---|
403 | } |
---|
404 | end_code += GET(end_code, 1); |
---|
405 | } |
---|
406 | while (*end_code == OP_ALT); |
---|
407 | } |
---|
408 | |
---|
409 | /* This is the code for a "normal" subpattern (not a backward assertion). The |
---|
410 | start of a whole pattern is always one of these. If we are at the top level, |
---|
411 | we may be asked to restart matching from the same point that we reached for a |
---|
412 | previous partial match. We still have to scan through the top-level branches to |
---|
413 | find the end state. */ |
---|
414 | |
---|
415 | else |
---|
416 | { |
---|
417 | end_code = this_start_code; |
---|
418 | |
---|
419 | /* Restarting */ |
---|
420 | |
---|
421 | if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) |
---|
422 | { |
---|
423 | do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); |
---|
424 | new_count = workspace[1]; |
---|
425 | if (!workspace[0]) |
---|
426 | memcpy(new_states, active_states, new_count * sizeof(stateblock)); |
---|
427 | } |
---|
428 | |
---|
429 | /* Not restarting */ |
---|
430 | |
---|
431 | else |
---|
432 | { |
---|
433 | int length = 1 + LINK_SIZE + |
---|
434 | ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); |
---|
435 | do |
---|
436 | { |
---|
437 | ADD_NEW(end_code - start_code + length, 0); |
---|
438 | end_code += GET(end_code, 1); |
---|
439 | length = 1 + LINK_SIZE; |
---|
440 | } |
---|
441 | while (*end_code == OP_ALT); |
---|
442 | } |
---|
443 | } |
---|
444 | |
---|
445 | workspace[0] = 0; /* Bit indicating which vector is current */ |
---|
446 | |
---|
447 | DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); |
---|
448 | |
---|
449 | /* Loop for scanning the subject */ |
---|
450 | |
---|
451 | ptr = current_subject; |
---|
452 | for (;;) |
---|
453 | { |
---|
454 | int i, j; |
---|
455 | int clen, dlen; |
---|
456 | unsigned int c, d; |
---|
457 | |
---|
458 | /* Make the new state list into the active state list and empty the |
---|
459 | new state list. */ |
---|
460 | |
---|
461 | temp_states = active_states; |
---|
462 | active_states = new_states; |
---|
463 | new_states = temp_states; |
---|
464 | active_count = new_count; |
---|
465 | new_count = 0; |
---|
466 | |
---|
467 | workspace[0] ^= 1; /* Remember for the restarting feature */ |
---|
468 | workspace[1] = active_count; |
---|
469 | |
---|
470 | #ifdef DEBUG |
---|
471 | printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); |
---|
472 | pchars((uschar *)ptr, strlen((char *)ptr), stdout); |
---|
473 | printf("\"\n"); |
---|
474 | |
---|
475 | printf("%.*sActive states: ", rlevel*2-2, SP); |
---|
476 | for (i = 0; i < active_count; i++) |
---|
477 | printf("%d/%d ", active_states[i].offset, active_states[i].count); |
---|
478 | printf("\n"); |
---|
479 | #endif |
---|
480 | |
---|
481 | /* Set the pointers for adding new states */ |
---|
482 | |
---|
483 | next_active_state = active_states + active_count; |
---|
484 | next_new_state = new_states; |
---|
485 | |
---|
486 | /* Load the current character from the subject outside the loop, as many |
---|
487 | different states may want to look at it, and we assume that at least one |
---|
488 | will. */ |
---|
489 | |
---|
490 | if (ptr < end_subject) |
---|
491 | { |
---|
492 | clen = 1; /* Number of bytes in the character */ |
---|
493 | #ifdef SUPPORT_UTF8 |
---|
494 | if (utf8) { GETCHARLEN(c, ptr, clen); } else |
---|
495 | #endif /* SUPPORT_UTF8 */ |
---|
496 | c = *ptr; |
---|
497 | } |
---|
498 | else |
---|
499 | { |
---|
500 | clen = 0; /* This indicates the end of the subject */ |
---|
501 | c = NOTACHAR; /* This value should never actually be used */ |
---|
502 | } |
---|
503 | |
---|
504 | /* Scan up the active states and act on each one. The result of an action |
---|
505 | may be to add more states to the currently active list (e.g. on hitting a |
---|
506 | parenthesis) or it may be to put states on the new list, for considering |
---|
507 | when we move the character pointer on. */ |
---|
508 | |
---|
509 | for (i = 0; i < active_count; i++) |
---|
510 | { |
---|
511 | stateblock *current_state = active_states + i; |
---|
512 | const uschar *code; |
---|
513 | int state_offset = current_state->offset; |
---|
514 | int count, codevalue; |
---|
515 | #ifdef SUPPORT_UCP |
---|
516 | int chartype, script; |
---|
517 | #endif |
---|
518 | |
---|
519 | #ifdef DEBUG |
---|
520 | printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
---|
521 | if (clen == 0) printf("EOL\n"); |
---|
522 | else if (c > 32 && c < 127) printf("'%c'\n", c); |
---|
523 | else printf("0x%02x\n", c); |
---|
524 | #endif |
---|
525 | |
---|
526 | /* This variable is referred to implicitly in the ADD_xxx macros. */ |
---|
527 | |
---|
528 | ims = current_state->ims; |
---|
529 | |
---|
530 | /* A negative offset is a special case meaning "hold off going to this |
---|
531 | (negated) state until the number of characters in the data field have |
---|
532 | been skipped". */ |
---|
533 | |
---|
534 | if (state_offset < 0) |
---|
535 | { |
---|
536 | if (current_state->data > 0) |
---|
537 | { |
---|
538 | DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); |
---|
539 | ADD_NEW_DATA(state_offset, current_state->count, |
---|
540 | current_state->data - 1); |
---|
541 | continue; |
---|
542 | } |
---|
543 | else |
---|
544 | { |
---|
545 | current_state->offset = state_offset = -state_offset; |
---|
546 | } |
---|
547 | } |
---|
548 | |
---|
549 | /* Check for a duplicate state with the same count, and skip if found. */ |
---|
550 | |
---|
551 | for (j = 0; j < i; j++) |
---|
552 | { |
---|
553 | if (active_states[j].offset == state_offset && |
---|
554 | active_states[j].count == current_state->count) |
---|
555 | { |
---|
556 | DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); |
---|
557 | goto NEXT_ACTIVE_STATE; |
---|
558 | } |
---|
559 | } |
---|
560 | |
---|
561 | /* The state offset is the offset to the opcode */ |
---|
562 | |
---|
563 | code = start_code + state_offset; |
---|
564 | codevalue = *code; |
---|
565 | |
---|
566 | /* If this opcode is followed by an inline character, load it. It is |
---|
567 | tempting to test for the presence of a subject character here, but that |
---|
568 | is wrong, because sometimes zero repetitions of the subject are |
---|
569 | permitted. |
---|
570 | |
---|
571 | We also use this mechanism for opcodes such as OP_TYPEPLUS that take an |
---|
572 | argument that is not a data character - but is always one byte long. We |
---|
573 | have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in |
---|
574 | this case. To keep the other cases fast, convert these ones to new opcodes. |
---|
575 | */ |
---|
576 | |
---|
577 | if (coptable[codevalue] > 0) |
---|
578 | { |
---|
579 | dlen = 1; |
---|
580 | #ifdef SUPPORT_UTF8 |
---|
581 | if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else |
---|
582 | #endif /* SUPPORT_UTF8 */ |
---|
583 | d = code[coptable[codevalue]]; |
---|
584 | if (codevalue >= OP_TYPESTAR) |
---|
585 | { |
---|
586 | switch(d) |
---|
587 | { |
---|
588 | case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; |
---|
589 | case OP_NOTPROP: |
---|
590 | case OP_PROP: codevalue += OP_PROP_EXTRA; break; |
---|
591 | case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; |
---|
592 | case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; |
---|
593 | case OP_NOT_HSPACE: |
---|
594 | case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; |
---|
595 | case OP_NOT_VSPACE: |
---|
596 | case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; |
---|
597 | default: break; |
---|
598 | } |
---|
599 | } |
---|
600 | } |
---|
601 | else |
---|
602 | { |
---|
603 | dlen = 0; /* Not strictly necessary, but compilers moan */ |
---|
604 | d = NOTACHAR; /* if these variables are not set. */ |
---|
605 | } |
---|
606 | |
---|
607 | |
---|
608 | /* Now process the individual opcodes */ |
---|
609 | |
---|
610 | switch (codevalue) |
---|
611 | { |
---|
612 | |
---|
613 | /* ========================================================================== */ |
---|
614 | /* Reached a closing bracket. If not at the end of the pattern, carry |
---|
615 | on with the next opcode. Otherwise, unless we have an empty string and |
---|
616 | PCRE_NOTEMPTY is set, save the match data, shifting up all previous |
---|
617 | matches so we always have the longest first. */ |
---|
618 | |
---|
619 | case OP_KET: |
---|
620 | case OP_KETRMIN: |
---|
621 | case OP_KETRMAX: |
---|
622 | if (code != end_code) |
---|
623 | { |
---|
624 | ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); |
---|
625 | if (codevalue != OP_KET) |
---|
626 | { |
---|
627 | ADD_ACTIVE(state_offset - GET(code, 1), 0); |
---|
628 | } |
---|
629 | } |
---|
630 | else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) |
---|
631 | { |
---|
632 | if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; |
---|
633 | else if (match_count > 0 && ++match_count * 2 >= offsetcount) |
---|
634 | match_count = 0; |
---|
635 | count = ((match_count == 0)? offsetcount : match_count * 2) - 2; |
---|
636 | if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); |
---|
637 | if (offsetcount >= 2) |
---|
638 | { |
---|
639 | offsets[0] = current_subject - start_subject; |
---|
640 | offsets[1] = ptr - start_subject; |
---|
641 | DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, |
---|
642 | offsets[1] - offsets[0], current_subject)); |
---|
643 | } |
---|
644 | if ((md->moptions & PCRE_DFA_SHORTEST) != 0) |
---|
645 | { |
---|
646 | DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
---|
647 | "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, |
---|
648 | match_count, rlevel*2-2, SP)); |
---|
649 | return match_count; |
---|
650 | } |
---|
651 | } |
---|
652 | break; |
---|
653 | |
---|
654 | /* ========================================================================== */ |
---|
655 | /* These opcodes add to the current list of states without looking |
---|
656 | at the current character. */ |
---|
657 | |
---|
658 | /*-----------------------------------------------------------------*/ |
---|
659 | case OP_ALT: |
---|
660 | do { code += GET(code, 1); } while (*code == OP_ALT); |
---|
661 | ADD_ACTIVE(code - start_code, 0); |
---|
662 | break; |
---|
663 | |
---|
664 | /*-----------------------------------------------------------------*/ |
---|
665 | case OP_BRA: |
---|
666 | case OP_SBRA: |
---|
667 | do |
---|
668 | { |
---|
669 | ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
---|
670 | code += GET(code, 1); |
---|
671 | } |
---|
672 | while (*code == OP_ALT); |
---|
673 | break; |
---|
674 | |
---|
675 | /*-----------------------------------------------------------------*/ |
---|
676 | case OP_CBRA: |
---|
677 | case OP_SCBRA: |
---|
678 | ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); |
---|
679 | code += GET(code, 1); |
---|
680 | while (*code == OP_ALT) |
---|
681 | { |
---|
682 | ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
---|
683 | code += GET(code, 1); |
---|
684 | } |
---|
685 | break; |
---|
686 | |
---|
687 | /*-----------------------------------------------------------------*/ |
---|
688 | case OP_BRAZERO: |
---|
689 | case OP_BRAMINZERO: |
---|
690 | ADD_ACTIVE(state_offset + 1, 0); |
---|
691 | code += 1 + GET(code, 2); |
---|
692 | while (*code == OP_ALT) code += GET(code, 1); |
---|
693 | ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
---|
694 | break; |
---|
695 | |
---|
696 | /*-----------------------------------------------------------------*/ |
---|
697 | case OP_SKIPZERO: |
---|
698 | code += 1 + GET(code, 2); |
---|
699 | while (*code == OP_ALT) code += GET(code, 1); |
---|
700 | ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); |
---|
701 | break; |
---|
702 | |
---|
703 | /*-----------------------------------------------------------------*/ |
---|
704 | case OP_CIRC: |
---|
705 | if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
---|
706 | ((ims & PCRE_MULTILINE) != 0 && |
---|
707 | ptr != end_subject && |
---|
708 | WAS_NEWLINE(ptr))) |
---|
709 | { ADD_ACTIVE(state_offset + 1, 0); } |
---|
710 | break; |
---|
711 | |
---|
712 | /*-----------------------------------------------------------------*/ |
---|
713 | case OP_EOD: |
---|
714 | if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } |
---|
715 | break; |
---|
716 | |
---|
717 | /*-----------------------------------------------------------------*/ |
---|
718 | case OP_OPT: |
---|
719 | ims = code[1]; |
---|
720 | ADD_ACTIVE(state_offset + 2, 0); |
---|
721 | break; |
---|
722 | |
---|
723 | /*-----------------------------------------------------------------*/ |
---|
724 | case OP_SOD: |
---|
725 | if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } |
---|
726 | break; |
---|
727 | |
---|
728 | /*-----------------------------------------------------------------*/ |
---|
729 | case OP_SOM: |
---|
730 | if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } |
---|
731 | break; |
---|
732 | |
---|
733 | |
---|
734 | /* ========================================================================== */ |
---|
735 | /* These opcodes inspect the next subject character, and sometimes |
---|
736 | the previous one as well, but do not have an argument. The variable |
---|
737 | clen contains the length of the current character and is zero if we are |
---|
738 | at the end of the subject. */ |
---|
739 | |
---|
740 | /*-----------------------------------------------------------------*/ |
---|
741 | case OP_ANY: |
---|
742 | if (clen > 0 && !IS_NEWLINE(ptr)) |
---|
743 | { ADD_NEW(state_offset + 1, 0); } |
---|
744 | break; |
---|
745 | |
---|
746 | /*-----------------------------------------------------------------*/ |
---|
747 | case OP_ALLANY: |
---|
748 | if (clen > 0) |
---|
749 | { ADD_NEW(state_offset + 1, 0); } |
---|
750 | break; |
---|
751 | |
---|
752 | /*-----------------------------------------------------------------*/ |
---|
753 | case OP_EODN: |
---|
754 | if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) |
---|
755 | { ADD_ACTIVE(state_offset + 1, 0); } |
---|
756 | break; |
---|
757 | |
---|
758 | /*-----------------------------------------------------------------*/ |
---|
759 | case OP_DOLL: |
---|
760 | if ((md->moptions & PCRE_NOTEOL) == 0) |
---|
761 | { |
---|
762 | if (clen == 0 || |
---|
763 | (IS_NEWLINE(ptr) && |
---|
764 | ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
---|
765 | )) |
---|
766 | { ADD_ACTIVE(state_offset + 1, 0); } |
---|
767 | } |
---|
768 | else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) |
---|
769 | { ADD_ACTIVE(state_offset + 1, 0); } |
---|
770 | break; |
---|
771 | |
---|
772 | /*-----------------------------------------------------------------*/ |
---|
773 | |
---|
774 | case OP_DIGIT: |
---|
775 | case OP_WHITESPACE: |
---|
776 | case OP_WORDCHAR: |
---|
777 | if (clen > 0 && c < 256 && |
---|
778 | ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) |
---|
779 | { ADD_NEW(state_offset + 1, 0); } |
---|
780 | break; |
---|
781 | |
---|
782 | /*-----------------------------------------------------------------*/ |
---|
783 | case OP_NOT_DIGIT: |
---|
784 | case OP_NOT_WHITESPACE: |
---|
785 | case OP_NOT_WORDCHAR: |
---|
786 | if (clen > 0 && (c >= 256 || |
---|
787 | ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) |
---|
788 | { ADD_NEW(state_offset + 1, 0); } |
---|
789 | break; |
---|
790 | |
---|
791 | /*-----------------------------------------------------------------*/ |
---|
792 | case OP_WORD_BOUNDARY: |
---|
793 | case OP_NOT_WORD_BOUNDARY: |
---|
794 | { |
---|
795 | int left_word, right_word; |
---|
796 | |
---|
797 | if (ptr > start_subject) |
---|
798 | { |
---|
799 | const uschar *temp = ptr - 1; |
---|
800 | #ifdef SUPPORT_UTF8 |
---|
801 | if (utf8) BACKCHAR(temp); |
---|
802 | #endif |
---|
803 | GETCHARTEST(d, temp); |
---|
804 | left_word = d < 256 && (ctypes[d] & ctype_word) != 0; |
---|
805 | } |
---|
806 | else left_word = 0; |
---|
807 | |
---|
808 | if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; |
---|
809 | else right_word = 0; |
---|
810 | |
---|
811 | if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) |
---|
812 | { ADD_ACTIVE(state_offset + 1, 0); } |
---|
813 | } |
---|
814 | break; |
---|
815 | |
---|
816 | |
---|
817 | /*-----------------------------------------------------------------*/ |
---|
818 | /* Check the next character by Unicode property. We will get here only |
---|
819 | if the support is in the binary; otherwise a compile-time error occurs. |
---|
820 | */ |
---|
821 | |
---|
822 | #ifdef SUPPORT_UCP |
---|
823 | case OP_PROP: |
---|
824 | case OP_NOTPROP: |
---|
825 | if (clen > 0) |
---|
826 | { |
---|
827 | BOOL OK; |
---|
828 | int category = _pcre_ucp_findprop(c, &chartype, &script); |
---|
829 | switch(code[1]) |
---|
830 | { |
---|
831 | case PT_ANY: |
---|
832 | OK = TRUE; |
---|
833 | break; |
---|
834 | |
---|
835 | case PT_LAMP: |
---|
836 | OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
---|
837 | break; |
---|
838 | |
---|
839 | case PT_GC: |
---|
840 | OK = category == code[2]; |
---|
841 | break; |
---|
842 | |
---|
843 | case PT_PC: |
---|
844 | OK = chartype == code[2]; |
---|
845 | break; |
---|
846 | |
---|
847 | case PT_SC: |
---|
848 | OK = script == code[2]; |
---|
849 | break; |
---|
850 | |
---|
851 | /* Should never occur, but keep compilers from grumbling. */ |
---|
852 | |
---|
853 | default: |
---|
854 | OK = codevalue != OP_PROP; |
---|
855 | break; |
---|
856 | } |
---|
857 | |
---|
858 | if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } |
---|
859 | } |
---|
860 | break; |
---|
861 | #endif |
---|
862 | |
---|
863 | |
---|
864 | |
---|
865 | /* ========================================================================== */ |
---|
866 | /* These opcodes likewise inspect the subject character, but have an |
---|
867 | argument that is not a data character. It is one of these opcodes: |
---|
868 | OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, |
---|
869 | OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ |
---|
870 | |
---|
871 | case OP_TYPEPLUS: |
---|
872 | case OP_TYPEMINPLUS: |
---|
873 | case OP_TYPEPOSPLUS: |
---|
874 | count = current_state->count; /* Already matched */ |
---|
875 | if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
---|
876 | if (clen > 0) |
---|
877 | { |
---|
878 | if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
---|
879 | (c < 256 && |
---|
880 | (d != OP_ANY || !IS_NEWLINE(ptr)) && |
---|
881 | ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
---|
882 | { |
---|
883 | if (count > 0 && codevalue == OP_TYPEPOSPLUS) |
---|
884 | { |
---|
885 | active_count--; /* Remove non-match possibility */ |
---|
886 | next_active_state--; |
---|
887 | } |
---|
888 | count++; |
---|
889 | ADD_NEW(state_offset, count); |
---|
890 | } |
---|
891 | } |
---|
892 | break; |
---|
893 | |
---|
894 | /*-----------------------------------------------------------------*/ |
---|
895 | case OP_TYPEQUERY: |
---|
896 | case OP_TYPEMINQUERY: |
---|
897 | case OP_TYPEPOSQUERY: |
---|
898 | ADD_ACTIVE(state_offset + 2, 0); |
---|
899 | if (clen > 0) |
---|
900 | { |
---|
901 | if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
---|
902 | (c < 256 && |
---|
903 | (d != OP_ANY || !IS_NEWLINE(ptr)) && |
---|
904 | ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
---|
905 | { |
---|
906 | if (codevalue == OP_TYPEPOSQUERY) |
---|
907 | { |
---|
908 | active_count--; /* Remove non-match possibility */ |
---|
909 | next_active_state--; |
---|
910 | } |
---|
911 | ADD_NEW(state_offset + 2, 0); |
---|
912 | } |
---|
913 | } |
---|
914 | break; |
---|
915 | |
---|
916 | /*-----------------------------------------------------------------*/ |
---|
917 | case OP_TYPESTAR: |
---|
918 | case OP_TYPEMINSTAR: |
---|
919 | case OP_TYPEPOSSTAR: |
---|
920 | ADD_ACTIVE(state_offset + 2, 0); |
---|
921 | if (clen > 0) |
---|
922 | { |
---|
923 | if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
---|
924 | (c < 256 && |
---|
925 | (d != OP_ANY || !IS_NEWLINE(ptr)) && |
---|
926 | ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
---|
927 | { |
---|
928 | if (codevalue == OP_TYPEPOSSTAR) |
---|
929 | { |
---|
930 | active_count--; /* Remove non-match possibility */ |
---|
931 | next_active_state--; |
---|
932 | } |
---|
933 | ADD_NEW(state_offset, 0); |
---|
934 | } |
---|
935 | } |
---|
936 | break; |
---|
937 | |
---|
938 | /*-----------------------------------------------------------------*/ |
---|
939 | case OP_TYPEEXACT: |
---|
940 | count = current_state->count; /* Number already matched */ |
---|
941 | if (clen > 0) |
---|
942 | { |
---|
943 | if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
---|
944 | (c < 256 && |
---|
945 | (d != OP_ANY || !IS_NEWLINE(ptr)) && |
---|
946 | ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
---|
947 | { |
---|
948 | if (++count >= GET2(code, 1)) |
---|
949 | { ADD_NEW(state_offset + 4, 0); } |
---|
950 | else |
---|
951 | { ADD_NEW(state_offset, count); } |
---|
952 | } |
---|
953 | } |
---|
954 | break; |
---|
955 | |
---|
956 | /*-----------------------------------------------------------------*/ |
---|
957 | case OP_TYPEUPTO: |
---|
958 | case OP_TYPEMINUPTO: |
---|
959 | case OP_TYPEPOSUPTO: |
---|
960 | ADD_ACTIVE(state_offset + 4, 0); |
---|
961 | count = current_state->count; /* Number already matched */ |
---|
962 | if (clen > 0) |
---|
963 | { |
---|
964 | if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
---|
965 | (c < 256 && |
---|
966 | (d != OP_ANY || !IS_NEWLINE(ptr)) && |
---|
967 | ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
---|
968 | { |
---|
969 | if (codevalue == OP_TYPEPOSUPTO) |
---|
970 | { |
---|
971 | active_count--; /* Remove non-match possibility */ |
---|
972 | next_active_state--; |
---|
973 | } |
---|
974 | if (++count >= GET2(code, 1)) |
---|
975 | { ADD_NEW(state_offset + 4, 0); } |
---|
976 | else |
---|
977 | { ADD_NEW(state_offset, count); } |
---|
978 | } |
---|
979 | } |
---|
980 | break; |
---|
981 | |
---|
982 | /* ========================================================================== */ |
---|
983 | /* These are virtual opcodes that are used when something like |
---|
984 | OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or |
---|
985 | OP_HSPACE (or the negated form of either) as its argument. It keeps the |
---|
986 | code above fast for the other cases. The argument is in the d variable. */ |
---|
987 | |
---|
988 | #ifdef SUPPORT_UCP |
---|
989 | case OP_PROP_EXTRA + OP_TYPEPLUS: |
---|
990 | case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
---|
991 | case OP_PROP_EXTRA + OP_TYPEPOSPLUS: |
---|
992 | count = current_state->count; /* Already matched */ |
---|
993 | if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } |
---|
994 | if (clen > 0) |
---|
995 | { |
---|
996 | BOOL OK; |
---|
997 | int category = _pcre_ucp_findprop(c, &chartype, &script); |
---|
998 | switch(code[2]) |
---|
999 | { |
---|
1000 | case PT_ANY: |
---|
1001 | OK = TRUE; |
---|
1002 | break; |
---|
1003 | |
---|
1004 | case PT_LAMP: |
---|
1005 | OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
---|
1006 | break; |
---|
1007 | |
---|
1008 | case PT_GC: |
---|
1009 | OK = category == code[3]; |
---|
1010 | break; |
---|
1011 | |
---|
1012 | case PT_PC: |
---|
1013 | OK = chartype == code[3]; |
---|
1014 | break; |
---|
1015 | |
---|
1016 | case PT_SC: |
---|
1017 | OK = script == code[3]; |
---|
1018 | break; |
---|
1019 | |
---|
1020 | /* Should never occur, but keep compilers from grumbling. */ |
---|
1021 | |
---|
1022 | default: |
---|
1023 | OK = codevalue != OP_PROP; |
---|
1024 | break; |
---|
1025 | } |
---|
1026 | |
---|
1027 | if (OK == (d == OP_PROP)) |
---|
1028 | { |
---|
1029 | if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) |
---|
1030 | { |
---|
1031 | active_count--; /* Remove non-match possibility */ |
---|
1032 | next_active_state--; |
---|
1033 | } |
---|
1034 | count++; |
---|
1035 | ADD_NEW(state_offset, count); |
---|
1036 | } |
---|
1037 | } |
---|
1038 | break; |
---|
1039 | |
---|
1040 | /*-----------------------------------------------------------------*/ |
---|
1041 | case OP_EXTUNI_EXTRA + OP_TYPEPLUS: |
---|
1042 | case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
---|
1043 | case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: |
---|
1044 | count = current_state->count; /* Already matched */ |
---|
1045 | if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
---|
1046 | if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
---|
1047 | { |
---|
1048 | const uschar *nptr = ptr + clen; |
---|
1049 | int ncount = 0; |
---|
1050 | if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) |
---|
1051 | { |
---|
1052 | active_count--; /* Remove non-match possibility */ |
---|
1053 | next_active_state--; |
---|
1054 | } |
---|
1055 | while (nptr < end_subject) |
---|
1056 | { |
---|
1057 | int nd; |
---|
1058 | int ndlen = 1; |
---|
1059 | GETCHARLEN(nd, nptr, ndlen); |
---|
1060 | if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
---|
1061 | ncount++; |
---|
1062 | nptr += ndlen; |
---|
1063 | } |
---|
1064 | count++; |
---|
1065 | ADD_NEW_DATA(-state_offset, count, ncount); |
---|
1066 | } |
---|
1067 | break; |
---|
1068 | #endif |
---|
1069 | |
---|
1070 | /*-----------------------------------------------------------------*/ |
---|
1071 | case OP_ANYNL_EXTRA + OP_TYPEPLUS: |
---|
1072 | case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: |
---|
1073 | case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: |
---|
1074 | count = current_state->count; /* Already matched */ |
---|
1075 | if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
---|
1076 | if (clen > 0) |
---|
1077 | { |
---|
1078 | int ncount = 0; |
---|
1079 | switch (c) |
---|
1080 | { |
---|
1081 | case 0x000b: |
---|
1082 | case 0x000c: |
---|
1083 | case 0x0085: |
---|
1084 | case 0x2028: |
---|
1085 | case 0x2029: |
---|
1086 | if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
---|
1087 | goto ANYNL01; |
---|
1088 | |
---|
1089 | case 0x000d: |
---|
1090 | if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
---|
1091 | /* Fall through */ |
---|
1092 | |
---|
1093 | ANYNL01: |
---|
1094 | case 0x000a: |
---|
1095 | if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) |
---|
1096 | { |
---|
1097 | active_count--; /* Remove non-match possibility */ |
---|
1098 | next_active_state--; |
---|
1099 | } |
---|
1100 | count++; |
---|
1101 | ADD_NEW_DATA(-state_offset, count, ncount); |
---|
1102 | break; |
---|
1103 | |
---|
1104 | default: |
---|
1105 | break; |
---|
1106 | } |
---|
1107 | } |
---|
1108 | break; |
---|
1109 | |
---|
1110 | /*-----------------------------------------------------------------*/ |
---|
1111 | case OP_VSPACE_EXTRA + OP_TYPEPLUS: |
---|
1112 | case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: |
---|
1113 | case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: |
---|
1114 | count = current_state->count; /* Already matched */ |
---|
1115 | if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
---|
1116 | if (clen > 0) |
---|
1117 | { |
---|
1118 | BOOL OK; |
---|
1119 | switch (c) |
---|
1120 | { |
---|
1121 | case 0x000a: |
---|
1122 | case 0x000b: |
---|
1123 | case 0x000c: |
---|
1124 | case 0x000d: |
---|
1125 | case 0x0085: |
---|
1126 | case 0x2028: |
---|
1127 | case 0x2029: |
---|
1128 | OK = TRUE; |
---|
1129 | break; |
---|
1130 | |
---|
1131 | default: |
---|
1132 | OK = FALSE; |
---|
1133 | break; |
---|
1134 | } |
---|
1135 | |
---|
1136 | if (OK == (d == OP_VSPACE)) |
---|
1137 | { |
---|
1138 | if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) |
---|
1139 | { |
---|
1140 | active_count--; /* Remove non-match possibility */ |
---|
1141 | next_active_state--; |
---|
1142 | } |
---|
1143 | count++; |
---|
1144 | ADD_NEW_DATA(-state_offset, count, 0); |
---|
1145 | } |
---|
1146 | } |
---|
1147 | break; |
---|
1148 | |
---|
1149 | /*-----------------------------------------------------------------*/ |
---|
1150 | case OP_HSPACE_EXTRA + OP_TYPEPLUS: |
---|
1151 | case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: |
---|
1152 | case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: |
---|
1153 | count = current_state->count; /* Already matched */ |
---|
1154 | if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
---|
1155 | if (clen > 0) |
---|
1156 | { |
---|
1157 | BOOL OK; |
---|
1158 | switch (c) |
---|
1159 | { |
---|
1160 | case 0x09: /* HT */ |
---|
1161 | case 0x20: /* SPACE */ |
---|
1162 | case 0xa0: /* NBSP */ |
---|
1163 | case 0x1680: /* OGHAM SPACE MARK */ |
---|
1164 | case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
---|
1165 | case 0x2000: /* EN QUAD */ |
---|
1166 | case 0x2001: /* EM QUAD */ |
---|
1167 | case 0x2002: /* EN SPACE */ |
---|
1168 | case 0x2003: /* EM SPACE */ |
---|
1169 | case 0x2004: /* THREE-PER-EM SPACE */ |
---|
1170 | case 0x2005: /* FOUR-PER-EM SPACE */ |
---|
1171 | case 0x2006: /* SIX-PER-EM SPACE */ |
---|
1172 | case 0x2007: /* FIGURE SPACE */ |
---|
1173 | case 0x2008: /* PUNCTUATION SPACE */ |
---|
1174 | case 0x2009: /* THIN SPACE */ |
---|
1175 | case 0x200A: /* HAIR SPACE */ |
---|
1176 | case 0x202f: /* NARROW NO-BREAK SPACE */ |
---|
1177 | case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
---|
1178 | case 0x3000: /* IDEOGRAPHIC SPACE */ |
---|
1179 | OK = TRUE; |
---|
1180 | break; |
---|
1181 | |
---|
1182 | default: |
---|
1183 | OK = FALSE; |
---|
1184 | break; |
---|
1185 | } |
---|
1186 | |
---|
1187 | if (OK == (d == OP_HSPACE)) |
---|
1188 | { |
---|
1189 | if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) |
---|
1190 | { |
---|
1191 | active_count--; /* Remove non-match possibility */ |
---|
1192 | next_active_state--; |
---|
1193 | } |
---|
1194 | count++; |
---|
1195 | ADD_NEW_DATA(-state_offset, count, 0); |
---|
1196 | } |
---|
1197 | } |
---|
1198 | break; |
---|
1199 | |
---|
1200 | /*-----------------------------------------------------------------*/ |
---|
1201 | #ifdef SUPPORT_UCP |
---|
1202 | case OP_PROP_EXTRA + OP_TYPEQUERY: |
---|
1203 | case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
---|
1204 | case OP_PROP_EXTRA + OP_TYPEPOSQUERY: |
---|
1205 | count = 4; |
---|
1206 | goto QS1; |
---|
1207 | |
---|
1208 | case OP_PROP_EXTRA + OP_TYPESTAR: |
---|
1209 | case OP_PROP_EXTRA + OP_TYPEMINSTAR: |
---|
1210 | case OP_PROP_EXTRA + OP_TYPEPOSSTAR: |
---|
1211 | count = 0; |
---|
1212 | |
---|
1213 | QS1: |
---|
1214 | |
---|
1215 | ADD_ACTIVE(state_offset + 4, 0); |
---|
1216 | if (clen > 0) |
---|
1217 | { |
---|
1218 | BOOL OK; |
---|
1219 | int category = _pcre_ucp_findprop(c, &chartype, &script); |
---|
1220 | switch(code[2]) |
---|
1221 | { |
---|
1222 | case PT_ANY: |
---|
1223 | OK = TRUE; |
---|
1224 | break; |
---|
1225 | |
---|
1226 | case PT_LAMP: |
---|
1227 | OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
---|
1228 | break; |
---|
1229 | |
---|
1230 | case PT_GC: |
---|
1231 | OK = category == code[3]; |
---|
1232 | break; |
---|
1233 | |
---|
1234 | case PT_PC: |
---|
1235 | OK = chartype == code[3]; |
---|
1236 | break; |
---|
1237 | |
---|
1238 | case PT_SC: |
---|
1239 | OK = script == code[3]; |
---|
1240 | break; |
---|
1241 | |
---|
1242 | /* Should never occur, but keep compilers from grumbling. */ |
---|
1243 | |
---|
1244 | default: |
---|
1245 | OK = codevalue != OP_PROP; |
---|
1246 | break; |
---|
1247 | } |
---|
1248 | |
---|
1249 | if (OK == (d == OP_PROP)) |
---|
1250 | { |
---|
1251 | if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || |
---|
1252 | codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) |
---|
1253 | { |
---|
1254 | active_count--; /* Remove non-match possibility */ |
---|
1255 | next_active_state--; |
---|
1256 | } |
---|
1257 | ADD_NEW(state_offset + count, 0); |
---|
1258 | } |
---|
1259 | } |
---|
1260 | break; |
---|
1261 | |
---|
1262 | /*-----------------------------------------------------------------*/ |
---|
1263 | case OP_EXTUNI_EXTRA + OP_TYPEQUERY: |
---|
1264 | case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: |
---|
1265 | case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: |
---|
1266 | count = 2; |
---|
1267 | goto QS2; |
---|
1268 | |
---|
1269 | case OP_EXTUNI_EXTRA + OP_TYPESTAR: |
---|
1270 | case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: |
---|
1271 | case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: |
---|
1272 | count = 0; |
---|
1273 | |
---|
1274 | QS2: |
---|
1275 | |
---|
1276 | ADD_ACTIVE(state_offset + 2, 0); |
---|
1277 | if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
---|
1278 | { |
---|
1279 | const uschar *nptr = ptr + clen; |
---|
1280 | int ncount = 0; |
---|
1281 | if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || |
---|
1282 | codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) |
---|
1283 | { |
---|
1284 | active_count--; /* Remove non-match possibility */ |
---|
1285 | next_active_state--; |
---|
1286 | } |
---|
1287 | while (nptr < end_subject) |
---|
1288 | { |
---|
1289 | int nd; |
---|
1290 | int ndlen = 1; |
---|
1291 | GETCHARLEN(nd, nptr, ndlen); |
---|
1292 | if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
---|
1293 | ncount++; |
---|
1294 | nptr += ndlen; |
---|
1295 | } |
---|
1296 | ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
---|
1297 | } |
---|
1298 | break; |
---|
1299 | #endif |
---|
1300 | |
---|
1301 | /*-----------------------------------------------------------------*/ |
---|
1302 | case OP_ANYNL_EXTRA + OP_TYPEQUERY: |
---|
1303 | case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: |
---|
1304 | case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: |
---|
1305 | count = 2; |
---|
1306 | goto QS3; |
---|
1307 | |
---|
1308 | case OP_ANYNL_EXTRA + OP_TYPESTAR: |
---|
1309 | case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: |
---|
1310 | case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: |
---|
1311 | count = 0; |
---|
1312 | |
---|
1313 | QS3: |
---|
1314 | ADD_ACTIVE(state_offset + 2, 0); |
---|
1315 | if (clen > 0) |
---|
1316 | { |
---|
1317 | int ncount = 0; |
---|
1318 | switch (c) |
---|
1319 | { |
---|
1320 | case 0x000b: |
---|
1321 | case 0x000c: |
---|
1322 | case 0x0085: |
---|
1323 | case 0x2028: |
---|
1324 | case 0x2029: |
---|
1325 | if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
---|
1326 | goto ANYNL02; |
---|
1327 | |
---|
1328 | case 0x000d: |
---|
1329 | if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
---|
1330 | /* Fall through */ |
---|
1331 | |
---|
1332 | ANYNL02: |
---|
1333 | case 0x000a: |
---|
1334 | if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || |
---|
1335 | codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) |
---|
1336 | { |
---|
1337 | active_count--; /* Remove non-match possibility */ |
---|
1338 | next_active_state--; |
---|
1339 | } |
---|
1340 | ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
---|
1341 | break; |
---|
1342 | |
---|
1343 | default: |
---|
1344 | break; |
---|
1345 | } |
---|
1346 | } |
---|
1347 | break; |
---|
1348 | |
---|
1349 | /*-----------------------------------------------------------------*/ |
---|
1350 | case OP_VSPACE_EXTRA + OP_TYPEQUERY: |
---|
1351 | case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: |
---|
1352 | case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: |
---|
1353 | count = 2; |
---|
1354 | goto QS4; |
---|
1355 | |
---|
1356 | case OP_VSPACE_EXTRA + OP_TYPESTAR: |
---|
1357 | case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: |
---|
1358 | case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: |
---|
1359 | count = 0; |
---|
1360 | |
---|
1361 | QS4: |
---|
1362 | ADD_ACTIVE(state_offset + 2, 0); |
---|
1363 | if (clen > 0) |
---|
1364 | { |
---|
1365 | BOOL OK; |
---|
1366 | switch (c) |
---|
1367 | { |
---|
1368 | case 0x000a: |
---|
1369 | case 0x000b: |
---|
1370 | case 0x000c: |
---|
1371 | case 0x000d: |
---|
1372 | case 0x0085: |
---|
1373 | case 0x2028: |
---|
1374 | case 0x2029: |
---|
1375 | OK = TRUE; |
---|
1376 | break; |
---|
1377 | |
---|
1378 | default: |
---|
1379 | OK = FALSE; |
---|
1380 | break; |
---|
1381 | } |
---|
1382 | if (OK == (d == OP_VSPACE)) |
---|
1383 | { |
---|
1384 | if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || |
---|
1385 | codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) |
---|
1386 | { |
---|
1387 | active_count--; /* Remove non-match possibility */ |
---|
1388 | next_active_state--; |
---|
1389 | } |
---|
1390 | ADD_NEW_DATA(-(state_offset + count), 0, 0); |
---|
1391 | } |
---|
1392 | } |
---|
1393 | break; |
---|
1394 | |
---|
1395 | /*-----------------------------------------------------------------*/ |
---|
1396 | case OP_HSPACE_EXTRA + OP_TYPEQUERY: |
---|
1397 | case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: |
---|
1398 | case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: |
---|
1399 | count = 2; |
---|
1400 | goto QS5; |
---|
1401 | |
---|
1402 | case OP_HSPACE_EXTRA + OP_TYPESTAR: |
---|
1403 | case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: |
---|
1404 | case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: |
---|
1405 | count = 0; |
---|
1406 | |
---|
1407 | QS5: |
---|
1408 | ADD_ACTIVE(state_offset + 2, 0); |
---|
1409 | if (clen > 0) |
---|
1410 | { |
---|
1411 | BOOL OK; |
---|
1412 | switch (c) |
---|
1413 | { |
---|
1414 | case 0x09: /* HT */ |
---|
1415 | case 0x20: /* SPACE */ |
---|
1416 | case 0xa0: /* NBSP */ |
---|
1417 | case 0x1680: /* OGHAM SPACE MARK */ |
---|
1418 | case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
---|
1419 | case 0x2000: /* EN QUAD */ |
---|
1420 | case 0x2001: /* EM QUAD */ |
---|
1421 | case 0x2002: /* EN SPACE */ |
---|
1422 | case 0x2003: /* EM SPACE */ |
---|
1423 | case 0x2004: /* THREE-PER-EM SPACE */ |
---|
1424 | case 0x2005: /* FOUR-PER-EM SPACE */ |
---|
1425 | case 0x2006: /* SIX-PER-EM SPACE */ |
---|
1426 | case 0x2007: /* FIGURE SPACE */ |
---|
1427 | case 0x2008: /* PUNCTUATION SPACE */ |
---|
1428 | case 0x2009: /* THIN SPACE */ |
---|
1429 | case 0x200A: /* HAIR SPACE */ |
---|
1430 | case 0x202f: /* NARROW NO-BREAK SPACE */ |
---|
1431 | case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
---|
1432 | case 0x3000: /* IDEOGRAPHIC SPACE */ |
---|
1433 | OK = TRUE; |
---|
1434 | break; |
---|
1435 | |
---|
1436 | default: |
---|
1437 | OK = FALSE; |
---|
1438 | break; |
---|
1439 | } |
---|
1440 | |
---|
1441 | if (OK == (d == OP_HSPACE)) |
---|
1442 | { |
---|
1443 | if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || |
---|
1444 | codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) |
---|
1445 | { |
---|
1446 | active_count--; /* Remove non-match possibility */ |
---|
1447 | next_active_state--; |
---|
1448 | } |
---|
1449 | ADD_NEW_DATA(-(state_offset + count), 0, 0); |
---|
1450 | } |
---|
1451 | } |
---|
1452 | break; |
---|
1453 | |
---|
1454 | /*-----------------------------------------------------------------*/ |
---|
1455 | #ifdef SUPPORT_UCP |
---|
1456 | case OP_PROP_EXTRA + OP_TYPEEXACT: |
---|
1457 | case OP_PROP_EXTRA + OP_TYPEUPTO: |
---|
1458 | case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
---|
1459 | case OP_PROP_EXTRA + OP_TYPEPOSUPTO: |
---|
1460 | if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
---|
1461 | { ADD_ACTIVE(state_offset + 6, 0); } |
---|
1462 | count = current_state->count; /* Number already matched */ |
---|
1463 | if (clen > 0) |
---|
1464 | { |
---|
1465 | BOOL OK; |
---|
1466 | int category = _pcre_ucp_findprop(c, &chartype, &script); |
---|
1467 | switch(code[4]) |
---|
1468 | { |
---|
1469 | case PT_ANY: |
---|
1470 | OK = TRUE; |
---|
1471 | break; |
---|
1472 | |
---|
1473 | case PT_LAMP: |
---|
1474 | OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
---|
1475 | break; |
---|
1476 | |
---|
1477 | case PT_GC: |
---|
1478 | OK = category == code[5]; |
---|
1479 | break; |
---|
1480 | |
---|
1481 | case PT_PC: |
---|
1482 | OK = chartype == code[5]; |
---|
1483 | break; |
---|
1484 | |
---|
1485 | case PT_SC: |
---|
1486 | OK = script == code[5]; |
---|
1487 | break; |
---|
1488 | |
---|
1489 | /* Should never occur, but keep compilers from grumbling. */ |
---|
1490 | |
---|
1491 | default: |
---|
1492 | OK = codevalue != OP_PROP; |
---|
1493 | break; |
---|
1494 | } |
---|
1495 | |
---|
1496 | if (OK == (d == OP_PROP)) |
---|
1497 | { |
---|
1498 | if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) |
---|
1499 | { |
---|
1500 | active_count--; /* Remove non-match possibility */ |
---|
1501 | next_active_state--; |
---|
1502 | } |
---|
1503 | if (++count >= GET2(code, 1)) |
---|
1504 | { ADD_NEW(state_offset + 6, 0); } |
---|
1505 | else |
---|
1506 | { ADD_NEW(state_offset, count); } |
---|
1507 | } |
---|
1508 | } |
---|
1509 | break; |
---|
1510 | |
---|
1511 | /*-----------------------------------------------------------------*/ |
---|
1512 | case OP_EXTUNI_EXTRA + OP_TYPEEXACT: |
---|
1513 | case OP_EXTUNI_EXTRA + OP_TYPEUPTO: |
---|
1514 | case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: |
---|
1515 | case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: |
---|
1516 | if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
---|
1517 | { ADD_ACTIVE(state_offset + 4, 0); } |
---|
1518 | count = current_state->count; /* Number already matched */ |
---|
1519 | if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
---|
1520 | { |
---|
1521 | const uschar *nptr = ptr + clen; |
---|
1522 | int ncount = 0; |
---|
1523 | if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) |
---|
1524 | { |
---|
1525 | active_count--; /* Remove non-match possibility */ |
---|
1526 | next_active_state--; |
---|
1527 | } |
---|
1528 | while (nptr < end_subject) |
---|
1529 | { |
---|
1530 | int nd; |
---|
1531 | int ndlen = 1; |
---|
1532 | GETCHARLEN(nd, nptr, ndlen); |
---|
1533 | if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
---|
1534 | ncount++; |
---|
1535 | nptr += ndlen; |
---|
1536 | } |
---|
1537 | if (++count >= GET2(code, 1)) |
---|
1538 | { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } |
---|
1539 | else |
---|
1540 | { ADD_NEW_DATA(-state_offset, count, ncount); } |
---|
1541 | } |
---|
1542 | break; |
---|
1543 | #endif |
---|
1544 | |
---|
1545 | /*-----------------------------------------------------------------*/ |
---|
1546 | case OP_ANYNL_EXTRA + OP_TYPEEXACT: |
---|
1547 | case OP_ANYNL_EXTRA + OP_TYPEUPTO: |
---|
1548 | case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: |
---|
1549 | case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: |
---|
1550 | if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) |
---|
1551 | { ADD_ACTIVE(state_offset + 4, 0); } |
---|
1552 | count = current_state->count; /* Number already matched */ |
---|
1553 | if (clen > 0) |
---|
1554 | { |
---|
1555 | int ncount = 0; |
---|
1556 | switch (c) |
---|
1557 | { |
---|
1558 | case 0x000b: |
---|
1559 | case 0x000c: |
---|
1560 | case 0x0085: |
---|
1561 | case 0x2028: |
---|
1562 | case 0x2029: |
---|
1563 | if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
---|
1564 | goto ANYNL03; |
---|
1565 | |
---|
1566 | case 0x000d: |
---|
1567 | if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; |
---|
1568 | /* Fall through */ |
---|
1569 | |
---|
1570 | ANYNL03: |
---|
1571 | case 0x000a: |
---|
1572 | if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) |
---|
1573 | { |
---|
1574 | active_count--; /* Remove non-match possibility */ |
---|
1575 | next_active_state--; |
---|
1576 | } |
---|
1577 | if (++count >= GET2(code, 1)) |
---|
1578 | { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } |
---|
1579 | else |
---|
1580 | { ADD_NEW_DATA(-state_offset, count, ncount); } |
---|
1581 | break; |
---|
1582 | |
---|
1583 | default: |
---|
1584 | break; |
---|
1585 | } |
---|
1586 | } |
---|
1587 | break; |
---|
1588 | |
---|
1589 | /*-----------------------------------------------------------------*/ |
---|
1590 | case OP_VSPACE_EXTRA + OP_TYPEEXACT: |
---|
1591 | case OP_VSPACE_EXTRA + OP_TYPEUPTO: |
---|
1592 | case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: |
---|
1593 | case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: |
---|
1594 | if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) |
---|
1595 | { ADD_ACTIVE(state_offset + 4, 0); } |
---|
1596 | count = current_state->count; /* Number already matched */ |
---|
1597 | if (clen > 0) |
---|
1598 | { |
---|
1599 | BOOL OK; |
---|
1600 | switch (c) |
---|
1601 | { |
---|
1602 | case 0x000a: |
---|
1603 | case 0x000b: |
---|
1604 | case 0x000c: |
---|
1605 | case 0x000d: |
---|
1606 | case 0x0085: |
---|
1607 | case 0x2028: |
---|
1608 | case 0x2029: |
---|
1609 | OK = TRUE; |
---|
1610 | break; |
---|
1611 | |
---|
1612 | default: |
---|
1613 | OK = FALSE; |
---|
1614 | } |
---|
1615 | |
---|
1616 | if (OK == (d == OP_VSPACE)) |
---|
1617 | { |
---|
1618 | if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) |
---|
1619 | { |
---|
1620 | active_count--; /* Remove non-match possibility */ |
---|
1621 | next_active_state--; |
---|
1622 | } |
---|
1623 | if (++count >= GET2(code, 1)) |
---|
1624 | { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } |
---|
1625 | else |
---|
1626 | { ADD_NEW_DATA(-state_offset, count, 0); } |
---|
1627 | } |
---|
1628 | } |
---|
1629 | break; |
---|
1630 | |
---|
1631 | /*-----------------------------------------------------------------*/ |
---|
1632 | case OP_HSPACE_EXTRA + OP_TYPEEXACT: |
---|
1633 | case OP_HSPACE_EXTRA + OP_TYPEUPTO: |
---|
1634 | case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: |
---|
1635 | case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: |
---|
1636 | if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) |
---|
1637 | { ADD_ACTIVE(state_offset + 4, 0); } |
---|
1638 | count = current_state->count; /* Number already matched */ |
---|
1639 | if (clen > 0) |
---|
1640 | { |
---|
1641 | BOOL OK; |
---|
1642 | switch (c) |
---|
1643 | { |
---|
1644 | case 0x09: /* HT */ |
---|
1645 | case 0x20: /* SPACE */ |
---|
1646 | case 0xa0: /* NBSP */ |
---|
1647 | case 0x1680: /* OGHAM SPACE MARK */ |
---|
1648 | case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
---|
1649 | case 0x2000: /* EN QUAD */ |
---|
1650 | case 0x2001: /* EM QUAD */ |
---|
1651 | case 0x2002: /* EN SPACE */ |
---|
1652 | case 0x2003: /* EM SPACE */ |
---|
1653 | case 0x2004: /* THREE-PER-EM SPACE */ |
---|
1654 | case 0x2005: /* FOUR-PER-EM SPACE */ |
---|
1655 | case 0x2006: /* SIX-PER-EM SPACE */ |
---|
1656 | case 0x2007: /* FIGURE SPACE */ |
---|
1657 | case 0x2008: /* PUNCTUATION SPACE */ |
---|
1658 | case 0x2009: /* THIN SPACE */ |
---|
1659 | case 0x200A: /* HAIR SPACE */ |
---|
1660 | case 0x202f: /* NARROW NO-BREAK SPACE */ |
---|
1661 | case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
---|
1662 | case 0x3000: /* IDEOGRAPHIC SPACE */ |
---|
1663 | OK = TRUE; |
---|
1664 | break; |
---|
1665 | |
---|
1666 | default: |
---|
1667 | OK = FALSE; |
---|
1668 | break; |
---|
1669 | } |
---|
1670 | |
---|
1671 | if (OK == (d == OP_HSPACE)) |
---|
1672 | { |
---|
1673 | if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) |
---|
1674 | { |
---|
1675 | active_count--; /* Remove non-match possibility */ |
---|
1676 | next_active_state--; |
---|
1677 | } |
---|
1678 | if (++count >= GET2(code, 1)) |
---|
1679 | { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } |
---|
1680 | else |
---|
1681 | { ADD_NEW_DATA(-state_offset, count, 0); } |
---|
1682 | } |
---|
1683 | } |
---|
1684 | break; |
---|
1685 | |
---|
1686 | /* ========================================================================== */ |
---|
1687 | /* These opcodes are followed by a character that is usually compared |
---|
1688 | to the current subject character; it is loaded into d. We still get |
---|
1689 | here even if there is no subject character, because in some cases zero |
---|
1690 | repetitions are permitted. */ |
---|
1691 | |
---|
1692 | /*-----------------------------------------------------------------*/ |
---|
1693 | case OP_CHAR: |
---|
1694 | if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } |
---|
1695 | break; |
---|
1696 | |
---|
1697 | /*-----------------------------------------------------------------*/ |
---|
1698 | case OP_CHARNC: |
---|
1699 | if (clen == 0) break; |
---|
1700 | |
---|
1701 | #ifdef SUPPORT_UTF8 |
---|
1702 | if (utf8) |
---|
1703 | { |
---|
1704 | if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
---|
1705 | { |
---|
1706 | unsigned int othercase; |
---|
1707 | if (c < 128) othercase = fcc[c]; else |
---|
1708 | |
---|
1709 | /* If we have Unicode property support, we can use it to test the |
---|
1710 | other case of the character. */ |
---|
1711 | |
---|
1712 | #ifdef SUPPORT_UCP |
---|
1713 | othercase = _pcre_ucp_othercase(c); |
---|
1714 | #else |
---|
1715 | othercase = NOTACHAR; |
---|
1716 | #endif |
---|
1717 | |
---|
1718 | if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
---|
1719 | } |
---|
1720 | } |
---|
1721 | else |
---|
1722 | #endif /* SUPPORT_UTF8 */ |
---|
1723 | |
---|
1724 | /* Non-UTF-8 mode */ |
---|
1725 | { |
---|
1726 | if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } |
---|
1727 | } |
---|
1728 | break; |
---|
1729 | |
---|
1730 | |
---|
1731 | #ifdef SUPPORT_UCP |
---|
1732 | /*-----------------------------------------------------------------*/ |
---|
1733 | /* This is a tricky one because it can match more than one character. |
---|
1734 | Find out how many characters to skip, and then set up a negative state |
---|
1735 | to wait for them to pass before continuing. */ |
---|
1736 | |
---|
1737 | case OP_EXTUNI: |
---|
1738 | if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
---|
1739 | { |
---|
1740 | const uschar *nptr = ptr + clen; |
---|
1741 | int ncount = 0; |
---|
1742 | while (nptr < end_subject) |
---|
1743 | { |
---|
1744 | int nclen = 1; |
---|
1745 | GETCHARLEN(c, nptr, nclen); |
---|
1746 | if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; |
---|
1747 | ncount++; |
---|
1748 | nptr += nclen; |
---|
1749 | } |
---|
1750 | ADD_NEW_DATA(-(state_offset + 1), 0, ncount); |
---|
1751 | } |
---|
1752 | break; |
---|
1753 | #endif |
---|
1754 | |
---|
1755 | /*-----------------------------------------------------------------*/ |
---|
1756 | /* This is a tricky one, like EXTUNI, because it too can match more than one |
---|
1757 | character (when CR is followed by LF). In this case, set up a negative |
---|
1758 | state to wait for one character to pass before continuing. */ |
---|
1759 | |
---|
1760 | case OP_ANYNL: |
---|
1761 | if (clen > 0) switch(c) |
---|
1762 | { |
---|
1763 | case 0x000b: |
---|
1764 | case 0x000c: |
---|
1765 | case 0x0085: |
---|
1766 | case 0x2028: |
---|
1767 | case 0x2029: |
---|
1768 | if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; |
---|
1769 | |
---|
1770 | case 0x000a: |
---|
1771 | ADD_NEW(state_offset + 1, 0); |
---|
1772 | break; |
---|
1773 | |
---|
1774 | case 0x000d: |
---|
1775 | if (ptr + 1 < end_subject && ptr[1] == 0x0a) |
---|
1776 | { |
---|
1777 | ADD_NEW_DATA(-(state_offset + 1), 0, 1); |
---|
1778 | } |
---|
1779 | else |
---|
1780 | { |
---|
1781 | ADD_NEW(state_offset + 1, 0); |
---|
1782 | } |
---|
1783 | break; |
---|
1784 | } |
---|
1785 | break; |
---|
1786 | |
---|
1787 | /*-----------------------------------------------------------------*/ |
---|
1788 | case OP_NOT_VSPACE: |
---|
1789 | if (clen > 0) switch(c) |
---|
1790 | { |
---|
1791 | case 0x000a: |
---|
1792 | case 0x000b: |
---|
1793 | case 0x000c: |
---|
1794 | case 0x000d: |
---|
1795 | case 0x0085: |
---|
1796 | case 0x2028: |
---|
1797 | case 0x2029: |
---|
1798 | break; |
---|
1799 | |
---|
1800 | default: |
---|
1801 | ADD_NEW(state_offset + 1, 0); |
---|
1802 | break; |
---|
1803 | } |
---|
1804 | break; |
---|
1805 | |
---|
1806 | /*-----------------------------------------------------------------*/ |
---|
1807 | case OP_VSPACE: |
---|
1808 | if (clen > 0) switch(c) |
---|
1809 | { |
---|
1810 | case 0x000a: |
---|
1811 | case 0x000b: |
---|
1812 | case 0x000c: |
---|
1813 | case 0x000d: |
---|
1814 | case 0x0085: |
---|
1815 | case 0x2028: |
---|
1816 | case 0x2029: |
---|
1817 | ADD_NEW(state_offset + 1, 0); |
---|
1818 | break; |
---|
1819 | |
---|
1820 | default: break; |
---|
1821 | } |
---|
1822 | break; |
---|
1823 | |
---|
1824 | /*-----------------------------------------------------------------*/ |
---|
1825 | case OP_NOT_HSPACE: |
---|
1826 | if (clen > 0) switch(c) |
---|
1827 | { |
---|
1828 | case 0x09: /* HT */ |
---|
1829 | case 0x20: /* SPACE */ |
---|
1830 | case 0xa0: /* NBSP */ |
---|
1831 | case 0x1680: /* OGHAM SPACE MARK */ |
---|
1832 | case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
---|
1833 | case 0x2000: /* EN QUAD */ |
---|
1834 | case 0x2001: /* EM QUAD */ |
---|
1835 | case 0x2002: /* EN SPACE */ |
---|
1836 | case 0x2003: /* EM SPACE */ |
---|
1837 | case 0x2004: /* THREE-PER-EM SPACE */ |
---|
1838 | case 0x2005: /* FOUR-PER-EM SPACE */ |
---|
1839 | case 0x2006: /* SIX-PER-EM SPACE */ |
---|
1840 | case 0x2007: /* FIGURE SPACE */ |
---|
1841 | case 0x2008: /* PUNCTUATION SPACE */ |
---|
1842 | case 0x2009: /* THIN SPACE */ |
---|
1843 | case 0x200A: /* HAIR SPACE */ |
---|
1844 | case 0x202f: /* NARROW NO-BREAK SPACE */ |
---|
1845 | case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
---|
1846 | case 0x3000: /* IDEOGRAPHIC SPACE */ |
---|
1847 | break; |
---|
1848 | |
---|
1849 | default: |
---|
1850 | ADD_NEW(state_offset + 1, 0); |
---|
1851 | break; |
---|
1852 | } |
---|
1853 | break; |
---|
1854 | |
---|
1855 | /*-----------------------------------------------------------------*/ |
---|
1856 | case OP_HSPACE: |
---|
1857 | if (clen > 0) switch(c) |
---|
1858 | { |
---|
1859 | case 0x09: /* HT */ |
---|
1860 | case 0x20: /* SPACE */ |
---|
1861 | case 0xa0: /* NBSP */ |
---|
1862 | case 0x1680: /* OGHAM SPACE MARK */ |
---|
1863 | case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ |
---|
1864 | case 0x2000: /* EN QUAD */ |
---|
1865 | case 0x2001: /* EM QUAD */ |
---|
1866 | case 0x2002: /* EN SPACE */ |
---|
1867 | case 0x2003: /* EM SPACE */ |
---|
1868 | case 0x2004: /* THREE-PER-EM SPACE */ |
---|
1869 | case 0x2005: /* FOUR-PER-EM SPACE */ |
---|
1870 | case 0x2006: /* SIX-PER-EM SPACE */ |
---|
1871 | case 0x2007: /* FIGURE SPACE */ |
---|
1872 | case 0x2008: /* PUNCTUATION SPACE */ |
---|
1873 | case 0x2009: /* THIN SPACE */ |
---|
1874 | case 0x200A: /* HAIR SPACE */ |
---|
1875 | case 0x202f: /* NARROW NO-BREAK SPACE */ |
---|
1876 | case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ |
---|
1877 | case 0x3000: /* IDEOGRAPHIC SPACE */ |
---|
1878 | ADD_NEW(state_offset + 1, 0); |
---|
1879 | break; |
---|
1880 | } |
---|
1881 | break; |
---|
1882 | |
---|
1883 | /*-----------------------------------------------------------------*/ |
---|
1884 | /* Match a negated single character. This is only used for one-byte |
---|
1885 | characters, that is, we know that d < 256. The character we are |
---|
1886 | checking (c) can be multibyte. */ |
---|
1887 | |
---|
1888 | case OP_NOT: |
---|
1889 | if (clen > 0) |
---|
1890 | { |
---|
1891 | unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d; |
---|
1892 | if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } |
---|
1893 | } |
---|
1894 | break; |
---|
1895 | |
---|
1896 | /*-----------------------------------------------------------------*/ |
---|
1897 | case OP_PLUS: |
---|
1898 | case OP_MINPLUS: |
---|
1899 | case OP_POSPLUS: |
---|
1900 | case OP_NOTPLUS: |
---|
1901 | case OP_NOTMINPLUS: |
---|
1902 | case OP_NOTPOSPLUS: |
---|
1903 | count = current_state->count; /* Already matched */ |
---|
1904 | if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } |
---|
1905 | if (clen > 0) |
---|
1906 | { |
---|
1907 | unsigned int otherd = NOTACHAR; |
---|
1908 | if ((ims & PCRE_CASELESS) != 0) |
---|
1909 | { |
---|
1910 | #ifdef SUPPORT_UTF8 |
---|
1911 | if (utf8 && d >= 128) |
---|
1912 | { |
---|
1913 | #ifdef SUPPORT_UCP |
---|
1914 | otherd = _pcre_ucp_othercase(d); |
---|
1915 | #endif /* SUPPORT_UCP */ |
---|
1916 | } |
---|
1917 | else |
---|
1918 | #endif /* SUPPORT_UTF8 */ |
---|
1919 | otherd = fcc[d]; |
---|
1920 | } |
---|
1921 | if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
---|
1922 | { |
---|
1923 | if (count > 0 && |
---|
1924 | (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) |
---|
1925 | { |
---|
1926 | active_count--; /* Remove non-match possibility */ |
---|
1927 | next_active_state--; |
---|
1928 | } |
---|
1929 | count++; |
---|
1930 | ADD_NEW(state_offset, count); |
---|
1931 | } |
---|
1932 | } |
---|
1933 | break; |
---|
1934 | |
---|
1935 | /*-----------------------------------------------------------------*/ |
---|
1936 | case OP_QUERY: |
---|
1937 | case OP_MINQUERY: |
---|
1938 | case OP_POSQUERY: |
---|
1939 | case OP_NOTQUERY: |
---|
1940 | case OP_NOTMINQUERY: |
---|
1941 | case OP_NOTPOSQUERY: |
---|
1942 | ADD_ACTIVE(state_offset + dlen + 1, 0); |
---|
1943 | if (clen > 0) |
---|
1944 | { |
---|
1945 | unsigned int otherd = NOTACHAR; |
---|
1946 | if ((ims & PCRE_CASELESS) != 0) |
---|
1947 | { |
---|
1948 | #ifdef SUPPORT_UTF8 |
---|
1949 | if (utf8 && d >= 128) |
---|
1950 | { |
---|
1951 | #ifdef SUPPORT_UCP |
---|
1952 | otherd = _pcre_ucp_othercase(d); |
---|
1953 | #endif /* SUPPORT_UCP */ |
---|
1954 | } |
---|
1955 | else |
---|
1956 | #endif /* SUPPORT_UTF8 */ |
---|
1957 | otherd = fcc[d]; |
---|
1958 | } |
---|
1959 | if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
---|
1960 | { |
---|
1961 | if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) |
---|
1962 | { |
---|
1963 | active_count--; /* Remove non-match possibility */ |
---|
1964 | next_active_state--; |
---|
1965 | } |
---|
1966 | ADD_NEW(state_offset + dlen + 1, 0); |
---|
1967 | } |
---|
1968 | } |
---|
1969 | break; |
---|
1970 | |
---|
1971 | /*-----------------------------------------------------------------*/ |
---|
1972 | case OP_STAR: |
---|
1973 | case OP_MINSTAR: |
---|
1974 | case OP_POSSTAR: |
---|
1975 | case OP_NOTSTAR: |
---|
1976 | case OP_NOTMINSTAR: |
---|
1977 | case OP_NOTPOSSTAR: |
---|
1978 | ADD_ACTIVE(state_offset + dlen + 1, 0); |
---|
1979 | if (clen > 0) |
---|
1980 | { |
---|
1981 | unsigned int otherd = NOTACHAR; |
---|
1982 | if ((ims & PCRE_CASELESS) != 0) |
---|
1983 | { |
---|
1984 | #ifdef SUPPORT_UTF8 |
---|
1985 | if (utf8 && d >= 128) |
---|
1986 | { |
---|
1987 | #ifdef SUPPORT_UCP |
---|
1988 | otherd = _pcre_ucp_othercase(d); |
---|
1989 | #endif /* SUPPORT_UCP */ |
---|
1990 | } |
---|
1991 | else |
---|
1992 | #endif /* SUPPORT_UTF8 */ |
---|
1993 | otherd = fcc[d]; |
---|
1994 | } |
---|
1995 | if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
---|
1996 | { |
---|
1997 | if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) |
---|
1998 | { |
---|
1999 | active_count--; /* Remove non-match possibility */ |
---|
2000 | next_active_state--; |
---|
2001 | } |
---|
2002 | ADD_NEW(state_offset, 0); |
---|
2003 | } |
---|
2004 | } |
---|
2005 | break; |
---|
2006 | |
---|
2007 | /*-----------------------------------------------------------------*/ |
---|
2008 | case OP_EXACT: |
---|
2009 | case OP_NOTEXACT: |
---|
2010 | count = current_state->count; /* Number already matched */ |
---|
2011 | if (clen > 0) |
---|
2012 | { |
---|
2013 | unsigned int otherd = NOTACHAR; |
---|
2014 | if ((ims & PCRE_CASELESS) != 0) |
---|
2015 | { |
---|
2016 | #ifdef SUPPORT_UTF8 |
---|
2017 | if (utf8 && d >= 128) |
---|
2018 | { |
---|
2019 | #ifdef SUPPORT_UCP |
---|
2020 | otherd = _pcre_ucp_othercase(d); |
---|
2021 | #endif /* SUPPORT_UCP */ |
---|
2022 | } |
---|
2023 | else |
---|
2024 | #endif /* SUPPORT_UTF8 */ |
---|
2025 | otherd = fcc[d]; |
---|
2026 | } |
---|
2027 | if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
---|
2028 | { |
---|
2029 | if (++count >= GET2(code, 1)) |
---|
2030 | { ADD_NEW(state_offset + dlen + 3, 0); } |
---|
2031 | else |
---|
2032 | { ADD_NEW(state_offset, count); } |
---|
2033 | } |
---|
2034 | } |
---|
2035 | break; |
---|
2036 | |
---|
2037 | /*-----------------------------------------------------------------*/ |
---|
2038 | case OP_UPTO: |
---|
2039 | case OP_MINUPTO: |
---|
2040 | case OP_POSUPTO: |
---|
2041 | case OP_NOTUPTO: |
---|
2042 | case OP_NOTMINUPTO: |
---|
2043 | case OP_NOTPOSUPTO: |
---|
2044 | ADD_ACTIVE(state_offset + dlen + 3, 0); |
---|
2045 | count = current_state->count; /* Number already matched */ |
---|
2046 | if (clen > 0) |
---|
2047 | { |
---|
2048 | unsigned int otherd = NOTACHAR; |
---|
2049 | if ((ims & PCRE_CASELESS) != 0) |
---|
2050 | { |
---|
2051 | #ifdef SUPPORT_UTF8 |
---|
2052 | if (utf8 && d >= 128) |
---|
2053 | { |
---|
2054 | #ifdef SUPPORT_UCP |
---|
2055 | otherd = _pcre_ucp_othercase(d); |
---|
2056 | #endif /* SUPPORT_UCP */ |
---|
2057 | } |
---|
2058 | else |
---|
2059 | #endif /* SUPPORT_UTF8 */ |
---|
2060 | otherd = fcc[d]; |
---|
2061 | } |
---|
2062 | if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) |
---|
2063 | { |
---|
2064 | if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) |
---|
2065 | { |
---|
2066 | active_count--; /* Remove non-match possibility */ |
---|
2067 | next_active_state--; |
---|
2068 | } |
---|
2069 | if (++count >= GET2(code, 1)) |
---|
2070 | { ADD_NEW(state_offset + dlen + 3, 0); } |
---|
2071 | else |
---|
2072 | { ADD_NEW(state_offset, count); } |
---|
2073 | } |
---|
2074 | } |
---|
2075 | break; |
---|
2076 | |
---|
2077 | |
---|
2078 | /* ========================================================================== */ |
---|
2079 | /* These are the class-handling opcodes */ |
---|
2080 | |
---|
2081 | case OP_CLASS: |
---|
2082 | case OP_NCLASS: |
---|
2083 | case OP_XCLASS: |
---|
2084 | { |
---|
2085 | BOOL isinclass = FALSE; |
---|
2086 | int next_state_offset; |
---|
2087 | const uschar *ecode; |
---|
2088 | |
---|
2089 | /* For a simple class, there is always just a 32-byte table, and we |
---|
2090 | can set isinclass from it. */ |
---|
2091 | |
---|
2092 | if (codevalue != OP_XCLASS) |
---|
2093 | { |
---|
2094 | ecode = code + 33; |
---|
2095 | if (clen > 0) |
---|
2096 | { |
---|
2097 | isinclass = (c > 255)? (codevalue == OP_NCLASS) : |
---|
2098 | ((code[1 + c/8] & (1 << (c&7))) != 0); |
---|
2099 | } |
---|
2100 | } |
---|
2101 | |
---|
2102 | /* An extended class may have a table or a list of single characters, |
---|
2103 | ranges, or both, and it may be positive or negative. There's a |
---|
2104 | function that sorts all this out. */ |
---|
2105 | |
---|
2106 | else |
---|
2107 | { |
---|
2108 | ecode = code + GET(code, 1); |
---|
2109 | if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); |
---|
2110 | } |
---|
2111 | |
---|
2112 | /* At this point, isinclass is set for all kinds of class, and ecode |
---|
2113 | points to the byte after the end of the class. If there is a |
---|
2114 | quantifier, this is where it will be. */ |
---|
2115 | |
---|
2116 | next_state_offset = ecode - start_code; |
---|
2117 | |
---|
2118 | switch (*ecode) |
---|
2119 | { |
---|
2120 | case OP_CRSTAR: |
---|
2121 | case OP_CRMINSTAR: |
---|
2122 | ADD_ACTIVE(next_state_offset + 1, 0); |
---|
2123 | if (isinclass) { ADD_NEW(state_offset, 0); } |
---|
2124 | break; |
---|
2125 | |
---|
2126 | case OP_CRPLUS: |
---|
2127 | case OP_CRMINPLUS: |
---|
2128 | count = current_state->count; /* Already matched */ |
---|
2129 | if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } |
---|
2130 | if (isinclass) { count++; ADD_NEW(state_offset, count); } |
---|
2131 | break; |
---|
2132 | |
---|
2133 | case OP_CRQUERY: |
---|
2134 | case OP_CRMINQUERY: |
---|
2135 | ADD_ACTIVE(next_state_offset + 1, 0); |
---|
2136 | if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } |
---|
2137 | break; |
---|
2138 | |
---|
2139 | case OP_CRRANGE: |
---|
2140 | case OP_CRMINRANGE: |
---|
2141 | count = current_state->count; /* Already matched */ |
---|
2142 | if (count >= GET2(ecode, 1)) |
---|
2143 | { ADD_ACTIVE(next_state_offset + 5, 0); } |
---|
2144 | if (isinclass) |
---|
2145 | { |
---|
2146 | int max = GET2(ecode, 3); |
---|
2147 | if (++count >= max && max != 0) /* Max 0 => no limit */ |
---|
2148 | { ADD_NEW(next_state_offset + 5, 0); } |
---|
2149 | else |
---|
2150 | { ADD_NEW(state_offset, count); } |
---|
2151 | } |
---|
2152 | break; |
---|
2153 | |
---|
2154 | default: |
---|
2155 | if (isinclass) { ADD_NEW(next_state_offset, 0); } |
---|
2156 | break; |
---|
2157 | } |
---|
2158 | } |
---|
2159 | break; |
---|
2160 | |
---|
2161 | /* ========================================================================== */ |
---|
2162 | /* These are the opcodes for fancy brackets of various kinds. We have |
---|
2163 | to use recursion in order to handle them. The "always failing" assertion |
---|
2164 | (?!) is optimised when compiling to OP_FAIL, so we have to support that, |
---|
2165 | though the other "backtracking verbs" are not supported. */ |
---|
2166 | |
---|
2167 | case OP_FAIL: |
---|
2168 | break; |
---|
2169 | |
---|
2170 | case OP_ASSERT: |
---|
2171 | case OP_ASSERT_NOT: |
---|
2172 | case OP_ASSERTBACK: |
---|
2173 | case OP_ASSERTBACK_NOT: |
---|
2174 | { |
---|
2175 | int rc; |
---|
2176 | int local_offsets[2]; |
---|
2177 | int local_workspace[1000]; |
---|
2178 | const uschar *endasscode = code + GET(code, 1); |
---|
2179 | |
---|
2180 | while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); |
---|
2181 | |
---|
2182 | rc = internal_dfa_exec( |
---|
2183 | md, /* static match data */ |
---|
2184 | code, /* this subexpression's code */ |
---|
2185 | ptr, /* where we currently are */ |
---|
2186 | ptr - start_subject, /* start offset */ |
---|
2187 | local_offsets, /* offset vector */ |
---|
2188 | sizeof(local_offsets)/sizeof(int), /* size of same */ |
---|
2189 | local_workspace, /* workspace vector */ |
---|
2190 | sizeof(local_workspace)/sizeof(int), /* size of same */ |
---|
2191 | ims, /* the current ims flags */ |
---|
2192 | rlevel, /* function recursion level */ |
---|
2193 | recursing); /* pass on regex recursion */ |
---|
2194 | |
---|
2195 | if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) |
---|
2196 | { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } |
---|
2197 | } |
---|
2198 | break; |
---|
2199 | |
---|
2200 | /*-----------------------------------------------------------------*/ |
---|
2201 | case OP_COND: |
---|
2202 | case OP_SCOND: |
---|
2203 | { |
---|
2204 | int local_offsets[1000]; |
---|
2205 | int local_workspace[1000]; |
---|
2206 | int condcode = code[LINK_SIZE+1]; |
---|
2207 | |
---|
2208 | /* Back reference conditions are not supported */ |
---|
2209 | |
---|
2210 | if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; |
---|
2211 | |
---|
2212 | /* The DEFINE condition is always false */ |
---|
2213 | |
---|
2214 | if (condcode == OP_DEF) |
---|
2215 | { |
---|
2216 | ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); |
---|
2217 | } |
---|
2218 | |
---|
2219 | /* The only supported version of OP_RREF is for the value RREF_ANY, |
---|
2220 | which means "test if in any recursion". We can't test for specifically |
---|
2221 | recursed groups. */ |
---|
2222 | |
---|
2223 | else if (condcode == OP_RREF) |
---|
2224 | { |
---|
2225 | int value = GET2(code, LINK_SIZE+2); |
---|
2226 | if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; |
---|
2227 | if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } |
---|
2228 | else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } |
---|
2229 | } |
---|
2230 | |
---|
2231 | /* Otherwise, the condition is an assertion */ |
---|
2232 | |
---|
2233 | else |
---|
2234 | { |
---|
2235 | int rc; |
---|
2236 | const uschar *asscode = code + LINK_SIZE + 1; |
---|
2237 | const uschar *endasscode = asscode + GET(asscode, 1); |
---|
2238 | |
---|
2239 | while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); |
---|
2240 | |
---|
2241 | rc = internal_dfa_exec( |
---|
2242 | md, /* fixed match data */ |
---|
2243 | asscode, /* this subexpression's code */ |
---|
2244 | ptr, /* where we currently are */ |
---|
2245 | ptr - start_subject, /* start offset */ |
---|
2246 | local_offsets, /* offset vector */ |
---|
2247 | sizeof(local_offsets)/sizeof(int), /* size of same */ |
---|
2248 | local_workspace, /* workspace vector */ |
---|
2249 | sizeof(local_workspace)/sizeof(int), /* size of same */ |
---|
2250 | ims, /* the current ims flags */ |
---|
2251 | rlevel, /* function recursion level */ |
---|
2252 | recursing); /* pass on regex recursion */ |
---|
2253 | |
---|
2254 | if ((rc >= 0) == |
---|
2255 | (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) |
---|
2256 | { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } |
---|
2257 | else |
---|
2258 | { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } |
---|
2259 | } |
---|
2260 | } |
---|
2261 | break; |
---|
2262 | |
---|
2263 | /*-----------------------------------------------------------------*/ |
---|
2264 | case OP_RECURSE: |
---|
2265 | { |
---|
2266 | int local_offsets[1000]; |
---|
2267 | int local_workspace[1000]; |
---|
2268 | int rc; |
---|
2269 | |
---|
2270 | DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP, |
---|
2271 | recursing + 1)); |
---|
2272 | |
---|
2273 | rc = internal_dfa_exec( |
---|
2274 | md, /* fixed match data */ |
---|
2275 | start_code + GET(code, 1), /* this subexpression's code */ |
---|
2276 | ptr, /* where we currently are */ |
---|
2277 | ptr - start_subject, /* start offset */ |
---|
2278 | local_offsets, /* offset vector */ |
---|
2279 | sizeof(local_offsets)/sizeof(int), /* size of same */ |
---|
2280 | local_workspace, /* workspace vector */ |
---|
2281 | sizeof(local_workspace)/sizeof(int), /* size of same */ |
---|
2282 | ims, /* the current ims flags */ |
---|
2283 | rlevel, /* function recursion level */ |
---|
2284 | recursing + 1); /* regex recurse level */ |
---|
2285 | |
---|
2286 | DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP, |
---|
2287 | recursing + 1, rc)); |
---|
2288 | |
---|
2289 | /* Ran out of internal offsets */ |
---|
2290 | |
---|
2291 | if (rc == 0) return PCRE_ERROR_DFA_RECURSE; |
---|
2292 | |
---|
2293 | /* For each successful matched substring, set up the next state with a |
---|
2294 | count of characters to skip before trying it. Note that the count is in |
---|
2295 | characters, not bytes. */ |
---|
2296 | |
---|
2297 | if (rc > 0) |
---|
2298 | { |
---|
2299 | for (rc = rc*2 - 2; rc >= 0; rc -= 2) |
---|
2300 | { |
---|
2301 | const uschar *p = start_subject + local_offsets[rc]; |
---|
2302 | const uschar *pp = start_subject + local_offsets[rc+1]; |
---|
2303 | int charcount = local_offsets[rc+1] - local_offsets[rc]; |
---|
2304 | while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; |
---|
2305 | if (charcount > 0) |
---|
2306 | { |
---|
2307 | ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); |
---|
2308 | } |
---|
2309 | else |
---|
2310 | { |
---|
2311 | ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); |
---|
2312 | } |
---|
2313 | } |
---|
2314 | } |
---|
2315 | else if (rc != PCRE_ERROR_NOMATCH) return rc; |
---|
2316 | } |
---|
2317 | break; |
---|
2318 | |
---|
2319 | /*-----------------------------------------------------------------*/ |
---|
2320 | case OP_ONCE: |
---|
2321 | { |
---|
2322 | int local_offsets[2]; |
---|
2323 | int local_workspace[1000]; |
---|
2324 | |
---|
2325 | int rc = internal_dfa_exec( |
---|
2326 | md, /* fixed match data */ |
---|
2327 | code, /* this subexpression's code */ |
---|
2328 | ptr, /* where we currently are */ |
---|
2329 | ptr - start_subject, /* start offset */ |
---|
2330 | local_offsets, /* offset vector */ |
---|
2331 | sizeof(local_offsets)/sizeof(int), /* size of same */ |
---|
2332 | local_workspace, /* workspace vector */ |
---|
2333 | sizeof(local_workspace)/sizeof(int), /* size of same */ |
---|
2334 | ims, /* the current ims flags */ |
---|
2335 | rlevel, /* function recursion level */ |
---|
2336 | recursing); /* pass on regex recursion */ |
---|
2337 | |
---|
2338 | if (rc >= 0) |
---|
2339 | { |
---|
2340 | const uschar *end_subpattern = code; |
---|
2341 | int charcount = local_offsets[1] - local_offsets[0]; |
---|
2342 | int next_state_offset, repeat_state_offset; |
---|
2343 | |
---|
2344 | do { end_subpattern += GET(end_subpattern, 1); } |
---|
2345 | while (*end_subpattern == OP_ALT); |
---|
2346 | next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; |
---|
2347 | |
---|
2348 | /* If the end of this subpattern is KETRMAX or KETRMIN, we must |
---|
2349 | arrange for the repeat state also to be added to the relevant list. |
---|
2350 | Calculate the offset, or set -1 for no repeat. */ |
---|
2351 | |
---|
2352 | repeat_state_offset = (*end_subpattern == OP_KETRMAX || |
---|
2353 | *end_subpattern == OP_KETRMIN)? |
---|
2354 | end_subpattern - start_code - GET(end_subpattern, 1) : -1; |
---|
2355 | |
---|
2356 | /* If we have matched an empty string, add the next state at the |
---|
2357 | current character pointer. This is important so that the duplicate |
---|
2358 | checking kicks in, which is what breaks infinite loops that match an |
---|
2359 | empty string. */ |
---|
2360 | |
---|
2361 | if (charcount == 0) |
---|
2362 | { |
---|
2363 | ADD_ACTIVE(next_state_offset, 0); |
---|
2364 | } |
---|
2365 | |
---|
2366 | /* Optimization: if there are no more active states, and there |
---|
2367 | are no new states yet set up, then skip over the subject string |
---|
2368 | right here, to save looping. Otherwise, set up the new state to swing |
---|
2369 | into action when the end of the substring is reached. */ |
---|
2370 | |
---|
2371 | else if (i + 1 >= active_count && new_count == 0) |
---|
2372 | { |
---|
2373 | ptr += charcount; |
---|
2374 | clen = 0; |
---|
2375 | ADD_NEW(next_state_offset, 0); |
---|
2376 | |
---|
2377 | /* If we are adding a repeat state at the new character position, |
---|
2378 | we must fudge things so that it is the only current state. |
---|
2379 | Otherwise, it might be a duplicate of one we processed before, and |
---|
2380 | that would cause it to be skipped. */ |
---|
2381 | |
---|
2382 | if (repeat_state_offset >= 0) |
---|
2383 | { |
---|
2384 | next_active_state = active_states; |
---|
2385 | active_count = 0; |
---|
2386 | i = -1; |
---|
2387 | ADD_ACTIVE(repeat_state_offset, 0); |
---|
2388 | } |
---|
2389 | } |
---|
2390 | else |
---|
2391 | { |
---|
2392 | const uschar *p = start_subject + local_offsets[0]; |
---|
2393 | const uschar *pp = start_subject + local_offsets[1]; |
---|
2394 | while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; |
---|
2395 | ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); |
---|
2396 | if (repeat_state_offset >= 0) |
---|
2397 | { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } |
---|
2398 | } |
---|
2399 | |
---|
2400 | } |
---|
2401 | else if (rc != PCRE_ERROR_NOMATCH) return rc; |
---|
2402 | } |
---|
2403 | break; |
---|
2404 | |
---|
2405 | |
---|
2406 | /* ========================================================================== */ |
---|
2407 | /* Handle callouts */ |
---|
2408 | |
---|
2409 | case OP_CALLOUT: |
---|
2410 | if (pcre_callout != NULL) |
---|
2411 | { |
---|
2412 | int rrc; |
---|
2413 | pcre_callout_block cb; |
---|
2414 | cb.version = 1; /* Version 1 of the callout block */ |
---|
2415 | cb.callout_number = code[1]; |
---|
2416 | cb.offset_vector = offsets; |
---|
2417 | cb.subject = (PCRE_SPTR)start_subject; |
---|
2418 | cb.subject_length = end_subject - start_subject; |
---|
2419 | cb.start_match = current_subject - start_subject; |
---|
2420 | cb.current_position = ptr - start_subject; |
---|
2421 | cb.pattern_position = GET(code, 2); |
---|
2422 | cb.next_item_length = GET(code, 2 + LINK_SIZE); |
---|
2423 | cb.capture_top = 1; |
---|
2424 | cb.capture_last = -1; |
---|
2425 | cb.callout_data = md->callout_data; |
---|
2426 | if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ |
---|
2427 | if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); } |
---|
2428 | } |
---|
2429 | break; |
---|
2430 | |
---|
2431 | |
---|
2432 | /* ========================================================================== */ |
---|
2433 | default: /* Unsupported opcode */ |
---|
2434 | return PCRE_ERROR_DFA_UITEM; |
---|
2435 | } |
---|
2436 | |
---|
2437 | NEXT_ACTIVE_STATE: continue; |
---|
2438 | |
---|
2439 | } /* End of loop scanning active states */ |
---|
2440 | |
---|
2441 | /* We have finished the processing at the current subject character. If no |
---|
2442 | new states have been set for the next character, we have found all the |
---|
2443 | matches that we are going to find. If we are at the top level and partial |
---|
2444 | matching has been requested, check for appropriate conditions. */ |
---|
2445 | |
---|
2446 | if (new_count <= 0) |
---|
2447 | { |
---|
2448 | if (match_count < 0 && /* No matches found */ |
---|
2449 | rlevel == 1 && /* Top level match function */ |
---|
2450 | (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ |
---|
2451 | ptr >= end_subject && /* Reached end of subject */ |
---|
2452 | ptr > current_subject) /* Matched non-empty string */ |
---|
2453 | { |
---|
2454 | if (offsetcount >= 2) |
---|
2455 | { |
---|
2456 | offsets[0] = current_subject - start_subject; |
---|
2457 | offsets[1] = end_subject - start_subject; |
---|
2458 | } |
---|
2459 | match_count = PCRE_ERROR_PARTIAL; |
---|
2460 | } |
---|
2461 | |
---|
2462 | DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
---|
2463 | "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
---|
2464 | rlevel*2-2, SP)); |
---|
2465 | break; /* In effect, "return", but see the comment below */ |
---|
2466 | } |
---|
2467 | |
---|
2468 | /* One or more states are active for the next character. */ |
---|
2469 | |
---|
2470 | ptr += clen; /* Advance to next subject character */ |
---|
2471 | } /* Loop to move along the subject string */ |
---|
2472 | |
---|
2473 | /* Control gets here from "break" a few lines above. We do it this way because |
---|
2474 | if we use "return" above, we have compiler trouble. Some compilers warn if |
---|
2475 | there's nothing here because they think the function doesn't return a value. On |
---|
2476 | the other hand, if we put a dummy statement here, some more clever compilers |
---|
2477 | complain that it can't be reached. Sigh. */ |
---|
2478 | |
---|
2479 | return match_count; |
---|
2480 | } |
---|
2481 | |
---|
2482 | |
---|
2483 | |
---|
2484 | |
---|
2485 | /************************************************* |
---|
2486 | * Execute a Regular Expression - DFA engine * |
---|
2487 | *************************************************/ |
---|
2488 | |
---|
2489 | /* This external function applies a compiled re to a subject string using a DFA |
---|
2490 | engine. This function calls the internal function multiple times if the pattern |
---|
2491 | is not anchored. |
---|
2492 | |
---|
2493 | Arguments: |
---|
2494 | argument_re points to the compiled expression |
---|
2495 | extra_data points to extra data or is NULL |
---|
2496 | subject points to the subject string |
---|
2497 | length length of subject string (may contain binary zeros) |
---|
2498 | start_offset where to start in the subject string |
---|
2499 | options option bits |
---|
2500 | offsets vector of match offsets |
---|
2501 | offsetcount size of same |
---|
2502 | workspace workspace vector |
---|
2503 | wscount size of same |
---|
2504 | |
---|
2505 | Returns: > 0 => number of match offset pairs placed in offsets |
---|
2506 | = 0 => offsets overflowed; longest matches are present |
---|
2507 | -1 => failed to match |
---|
2508 | < -1 => some kind of unexpected problem |
---|
2509 | */ |
---|
2510 | |
---|
2511 | PCRE_EXP_DEFN int |
---|
2512 | pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
---|
2513 | const char *subject, int length, int start_offset, int options, int *offsets, |
---|
2514 | int offsetcount, int *workspace, int wscount) |
---|
2515 | { |
---|
2516 | real_pcre *re = (real_pcre *)argument_re; |
---|
2517 | dfa_match_data match_block; |
---|
2518 | dfa_match_data *md = &match_block; |
---|
2519 | BOOL utf8, anchored, startline, firstline; |
---|
2520 | const uschar *current_subject, *end_subject, *lcc; |
---|
2521 | |
---|
2522 | pcre_study_data internal_study; |
---|
2523 | const pcre_study_data *study = NULL; |
---|
2524 | real_pcre internal_re; |
---|
2525 | |
---|
2526 | const uschar *req_byte_ptr; |
---|
2527 | const uschar *start_bits = NULL; |
---|
2528 | BOOL first_byte_caseless = FALSE; |
---|
2529 | BOOL req_byte_caseless = FALSE; |
---|
2530 | int first_byte = -1; |
---|
2531 | int req_byte = -1; |
---|
2532 | int req_byte2 = -1; |
---|
2533 | int newline; |
---|
2534 | |
---|
2535 | /* Plausibility checks */ |
---|
2536 | |
---|
2537 | if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
---|
2538 | if (re == NULL || subject == NULL || workspace == NULL || |
---|
2539 | (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
---|
2540 | if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; |
---|
2541 | if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; |
---|
2542 | |
---|
2543 | /* We need to find the pointer to any study data before we test for byte |
---|
2544 | flipping, so we scan the extra_data block first. This may set two fields in the |
---|
2545 | match block, so we must initialize them beforehand. However, the other fields |
---|
2546 | in the match block must not be set until after the byte flipping. */ |
---|
2547 | |
---|
2548 | md->tables = re->tables; |
---|
2549 | md->callout_data = NULL; |
---|
2550 | |
---|
2551 | if (extra_data != NULL) |
---|
2552 | { |
---|
2553 | unsigned int flags = extra_data->flags; |
---|
2554 | if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
---|
2555 | study = (const pcre_study_data *)extra_data->study_data; |
---|
2556 | if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
---|
2557 | if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
---|
2558 | return PCRE_ERROR_DFA_UMLIMIT; |
---|
2559 | if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
---|
2560 | md->callout_data = extra_data->callout_data; |
---|
2561 | if ((flags & PCRE_EXTRA_TABLES) != 0) |
---|
2562 | md->tables = extra_data->tables; |
---|
2563 | } |
---|
2564 | |
---|
2565 | /* Check that the first field in the block is the magic number. If it is not, |
---|
2566 | test for a regex that was compiled on a host of opposite endianness. If this is |
---|
2567 | the case, flipped values are put in internal_re and internal_study if there was |
---|
2568 | study data too. */ |
---|
2569 | |
---|
2570 | if (re->magic_number != MAGIC_NUMBER) |
---|
2571 | { |
---|
2572 | re = _pcre_try_flipped(re, &internal_re, study, &internal_study); |
---|
2573 | if (re == NULL) return PCRE_ERROR_BADMAGIC; |
---|
2574 | if (study != NULL) study = &internal_study; |
---|
2575 | } |
---|
2576 | |
---|
2577 | /* Set some local values */ |
---|
2578 | |
---|
2579 | current_subject = (const unsigned char *)subject + start_offset; |
---|
2580 | end_subject = (const unsigned char *)subject + length; |
---|
2581 | req_byte_ptr = current_subject - 1; |
---|
2582 | |
---|
2583 | #ifdef SUPPORT_UTF8 |
---|
2584 | utf8 = (re->options & PCRE_UTF8) != 0; |
---|
2585 | #else |
---|
2586 | utf8 = FALSE; |
---|
2587 | #endif |
---|
2588 | |
---|
2589 | anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
---|
2590 | (re->options & PCRE_ANCHORED) != 0; |
---|
2591 | |
---|
2592 | /* The remaining fixed data for passing around. */ |
---|
2593 | |
---|
2594 | md->start_code = (const uschar *)argument_re + |
---|
2595 | re->name_table_offset + re->name_count * re->name_entry_size; |
---|
2596 | md->start_subject = (const unsigned char *)subject; |
---|
2597 | md->end_subject = end_subject; |
---|
2598 | md->moptions = options; |
---|
2599 | md->poptions = re->options; |
---|
2600 | |
---|
2601 | /* If the BSR option is not set at match time, copy what was set |
---|
2602 | at compile time. */ |
---|
2603 | |
---|
2604 | if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) |
---|
2605 | { |
---|
2606 | if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) |
---|
2607 | md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); |
---|
2608 | #ifdef BSR_ANYCRLF |
---|
2609 | else md->moptions |= PCRE_BSR_ANYCRLF; |
---|
2610 | #endif |
---|
2611 | } |
---|
2612 | |
---|
2613 | /* Handle different types of newline. The three bits give eight cases. If |
---|
2614 | nothing is set at run time, whatever was used at compile time applies. */ |
---|
2615 | |
---|
2616 | switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & |
---|
2617 | PCRE_NEWLINE_BITS) |
---|
2618 | { |
---|
2619 | case 0: newline = NEWLINE; break; /* Compile-time default */ |
---|
2620 | case PCRE_NEWLINE_CR: newline = '\r'; break; |
---|
2621 | case PCRE_NEWLINE_LF: newline = '\n'; break; |
---|
2622 | case PCRE_NEWLINE_CR+ |
---|
2623 | PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
---|
2624 | case PCRE_NEWLINE_ANY: newline = -1; break; |
---|
2625 | case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
---|
2626 | default: return PCRE_ERROR_BADNEWLINE; |
---|
2627 | } |
---|
2628 | |
---|
2629 | if (newline == -2) |
---|
2630 | { |
---|
2631 | md->nltype = NLTYPE_ANYCRLF; |
---|
2632 | } |
---|
2633 | else if (newline < 0) |
---|
2634 | { |
---|
2635 | md->nltype = NLTYPE_ANY; |
---|
2636 | } |
---|
2637 | else |
---|
2638 | { |
---|
2639 | md->nltype = NLTYPE_FIXED; |
---|
2640 | if (newline > 255) |
---|
2641 | { |
---|
2642 | md->nllen = 2; |
---|
2643 | md->nl[0] = (newline >> 8) & 255; |
---|
2644 | md->nl[1] = newline & 255; |
---|
2645 | } |
---|
2646 | else |
---|
2647 | { |
---|
2648 | md->nllen = 1; |
---|
2649 | md->nl[0] = newline; |
---|
2650 | } |
---|
2651 | } |
---|
2652 | |
---|
2653 | /* Check a UTF-8 string if required. Unfortunately there's no way of passing |
---|
2654 | back the character offset. */ |
---|
2655 | |
---|
2656 | #ifdef SUPPORT_UTF8 |
---|
2657 | if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) |
---|
2658 | { |
---|
2659 | if (_pcre_valid_utf8((uschar *)subject, length) >= 0) |
---|
2660 | return PCRE_ERROR_BADUTF8; |
---|
2661 | if (start_offset > 0 && start_offset < length) |
---|
2662 | { |
---|
2663 | int tb = ((uschar *)subject)[start_offset]; |
---|
2664 | if (tb > 127) |
---|
2665 | { |
---|
2666 | tb &= 0xc0; |
---|
2667 | if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; |
---|
2668 | } |
---|
2669 | } |
---|
2670 | } |
---|
2671 | #endif |
---|
2672 | |
---|
2673 | /* If the exec call supplied NULL for tables, use the inbuilt ones. This |
---|
2674 | is a feature that makes it possible to save compiled regex and re-use them |
---|
2675 | in other programs later. */ |
---|
2676 | |
---|
2677 | if (md->tables == NULL) md->tables = _pcre_default_tables; |
---|
2678 | |
---|
2679 | /* The lower casing table and the "must be at the start of a line" flag are |
---|
2680 | used in a loop when finding where to start. */ |
---|
2681 | |
---|
2682 | lcc = md->tables + lcc_offset; |
---|
2683 | startline = (re->flags & PCRE_STARTLINE) != 0; |
---|
2684 | firstline = (re->options & PCRE_FIRSTLINE) != 0; |
---|
2685 | |
---|
2686 | /* Set up the first character to match, if available. The first_byte value is |
---|
2687 | never set for an anchored regular expression, but the anchoring may be forced |
---|
2688 | at run time, so we have to test for anchoring. The first char may be unset for |
---|
2689 | an unanchored pattern, of course. If there's no first char and the pattern was |
---|
2690 | studied, there may be a bitmap of possible first characters. */ |
---|
2691 | |
---|
2692 | if (!anchored) |
---|
2693 | { |
---|
2694 | if ((re->flags & PCRE_FIRSTSET) != 0) |
---|
2695 | { |
---|
2696 | first_byte = re->first_byte & 255; |
---|
2697 | if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) |
---|
2698 | first_byte = lcc[first_byte]; |
---|
2699 | } |
---|
2700 | else |
---|
2701 | { |
---|
2702 | if (startline && study != NULL && |
---|
2703 | (study->options & PCRE_STUDY_MAPPED) != 0) |
---|
2704 | start_bits = study->start_bits; |
---|
2705 | } |
---|
2706 | } |
---|
2707 | |
---|
2708 | /* For anchored or unanchored matches, there may be a "last known required |
---|
2709 | character" set. */ |
---|
2710 | |
---|
2711 | if ((re->flags & PCRE_REQCHSET) != 0) |
---|
2712 | { |
---|
2713 | req_byte = re->req_byte & 255; |
---|
2714 | req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
---|
2715 | req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
---|
2716 | } |
---|
2717 | |
---|
2718 | /* Call the main matching function, looping for a non-anchored regex after a |
---|
2719 | failed match. Unless restarting, optimize by moving to the first match |
---|
2720 | character if possible, when not anchored. Then unless wanting a partial match, |
---|
2721 | check for a required later character. */ |
---|
2722 | |
---|
2723 | for (;;) |
---|
2724 | { |
---|
2725 | int rc; |
---|
2726 | |
---|
2727 | if ((options & PCRE_DFA_RESTART) == 0) |
---|
2728 | { |
---|
2729 | const uschar *save_end_subject = end_subject; |
---|
2730 | |
---|
2731 | /* Advance to a unique first char if possible. If firstline is TRUE, the |
---|
2732 | start of the match is constrained to the first line of a multiline string. |
---|
2733 | Implement this by temporarily adjusting end_subject so that we stop |
---|
2734 | scanning at a newline. If the match fails at the newline, later code breaks |
---|
2735 | this loop. */ |
---|
2736 | |
---|
2737 | if (firstline) |
---|
2738 | { |
---|
2739 | const uschar *t = current_subject; |
---|
2740 | while (t < md->end_subject && !IS_NEWLINE(t)) t++; |
---|
2741 | end_subject = t; |
---|
2742 | } |
---|
2743 | |
---|
2744 | if (first_byte >= 0) |
---|
2745 | { |
---|
2746 | if (first_byte_caseless) |
---|
2747 | while (current_subject < end_subject && |
---|
2748 | lcc[*current_subject] != first_byte) |
---|
2749 | current_subject++; |
---|
2750 | else |
---|
2751 | while (current_subject < end_subject && *current_subject != first_byte) |
---|
2752 | current_subject++; |
---|
2753 | } |
---|
2754 | |
---|
2755 | /* Or to just after a linebreak for a multiline match if possible */ |
---|
2756 | |
---|
2757 | else if (startline) |
---|
2758 | { |
---|
2759 | if (current_subject > md->start_subject + start_offset) |
---|
2760 | { |
---|
2761 | while (current_subject <= end_subject && !WAS_NEWLINE(current_subject)) |
---|
2762 | current_subject++; |
---|
2763 | |
---|
2764 | /* If we have just passed a CR and the newline option is ANY or |
---|
2765 | ANYCRLF, and we are now at a LF, advance the match position by one more |
---|
2766 | character. */ |
---|
2767 | |
---|
2768 | if (current_subject[-1] == '\r' && |
---|
2769 | (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && |
---|
2770 | current_subject < end_subject && |
---|
2771 | *current_subject == '\n') |
---|
2772 | current_subject++; |
---|
2773 | } |
---|
2774 | } |
---|
2775 | |
---|
2776 | /* Or to a non-unique first char after study */ |
---|
2777 | |
---|
2778 | else if (start_bits != NULL) |
---|
2779 | { |
---|
2780 | while (current_subject < end_subject) |
---|
2781 | { |
---|
2782 | register unsigned int c = *current_subject; |
---|
2783 | if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; |
---|
2784 | else break; |
---|
2785 | } |
---|
2786 | } |
---|
2787 | |
---|
2788 | /* Restore fudged end_subject */ |
---|
2789 | |
---|
2790 | end_subject = save_end_subject; |
---|
2791 | } |
---|
2792 | |
---|
2793 | /* If req_byte is set, we know that that character must appear in the subject |
---|
2794 | for the match to succeed. If the first character is set, req_byte must be |
---|
2795 | later in the subject; otherwise the test starts at the match point. This |
---|
2796 | optimization can save a huge amount of work in patterns with nested unlimited |
---|
2797 | repeats that aren't going to match. Writing separate code for cased/caseless |
---|
2798 | versions makes it go faster, as does using an autoincrement and backing off |
---|
2799 | on a match. |
---|
2800 | |
---|
2801 | HOWEVER: when the subject string is very, very long, searching to its end can |
---|
2802 | take a long time, and give bad performance on quite ordinary patterns. This |
---|
2803 | showed up when somebody was matching /^C/ on a 32-megabyte string... so we |
---|
2804 | don't do this when the string is sufficiently long. |
---|
2805 | |
---|
2806 | ALSO: this processing is disabled when partial matching is requested. |
---|
2807 | */ |
---|
2808 | |
---|
2809 | if (req_byte >= 0 && |
---|
2810 | end_subject - current_subject < REQ_BYTE_MAX && |
---|
2811 | (options & PCRE_PARTIAL) == 0) |
---|
2812 | { |
---|
2813 | register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); |
---|
2814 | |
---|
2815 | /* We don't need to repeat the search if we haven't yet reached the |
---|
2816 | place we found it at last time. */ |
---|
2817 | |
---|
2818 | if (p > req_byte_ptr) |
---|
2819 | { |
---|
2820 | if (req_byte_caseless) |
---|
2821 | { |
---|
2822 | while (p < end_subject) |
---|
2823 | { |
---|
2824 | register int pp = *p++; |
---|
2825 | if (pp == req_byte || pp == req_byte2) { p--; break; } |
---|
2826 | } |
---|
2827 | } |
---|
2828 | else |
---|
2829 | { |
---|
2830 | while (p < end_subject) |
---|
2831 | { |
---|
2832 | if (*p++ == req_byte) { p--; break; } |
---|
2833 | } |
---|
2834 | } |
---|
2835 | |
---|
2836 | /* If we can't find the required character, break the matching loop, |
---|
2837 | which will cause a return or PCRE_ERROR_NOMATCH. */ |
---|
2838 | |
---|
2839 | if (p >= end_subject) break; |
---|
2840 | |
---|
2841 | /* If we have found the required character, save the point where we |
---|
2842 | found it, so that we don't search again next time round the loop if |
---|
2843 | the start hasn't passed this character yet. */ |
---|
2844 | |
---|
2845 | req_byte_ptr = p; |
---|
2846 | } |
---|
2847 | } |
---|
2848 | |
---|
2849 | /* OK, now we can do the business */ |
---|
2850 | |
---|
2851 | rc = internal_dfa_exec( |
---|
2852 | md, /* fixed match data */ |
---|
2853 | md->start_code, /* this subexpression's code */ |
---|
2854 | current_subject, /* where we currently are */ |
---|
2855 | start_offset, /* start offset in subject */ |
---|
2856 | offsets, /* offset vector */ |
---|
2857 | offsetcount, /* size of same */ |
---|
2858 | workspace, /* workspace vector */ |
---|
2859 | wscount, /* size of same */ |
---|
2860 | re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
---|
2861 | 0, /* function recurse level */ |
---|
2862 | 0); /* regex recurse level */ |
---|
2863 | |
---|
2864 | /* Anything other than "no match" means we are done, always; otherwise, carry |
---|
2865 | on only if not anchored. */ |
---|
2866 | |
---|
2867 | if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; |
---|
2868 | |
---|
2869 | /* Advance to the next subject character unless we are at the end of a line |
---|
2870 | and firstline is set. */ |
---|
2871 | |
---|
2872 | if (firstline && IS_NEWLINE(current_subject)) break; |
---|
2873 | current_subject++; |
---|
2874 | if (utf8) |
---|
2875 | { |
---|
2876 | while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
---|
2877 | current_subject++; |
---|
2878 | } |
---|
2879 | if (current_subject > end_subject) break; |
---|
2880 | |
---|
2881 | /* If we have just passed a CR and we are now at a LF, and the pattern does |
---|
2882 | not contain any explicit matches for \r or \n, and the newline option is CRLF |
---|
2883 | or ANY or ANYCRLF, advance the match position by one more character. */ |
---|
2884 | |
---|
2885 | if (current_subject[-1] == '\r' && |
---|
2886 | current_subject < end_subject && |
---|
2887 | *current_subject == '\n' && |
---|
2888 | (re->flags & PCRE_HASCRORLF) == 0 && |
---|
2889 | (md->nltype == NLTYPE_ANY || |
---|
2890 | md->nltype == NLTYPE_ANYCRLF || |
---|
2891 | md->nllen == 2)) |
---|
2892 | current_subject++; |
---|
2893 | |
---|
2894 | } /* "Bumpalong" loop */ |
---|
2895 | |
---|
2896 | return PCRE_ERROR_NOMATCH; |
---|
2897 | } |
---|
2898 | |
---|
2899 | /* End of pcre_dfa_exec.c */ |
---|