Changeset 9133 in project for chicken/trunk/pcre/pcre_compile.c


Ignore:
Timestamp:
02/29/08 18:18:10 (12 years ago)
Author:
Kon Lovett
Message:

PCRE 7.6

File:
1 edited

Legend:

Unmodified
Added
Removed
  • chicken/trunk/pcre/pcre_compile.c

    r6175 r9133  
    77
    88                       Written by Philip Hazel
    9            Copyright (c) 1997-2007 University of Cambridge
     9           Copyright (c) 1997-2008 University of Cambridge
    1010
    1111-----------------------------------------------------------------------------
     
    242242  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
    243243  "internal error: unexpected repeat\0"
    244   "unrecognized character after (?\0"
     244  "unrecognized character after (? or (?-\0"
    245245  "POSIX named classes are supported only within a class\0"
    246246  "missing )\0"
     
    301301  /* 60 */
    302302  "(*VERB) not recognized\0"
    303   "number is too big";
     303  "number is too big\0"
     304  "subpattern name expected\0"
     305  "digit expected after (?+";
    304306
    305307
     
    497499if (c == 0) *errorcodeptr = ERR1;
    498500
    499 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
    500 a table. A non-zero result is something that can be returned immediately.
     501/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
     502in a table. A non-zero result is something that can be returned immediately.
    501503Otherwise further processing may be required. */
    502504
    503505#ifndef EBCDIC  /* ASCII coding */
    504 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
     506else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
    505507else if ((i = escapes[c - '0']) != 0) c = i;
    506508
    507509#else           /* EBCDIC coding */
    508 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
     510else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
    509511else if ((i = escapes[c - 0x48]) != 0)  c = i;
    510512#endif
     
    723725
    724726    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    725     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
    726     for Perl compatibility, it is a literal. This code looks a bit odd, but
    727     there used to be some cases other than the default, and there may be again
    728     in future, so I haven't "optimized" it. */
     727    other alphanumeric following \ is an error if PCRE_EXTRA was set;
     728    otherwise, for Perl compatibility, it is a literal. This code looks a bit
     729    odd, but there used to be some cases other than the default, and there may
     730    be again in future, so I haven't "optimized" it. */
    729731
    730732    default:
     
    15071509below and from compile_branch() when checking for an unlimited repeat of a
    15081510group that can match nothing. Note that first_significant_code() skips over
    1509 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
    1510 struck an inner bracket whose current branch will already have been scanned.
     1511backward and negative forward assertions when its final argument is TRUE. If we
     1512hit an unclosed bracket, we return "empty" - this means we've struck an inner
     1513bracket whose current branch will already have been scanned.
    15111514
    15121515Arguments:
     
    15291532
    15301533  c = *code;
     1534
     1535  /* Skip over forward assertions; the other assertions are skipped by
     1536  first_significant_code() with a TRUE final argument. */
     1537
     1538  if (c == OP_ASSERT)
     1539    {
     1540    do code += GET(code, 1); while (*code == OP_ALT);
     1541    c = *code;
     1542    continue;
     1543    }
    15311544
    15321545  /* Groups with zero repeats can of course be empty; skip them. */
     
    17251738
    17261739/* This function is called when the sequence "[:" or "[." or "[=" is
    1727 encountered in a character class. It checks whether this is followed by an
    1728 optional ^ and then a sequence of letters, terminated by a matching ":]" or
    1729 ".]" or "=]".
    1730 
    1731 Argument:
     1740encountered in a character class. It checks whether this is followed by a
     1741sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
     1742reach an unescaped ']' without the special preceding character, return FALSE.
     1743
     1744Originally, this function only recognized a sequence of letters between the
     1745terminators, but it seems that Perl recognizes any sequence of characters,
     1746though of course unknown POSIX names are subsequently rejected. Perl gives an
     1747"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
     1748didn't consider this to be a POSIX class. Likewise for [:1234:].
     1749
     1750The problem in trying to be exactly like Perl is in the handling of escapes. We
     1751have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
     1752class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
     1753below handles the special case of \], but does not try to do any other escape
     1754processing. This makes it different from Perl for cases such as [:l\ower:]
     1755where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
     1756"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
     1757I think.
     1758
     1759Arguments:
    17321760  ptr      pointer to the initial [
    17331761  endptr   where to return the end pointer
    1734   cd       pointer to compile data
    17351762
    17361763Returns:   TRUE or FALSE
     
    17381765
    17391766static BOOL
    1740 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
     1767check_posix_syntax(const uschar *ptr, const uschar **endptr)
    17411768{
    17421769int terminator;          /* Don't combine these lines; the Solaris cc */
    17431770terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
    1744 if (*(++ptr) == '^') ptr++;
    1745 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
    1746 if (*ptr == terminator && ptr[1] == ']')
     1771for (++ptr; *ptr != 0; ptr++)
    17471772  {
    1748   *endptr = ptr;
    1749   return TRUE;
     1773  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
     1774    {
     1775    if (*ptr == ']') return FALSE;
     1776    if (*ptr == terminator && ptr[1] == ']')
     1777      {
     1778      *endptr = ptr;
     1779      return TRUE;
     1780      }
     1781    }
    17501782  }
    17511783return FALSE;
     
    23452377BOOL utf8 = (options & PCRE_UTF8) != 0;
    23462378uschar *class_utf8data;
     2379uschar *class_utf8data_base;
    23472380uschar utf8_char[6];
    23482381#else
     
    23842417  {
    23852418  BOOL negate_class;
     2419  BOOL should_flip_negation;
    23862420  BOOL possessive_quantifier;
    23872421  BOOL is_quantifier;
     
    26072641
    26082642    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
    2609         check_posix_syntax(ptr, &tempptr, cd))
     2643        check_posix_syntax(ptr, &tempptr))
    26102644      {
    26112645      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
     
    26322666      }
    26332667
     2668    /* If a class contains a negative special such as \S, we need to flip the
     2669    negation flag at the end, so that support for characters > 255 works
     2670    correctly (they are all included in the class). */
     2671
     2672    should_flip_negation = FALSE;
     2673
    26342674    /* Keep a count of chars with values < 256 so that we can optimize the case
    26352675    of just a single character (as long as it's < 256). However, for higher
     
    26492689    class_utf8 = FALSE;                       /* No chars >= 256 */
    26502690    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
     2691    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
    26512692#endif
    26522693
     
    26642705        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
    26652706        }
     2707
     2708      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
     2709      data and reset the pointer. This is so that very large classes that
     2710      contain a zillion UTF-8 characters no longer overwrite the work space
     2711      (which is on the stack). */
     2712
     2713      if (lengthptr != NULL)
     2714        {
     2715        *lengthptr += class_utf8data - class_utf8data_base;
     2716        class_utf8data = class_utf8data_base;
     2717        }
     2718
    26662719#endif
    26672720
     
    26872740      if (c == '[' &&
    26882741          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
    2689           check_posix_syntax(ptr, &tempptr, cd))
     2742          check_posix_syntax(ptr, &tempptr))
    26902743        {
    26912744        BOOL local_negate = FALSE;
     
    27042757          {
    27052758          local_negate = TRUE;
     2759          should_flip_negation = TRUE;  /* Note negative special */
    27062760          ptr++;
    27072761          }
     
    27782832        if (*errorcodeptr != 0) goto FAILED;
    27792833
    2780         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
     2834        if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
    27812835        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
    27822836        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
     
    28062860
    28072861            case ESC_D:
     2862            should_flip_negation = TRUE;
    28082863            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
    28092864            continue;
     
    28142869
    28152870            case ESC_W:
     2871            should_flip_negation = TRUE;
    28162872            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
    28172873            continue;
     
    28232879
    28242880            case ESC_S:
     2881            should_flip_negation = TRUE;
    28252882            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
    28262883            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
    2827             continue;
    2828 
    2829             case ESC_E: /* Perl ignores an orphan \E */
    28302884            continue;
    28312885
     
    30643118          if (*errorcodeptr != 0) goto FAILED;
    30653119
    3066           /* \b is backslash; \X is literal X; \R is literal R; any other
     3120          /* \b is backspace; \X is literal X; \R is literal R; any other
    30673121          special means the '-' was literal */
    30683122
     
    33283382
    33293383    /* If there are characters with values > 255, we have to compile an
    3330     extended class, with its own opcode. If there are no characters < 256,
    3331     we can omit the bitmap in the actual compiled code. */
     3384    extended class, with its own opcode, unless there was a negated special
     3385    such as \S in the class, because in that case all characters > 255 are in
     3386    the class, so any that were explicitly given as well can be ignored. If
     3387    (when there are explicit characters > 255 that must be listed) there are no
     3388    characters < 256, we can omit the bitmap in the actual compiled code. */
    33323389
    33333390#ifdef SUPPORT_UTF8
    3334     if (class_utf8)
     3391    if (class_utf8 && !should_flip_negation)
    33353392      {
    33363393      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
     
    33583415#endif
    33593416
    3360     /* If there are no characters > 255, negate the 32-byte map if necessary,
    3361     and copy it into the code vector. If this is the first thing in the branch,
    3362     there can be no first char setting, whatever the repeat count. Any reqbyte
    3363     setting must remain unchanged after any kind of repeat. */
    3364 
     3417    /* If there are no characters > 255, set the opcode to OP_CLASS or
     3418    OP_NCLASS, depending on whether the whole class was negated and whether
     3419    there were negative specials such as \S in the class. Then copy the 32-byte
     3420    map into the code vector, negating it if necessary. */
     3421
     3422    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
    33653423    if (negate_class)
    33663424      {
    3367       *code++ = OP_NCLASS;
    33683425      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
    33693426        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
     
    33713428    else
    33723429      {
    3373       *code++ = OP_CLASS;
    33743430      memcpy(code, classbits, 32);
    33753431      }
     
    40074063      if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
    40084064          *tempcode == OP_NOTEXACT)
    4009         tempcode += _pcre_OP_lengths[*tempcode];
     4065        tempcode += _pcre_OP_lengths[*tempcode] +
     4066          ((*tempcode == OP_TYPEEXACT &&
     4067             (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
    40104068      len = code - tempcode;
    40114069      if (len > 0) switch (*tempcode)
     
    42344292            goto FAILED;
    42354293            }
    4236           if (refsign == '-')
     4294          recno = (refsign == '-')?
     4295            cd->bracount - recno + 1 : recno +cd->bracount;
     4296          if (recno <= 0 || recno > cd->final_bracount)
    42374297            {
    4238             recno = cd->bracount - recno + 1;
    4239             if (recno <= 0)
    4240               {
    4241               *errorcodeptr = ERR15;
    4242               goto FAILED;
    4243               }
     4298            *errorcodeptr = ERR15;
     4299            goto FAILED;
    42444300            }
    4245           else recno += cd->bracount;
    42464301          PUT2(code, 2+LINK_SIZE, recno);
    42474302          break;
     
    43154370          }
    43164371
    4317         /* Check for the "name" actually being a subpattern number. */
    4318 
    4319         else if (recno > 0)
     4372        /* Check for the "name" actually being a subpattern number. We are
     4373        in the second pass here, so final_bracount is set. */
     4374
     4375        else if (recno > 0 && recno <= cd->final_bracount)
    43204376          {
    43214377          PUT2(code, 2+LINK_SIZE, recno);
     
    45114567        /* We come here from the Python syntax above that handles both
    45124568        references (?P=name) and recursion (?P>name), as well as falling
    4513         through from the Perl recursion syntax (?&name). */
     4569        through from the Perl recursion syntax (?&name). We also come here from
     4570        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
     4571        .NET syntax. */
    45144572
    45154573        NAMED_REF_OR_RECURSE:
     
    45234581        if (lengthptr != NULL)
    45244582          {
     4583          if (namelen == 0)
     4584            {
     4585            *errorcodeptr = ERR62;
     4586            goto FAILED;
     4587            }
    45254588          if (*ptr != terminator)
    45264589            {
     
    45364599          }
    45374600
    4538         /* In the real compile, seek the name in the table */
     4601        /* In the real compile, seek the name in the table. We check the name
     4602        first, and then check that we have reached the end of the name in the
     4603        table. That way, if the name is longer than any in the table,
     4604        the comparison will fail without reading beyond the table entry. */
    45394605
    45404606        else
     
    45434609          for (i = 0; i < cd->names_found; i++)
    45444610            {
    4545             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
     4611            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
     4612                slot[2+namelen] == 0)
     4613              break;
    45464614            slot += cd->name_entry_size;
    45474615            }
     
    45804648          const uschar *called;
    45814649
    4582           if ((refsign = *ptr) == '+') ptr++;
     4650          if ((refsign = *ptr) == '+')
     4651            {
     4652            ptr++;
     4653            if ((digitab[*ptr] & ctype_digit) == 0)
     4654              {
     4655              *errorcodeptr = ERR63;
     4656              goto FAILED;
     4657              }
     4658            }
    45834659          else if (refsign == '-')
    45844660            {
     
    57465822uschar cworkspace[COMPILE_WORK_SIZE];
    57475823
    5748 
    57495824/* Set this early so that early errors get offset 0. */
    57505825
     
    59075982is a test for its doing so. */
    59085983
    5909 cd->bracount = 0;
     5984cd->bracount = cd->final_bracount = 0;
    59105985cd->names_found = 0;
    59115986cd->name_entry_size = 0;
     
    59846059*/
    59856060
     6061cd->final_bracount = cd->bracount;  /* Save for checking forward references */
    59866062cd->bracount = 0;
    59876063cd->names_found = 0;
Note: See TracChangeset for help on using the changeset viewer.