1 | /************************************************* |
---|
2 | * Perl-Compatible Regular Expressions * |
---|
3 | *************************************************/ |
---|
4 | |
---|
5 | /* PCRE is a library of functions to support regular expressions whose syntax |
---|
6 | and semantics are as close as possible to those of the Perl 5 language. |
---|
7 | |
---|
8 | Written by Philip Hazel |
---|
9 | Copyright (c) 1997-2008 University of Cambridge |
---|
10 | |
---|
11 | ----------------------------------------------------------------------------- |
---|
12 | Redistribution and use in source and binary forms, with or without |
---|
13 | modification, are permitted provided that the following conditions are met: |
---|
14 | |
---|
15 | * Redistributions of source code must retain the above copyright notice, |
---|
16 | this list of conditions and the following disclaimer. |
---|
17 | |
---|
18 | * Redistributions in binary form must reproduce the above copyright |
---|
19 | notice, this list of conditions and the following disclaimer in the |
---|
20 | documentation and/or other materials provided with the distribution. |
---|
21 | |
---|
22 | * Neither the name of the University of Cambridge nor the names of its |
---|
23 | contributors may be used to endorse or promote products derived from |
---|
24 | this software without specific prior written permission. |
---|
25 | |
---|
26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
---|
30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
---|
36 | POSSIBILITY OF SUCH DAMAGE. |
---|
37 | ----------------------------------------------------------------------------- |
---|
38 | */ |
---|
39 | |
---|
40 | |
---|
41 | /* This module contains some fixed tables that are used by more than one of the |
---|
42 | PCRE code modules. The tables are also #included by the pcretest program, which |
---|
43 | uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name |
---|
44 | clashes with the library. */ |
---|
45 | |
---|
46 | |
---|
47 | #ifdef HAVE_CONFIG_H |
---|
48 | #include "config.h" |
---|
49 | #endif |
---|
50 | |
---|
51 | #include "pcre_internal.h" |
---|
52 | |
---|
53 | |
---|
54 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that |
---|
55 | the definition is next to the definition of the opcodes in pcre_internal.h. */ |
---|
56 | |
---|
57 | const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; |
---|
58 | |
---|
59 | |
---|
60 | |
---|
61 | /************************************************* |
---|
62 | * Tables for UTF-8 support * |
---|
63 | *************************************************/ |
---|
64 | |
---|
65 | /* These are the breakpoints for different numbers of bytes in a UTF-8 |
---|
66 | character. */ |
---|
67 | |
---|
68 | #ifdef SUPPORT_UTF8 |
---|
69 | |
---|
/* Breakpoint values, one per UTF-8 character length: entry i is the largest
value that fits in a character of i+1 bytes. */

const int _pcre_utf8_table1[] = {
  0x7f,         /* 1 byte  */
  0x7ff,        /* 2 bytes */
  0xffff,       /* 3 bytes */
  0x1fffff,     /* 4 bytes */
  0x3ffffff,    /* 5 bytes */
  0x7fffffff    /* 6 bytes */
};
72 | |
---|
73 | const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int); |
---|
74 | |
---|
/* Two tables for the first byte of a UTF-8 character, each indexed by the
number of additional bytes: _pcre_utf8_table2 gives the indicator bits to set,
and _pcre_utf8_table3 gives the mask for the data bits. */

const int _pcre_utf8_table2[] = {
  0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};

const int _pcre_utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01
};
80 | |
---|
81 | /* Table of the number of extra bytes, indexed by the first byte masked with |
---|
82 | 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ |
---|
83 | |
---|
84 | const uschar _pcre_utf8_table4[] = { |
---|
85 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
---|
86 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
---|
87 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
---|
88 | 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; |
---|
89 | |
---|
/* The _pcre_utt[] table translates Unicode property names into type and code
values. It is searched by binary chop, so the names must be kept in collating
sequence. Originally, each table entry held a pointer to its name string, but
that causes a large number of relocations when a shared library is dynamically
loaded. The relocations are greatly reduced by packing all the names into one
large string, each name terminated by a zero byte, and storing offsets into
that string in the table instead. Maintenance is more error-prone this way, but
frequent changes to this data are unlikely.

The names are laid out below grouped by initial letter; the comment on each
group gives the offset of its first name within the string. */

const char _pcre_utt_names[] =
  /* A: 0 */
  "Any\0" "Arabic\0" "Armenian\0"
  /* B: 20 */
  "Balinese\0" "Bengali\0" "Bopomofo\0" "Braille\0" "Buginese\0" "Buhid\0"
  /* C: 69 */
  "C\0" "Canadian_Aboriginal\0" "Cc\0" "Cf\0" "Cherokee\0" "Cn\0" "Co\0"
  "Common\0" "Coptic\0" "Cs\0" "Cuneiform\0" "Cypriot\0" "Cyrillic\0"
  /* D: 156 */
  "Deseret\0" "Devanagari\0"
  /* E: 175 */
  "Ethiopic\0"
  /* G: 184 */
  "Georgian\0" "Glagolitic\0" "Gothic\0" "Greek\0" "Gujarati\0" "Gurmukhi\0"
  /* H: 235 */
  "Han\0" "Hangul\0" "Hanunoo\0" "Hebrew\0" "Hiragana\0"
  /* I: 270 */
  "Inherited\0"
  /* K: 280 */
  "Kannada\0" "Katakana\0" "Kharoshthi\0" "Khmer\0"
  /* L: 314 */
  "L\0" "L&\0" "Lao\0" "Latin\0" "Limbu\0" "Linear_B\0"
  "Ll\0" "Lm\0" "Lo\0" "Lt\0" "Lu\0"
  /* M: 359 */
  "M\0" "Malayalam\0" "Mc\0" "Me\0" "Mn\0" "Mongolian\0" "Myanmar\0"
  /* N: 398 */
  "N\0" "Nd\0" "New_Tai_Lue\0" "Nko\0" "Nl\0" "No\0"
  /* O: 425 */
  "Ogham\0" "Old_Italic\0" "Old_Persian\0" "Oriya\0" "Osmanya\0"
  /* P: 468 */
  "P\0" "Pc\0" "Pd\0" "Pe\0" "Pf\0" "Phags_Pa\0" "Phoenician\0"
  "Pi\0" "Po\0" "Ps\0"
  /* R: 511 */
  "Runic\0"
  /* S: 517 */
  "S\0" "Sc\0" "Shavian\0" "Sinhala\0" "Sk\0" "Sm\0" "So\0"
  "Syloti_Nagri\0" "Syriac\0"
  /* T: 567 */
  "Tagalog\0" "Tagbanwa\0" "Tai_Le\0" "Tamil\0" "Telugu\0" "Thaana\0"
  "Thai\0" "Tibetan\0" "Tifinagh\0"
  /* U: 633 */
  "Ugaritic\0"
  /* Y: 642 */
  "Yi\0"
  /* Z: 645 */
  "Z\0" "Zl\0" "Zp\0" "Zs\0";
205 | |
---|
206 | const ucp_type_table _pcre_utt[] = { |
---|
207 | { 0, PT_ANY, 0 }, |
---|
208 | { 4, PT_SC, ucp_Arabic }, |
---|
209 | { 11, PT_SC, ucp_Armenian }, |
---|
210 | { 20, PT_SC, ucp_Balinese }, |
---|
211 | { 29, PT_SC, ucp_Bengali }, |
---|
212 | { 37, PT_SC, ucp_Bopomofo }, |
---|
213 | { 46, PT_SC, ucp_Braille }, |
---|
214 | { 54, PT_SC, ucp_Buginese }, |
---|
215 | { 63, PT_SC, ucp_Buhid }, |
---|
216 | { 69, PT_GC, ucp_C }, |
---|
217 | { 71, PT_SC, ucp_Canadian_Aboriginal }, |
---|
218 | { 91, PT_PC, ucp_Cc }, |
---|
219 | { 94, PT_PC, ucp_Cf }, |
---|
220 | { 97, PT_SC, ucp_Cherokee }, |
---|
221 | { 106, PT_PC, ucp_Cn }, |
---|
222 | { 109, PT_PC, ucp_Co }, |
---|
223 | { 112, PT_SC, ucp_Common }, |
---|
224 | { 119, PT_SC, ucp_Coptic }, |
---|
225 | { 126, PT_PC, ucp_Cs }, |
---|
226 | { 129, PT_SC, ucp_Cuneiform }, |
---|
227 | { 139, PT_SC, ucp_Cypriot }, |
---|
228 | { 147, PT_SC, ucp_Cyrillic }, |
---|
229 | { 156, PT_SC, ucp_Deseret }, |
---|
230 | { 164, PT_SC, ucp_Devanagari }, |
---|
231 | { 175, PT_SC, ucp_Ethiopic }, |
---|
232 | { 184, PT_SC, ucp_Georgian }, |
---|
233 | { 193, PT_SC, ucp_Glagolitic }, |
---|
234 | { 204, PT_SC, ucp_Gothic }, |
---|
235 | { 211, PT_SC, ucp_Greek }, |
---|
236 | { 217, PT_SC, ucp_Gujarati }, |
---|
237 | { 226, PT_SC, ucp_Gurmukhi }, |
---|
238 | { 235, PT_SC, ucp_Han }, |
---|
239 | { 239, PT_SC, ucp_Hangul }, |
---|
240 | { 246, PT_SC, ucp_Hanunoo }, |
---|
241 | { 254, PT_SC, ucp_Hebrew }, |
---|
242 | { 261, PT_SC, ucp_Hiragana }, |
---|
243 | { 270, PT_SC, ucp_Inherited }, |
---|
244 | { 280, PT_SC, ucp_Kannada }, |
---|
245 | { 288, PT_SC, ucp_Katakana }, |
---|
246 | { 297, PT_SC, ucp_Kharoshthi }, |
---|
247 | { 308, PT_SC, ucp_Khmer }, |
---|
248 | { 314, PT_GC, ucp_L }, |
---|
249 | { 316, PT_LAMP, 0 }, |
---|
250 | { 319, PT_SC, ucp_Lao }, |
---|
251 | { 323, PT_SC, ucp_Latin }, |
---|
252 | { 329, PT_SC, ucp_Limbu }, |
---|
253 | { 335, PT_SC, ucp_Linear_B }, |
---|
254 | { 344, PT_PC, ucp_Ll }, |
---|
255 | { 347, PT_PC, ucp_Lm }, |
---|
256 | { 350, PT_PC, ucp_Lo }, |
---|
257 | { 353, PT_PC, ucp_Lt }, |
---|
258 | { 356, PT_PC, ucp_Lu }, |
---|
259 | { 359, PT_GC, ucp_M }, |
---|
260 | { 361, PT_SC, ucp_Malayalam }, |
---|
261 | { 371, PT_PC, ucp_Mc }, |
---|
262 | { 374, PT_PC, ucp_Me }, |
---|
263 | { 377, PT_PC, ucp_Mn }, |
---|
264 | { 380, PT_SC, ucp_Mongolian }, |
---|
265 | { 390, PT_SC, ucp_Myanmar }, |
---|
266 | { 398, PT_GC, ucp_N }, |
---|
267 | { 400, PT_PC, ucp_Nd }, |
---|
268 | { 403, PT_SC, ucp_New_Tai_Lue }, |
---|
269 | { 415, PT_SC, ucp_Nko }, |
---|
270 | { 419, PT_PC, ucp_Nl }, |
---|
271 | { 422, PT_PC, ucp_No }, |
---|
272 | { 425, PT_SC, ucp_Ogham }, |
---|
273 | { 431, PT_SC, ucp_Old_Italic }, |
---|
274 | { 442, PT_SC, ucp_Old_Persian }, |
---|
275 | { 454, PT_SC, ucp_Oriya }, |
---|
276 | { 460, PT_SC, ucp_Osmanya }, |
---|
277 | { 468, PT_GC, ucp_P }, |
---|
278 | { 470, PT_PC, ucp_Pc }, |
---|
279 | { 473, PT_PC, ucp_Pd }, |
---|
280 | { 476, PT_PC, ucp_Pe }, |
---|
281 | { 479, PT_PC, ucp_Pf }, |
---|
282 | { 482, PT_SC, ucp_Phags_Pa }, |
---|
283 | { 491, PT_SC, ucp_Phoenician }, |
---|
284 | { 502, PT_PC, ucp_Pi }, |
---|
285 | { 505, PT_PC, ucp_Po }, |
---|
286 | { 508, PT_PC, ucp_Ps }, |
---|
287 | { 511, PT_SC, ucp_Runic }, |
---|
288 | { 517, PT_GC, ucp_S }, |
---|
289 | { 519, PT_PC, ucp_Sc }, |
---|
290 | { 522, PT_SC, ucp_Shavian }, |
---|
291 | { 530, PT_SC, ucp_Sinhala }, |
---|
292 | { 538, PT_PC, ucp_Sk }, |
---|
293 | { 541, PT_PC, ucp_Sm }, |
---|
294 | { 544, PT_PC, ucp_So }, |
---|
295 | { 547, PT_SC, ucp_Syloti_Nagri }, |
---|
296 | { 560, PT_SC, ucp_Syriac }, |
---|
297 | { 567, PT_SC, ucp_Tagalog }, |
---|
298 | { 575, PT_SC, ucp_Tagbanwa }, |
---|
299 | { 584, PT_SC, ucp_Tai_Le }, |
---|
300 | { 591, PT_SC, ucp_Tamil }, |
---|
301 | { 597, PT_SC, ucp_Telugu }, |
---|
302 | { 604, PT_SC, ucp_Thaana }, |
---|
303 | { 611, PT_SC, ucp_Thai }, |
---|
304 | { 616, PT_SC, ucp_Tibetan }, |
---|
305 | { 624, PT_SC, ucp_Tifinagh }, |
---|
306 | { 633, PT_SC, ucp_Ugaritic }, |
---|
307 | { 642, PT_SC, ucp_Yi }, |
---|
308 | { 645, PT_GC, ucp_Z }, |
---|
309 | { 647, PT_PC, ucp_Zl }, |
---|
310 | { 650, PT_PC, ucp_Zp }, |
---|
311 | { 653, PT_PC, ucp_Zs } |
---|
312 | }; |
---|
313 | |
---|
314 | const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); |
---|
315 | |
---|
316 | #endif /* SUPPORT_UTF8 */ |
---|
317 | |
---|
318 | /* End of pcre_tables.c */ |
---|