1 | /************************************************* |
2 | * Perl-Compatible Regular Expressions * |
3 | *************************************************/ |
4 | |
5 | |
6 | /* This is a library of functions to support regular expressions whose syntax |
7 | and semantics are as close as possible to those of the Perl 5 language. See |
8 | the file doc/Tech.Notes for some information on the internals. |
9 | |
10 | Written by: Philip Hazel <ph10@cam.ac.uk> |
11 | |
12 | Copyright (c) 1997-2004 University of Cambridge |
13 | |
14 | ----------------------------------------------------------------------------- |
15 | Redistribution and use in source and binary forms, with or without |
16 | modification, are permitted provided that the following conditions are met: |
17 | |
18 | * Redistributions of source code must retain the above copyright notice, |
19 | this list of conditions and the following disclaimer. |
20 | |
21 | * Redistributions in binary form must reproduce the above copyright |
22 | notice, this list of conditions and the following disclaimer in the |
23 | documentation and/or other materials provided with the distribution. |
24 | |
25 | * Neither the name of the University of Cambridge nor the names of its |
26 | contributors may be used to endorse or promote products derived from |
27 | this software without specific prior written permission. |
28 | |
29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
39 | POSSIBILITY OF SUCH DAMAGE. |
40 | ----------------------------------------------------------------------------- |
41 | */ |
42 | |
43 | /* This header contains definitions that are shared between the different |
44 | modules, but which are not relevant to the outside. */ |
45 | |
46 | /* Get the definitions provided by running "configure" */ |
47 | |
48 | #include "config.h" |
49 | |
50 | /* Standard C headers plus the external interface definition. The only time |
51 | setjmp and stdarg are used is when NO_RECURSE is set. */ |
52 | |
53 | #include <ctype.h> |
54 | #include <limits.h> |
55 | #include <setjmp.h> |
56 | #include <stdarg.h> |
57 | #include <stddef.h> |
58 | #include <stdio.h> |
59 | #include <stdlib.h> |
60 | #include <string.h> |
61 | |
/* Unless PCRE_SPY is defined, define PCRE_DEFINITION before the public header
is included. */

#ifndef PCRE_SPY
#define PCRE_DEFINITION       /* Win32 __declspec(export) trigger for .dll */
#endif
65 | |
/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
cannot determine these outside the compilation (e.g. by running a program as
part of "configure") because PCRE is often cross-compiled for use on other
systems. Instead we make use of the maximum sizes that are available at
preprocessor time in standard C environments. */

#if USHRT_MAX == 65535
  typedef unsigned short pcre_uint16;
#elif UINT_MAX == 65535
  typedef unsigned int pcre_uint16;
#else
  #error Cannot determine a type for 16-bit unsigned integers
#endif

/* The comparison constant carries a U suffix so that it is an unsigned value
under C90 as well as C99 preprocessor arithmetic rules. */

#if UINT_MAX == 4294967295U
  typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 4294967295U
  typedef unsigned long int pcre_uint32;
#else
  #error Cannot determine a type for 32-bit unsigned integers
#endif
87 | |
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char *, because that
should make things easier for callers. We define a short type for unsigned char
to save lots of typing. I tried "uchar", but it causes problems on Digital
Unix, where it is defined in sys/types, so use "uschar" instead. */

typedef unsigned char uschar;
96 | |
97 | /* Include the public PCRE header */ |
98 | |
99 | #include "pcre.h" |
100 | |
/* When compiling for use with the Virtual Pascal compiler, these functions
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
option on the command line. */

#ifdef VPCOMPAT
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcpy(d,s,n) _memcpy(d,s,n)
#define memmove(d,s,n) _memmove(d,s,n)
#define memset(s,c,n) _memset(s,c,n)
#else  /* VPCOMPAT */

/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). */

#if ! HAVE_MEMMOVE
#undef  memmove        /* some systems may have a macro */
#if HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else  /* HAVE_BCOPY */

/* Emulation of memmove() for systems that have neither memmove() nor bcopy().

Arguments:
  dest        where to copy to
  src         where to copy from
  n           number of bytes to copy

Returns:      the original value of dest

The function is static because this header is included by several modules; a
definition with external linkage would be defined more than once at link time.
The copy direction is chosen so that overlapping regions are handled whichever
way the data is being moved. */

static void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;

if (dest > src)
  {
  /* Moving upwards in store: copy from the top down so that bytes of an
  overlapping source are read before they are overwritten. After n steps
  dest is back at its original value. */
  dest += n;
  src += n;
  for (i = 0; i < n; ++i) *(--dest) = *(--src);
  return (void *)dest;
  }

/* Moving downwards (or not at all): a plain forward copy is safe. */
for (i = 0; i < n; ++i) dest[i] = src[i];
return (void *)dest;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif   /* not HAVE_BCOPY */
#endif   /* not HAVE_MEMMOVE */
#endif   /* not VPCOMPAT */
136 | |
137 | |
138 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
139 | in big-endian order) by default. These are used, for example, to link from the |
140 | start of a subpattern to its alternatives and its end. The use of 2 bytes per |
141 | offset limits the size of the compiled regex to around 64K, which is big enough |
142 | for almost everybody. However, I received a request for an even bigger limit. |
143 | For this reason, and also to make the code easier to maintain, the storing and |
144 | loading of offsets from the byte string is now handled by the macros that are |
145 | defined here. |
146 | |
147 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
148 | the config.h file, but can be overridden by using -D on the command line. This |
149 | is automated on Unix systems via the "configure" command. */ |
150 | |
151 | #if LINK_SIZE == 2 |
152 | |
/* Two-byte links. PUT stores the low 16 bits of d at a[n], a[n+1], most
significant byte first; GET reads them back. Every macro argument and each
complete expansion is parenthesized so that expression arguments (such as
"code + 1") and use inside larger expressions behave as expected. Note that
the arguments are evaluated more than once. */

#define PUT(a,n,d)   \
  (((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)
161 | |
162 | |
163 | #elif LINK_SIZE == 3 |
164 | |
/* Three-byte links, stored most significant byte first. Arguments and the
complete expansions are parenthesized; arguments are evaluated more than
once. */

#define PUT(a,n,d)       \
  (((a)[n] = (d) >> 16),    \
  ((a)[(n)+1] = (d) >> 8),  \
  ((a)[(n)+2] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)
174 | |
175 | |
176 | #elif LINK_SIZE == 4 |
177 | |
/* Four-byte links, stored most significant byte first. Arguments and the
complete expansions are parenthesized; arguments are evaluated more than
once. */

#define PUT(a,n,d)        \
  (((a)[n] = (d) >> 24),     \
  ((a)[(n)+1] = (d) >> 16),  \
  ((a)[(n)+2] = (d) >> 8),   \
  ((a)[(n)+3] = (d) & 255))

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
188 | |
189 | |
190 | #else |
191 | #error LINK_SIZE must be either 2, 3, or 4 |
192 | #endif |
193 | |
194 | |
/* Convenience macro defined in terms of the others: store a link with PUT and
then advance the pointer a by LINK_SIZE bytes. The expansion is a single
parenthesized expression; a must be a modifiable pointer lvalue. */

#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
198 | |
199 | |
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. These are used for repeat counts and for other things such as
capturing parenthesis numbers in back references.

PUT2 is a single parenthesized expression (not two separate statements), so it
is safe as the body of an unbraced "if" or loop. Arguments are evaluated more
than once. */

#define PUT2(a,n,d)   \
  (((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255))

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += 2)
212 | |
213 | |
/* In case there is no definition of offsetof() provided - though any proper
Standard C system should have one. The fallback computes the address of the
field within an object notionally placed at address zero. */

#ifndef offsetof
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
220 | |
221 | |
/* These are the public options that can change during matching. */

#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
225 | |
/* Private options flags start at the most significant end of the four bytes,
but skip the top bit so we can use ints for convenience without getting tangled
with negative values. The public options defined in pcre.h start at the least
significant end. Make sure they don't overlap, though now that we have expanded
to four bytes, there is plenty of space. */

#define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
#define PCRE_REQCHSET      0x20000000  /* req_byte is set */
#define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
#define PCRE_ICHANGED      0x08000000  /* i option changes within regex */
#define PCRE_NOPARTIAL     0x04000000  /* can't use partial with this regex */
237 | |
/* Options for the "extra" block produced by pcre_study(). */

#define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
241 | |
/* Masks for identifying the public options which are permitted at compile
time, run time or study time, respectively. */

#define PUBLIC_OPTIONS \
  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
   PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
   PCRE_PARTIAL)

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
255 | |
/* Magic number to provide a small check against being handed junk. */

#define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
259 | |
/* Negative values for the firstchar and reqchar variables */

#define REQ_UNSET (-2)
#define REQ_NONE  (-1)
264 | |
/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
variable-length repeat, or anything other than literal characters. */

#define REQ_CASELESS 0x0100    /* indicates caselessness */
#define REQ_VARY     0x0200    /* reqbyte followed non-literal item */
270 | |
/* Miscellaneous definitions. FALSE and TRUE are defined only if something
else has not already defined them, to avoid macro redefinition clashes. */

typedef int BOOL;

#ifndef FALSE
#define FALSE   0
#endif

#ifndef TRUE
#define TRUE    1
#endif
277 | |
/* Escape items that are just an encoding of a particular data value. Each one
can be overridden by defining it before this point. Note that ESC_n is defined
in terms of yet another macro, NEWLINE, which is set in config.h to either \n
(the default) or \r (which some people want). */

#ifndef ESC_e
#define ESC_e 27
#endif

#ifndef ESC_f
#define ESC_f '\f'
#endif

#ifndef ESC_n
#define ESC_n NEWLINE
#endif

#ifndef ESC_r
#define ESC_r '\r'
#endif

/* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */

#ifndef ESC_tee
#define ESC_tee '\t'
#endif
304 | |
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */

enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
       ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
       ESC_Q, ESC_REF };
319 | |
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */

#define XCL_NOT    0x01    /* Flag: this is a negative class */
#define XCL_MAP    0x02    /* Flag: a 32-byte map is present */

#define XCL_END       0    /* Marks end of individual items */
#define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
#define XCL_RANGE     2    /* A range (two multibyte chars) follows */
#define XCL_PROP      3    /* Unicode property (one property code) follows */
#define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
331 | |
332 | |
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
Note that whenever this list is updated, the two macro definitions that follow
must also be updated to match. */

enum {
  OP_END,            /* 0 End of pattern */

  /* Values corresponding to backslashed metacharacters */

  OP_SOD,            /* 1 Start of data: \A */
  OP_SOM,            /* 2 Start of match (subject + offset): \G */
  OP_NOT_WORD_BOUNDARY,  /*  3 \B */
  OP_WORD_BOUNDARY,      /*  4 \b */
  OP_NOT_DIGIT,          /*  5 \D */
  OP_DIGIT,              /*  6 \d */
  OP_NOT_WHITESPACE,     /*  7 \S */
  OP_WHITESPACE,         /*  8 \s */
  OP_NOT_WORDCHAR,       /*  9 \W */
  OP_WORDCHAR,           /* 10 \w */
  OP_ANY,            /* 11 Match any character */
  OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
  OP_NOTPROP,        /* 13 \P (not Unicode property) */
  OP_PROP,           /* 14 \p (Unicode property) */
  OP_EXTUNI,         /* 15 \X (extended Unicode sequence) */
  OP_EODN,           /* 16 End of data or \n at end of data: \Z. */
  OP_EOD,            /* 17 End of data: \z */

  OP_OPT,            /* 18 Set runtime options */
  OP_CIRC,           /* 19 Start of line - varies with multiline switch */
  OP_DOLL,           /* 20 End of line - varies with multiline switch */
  OP_CHAR,           /* 21 Match one character, casefully */
  OP_CHARNC,         /* 22 Match one character, caselessly */
  OP_NOT,            /* 23 Match anything but the following char */

  OP_STAR,           /* 24 The maximizing and minimizing versions of */
  OP_MINSTAR,        /* 25 all these opcodes must come in pairs, with */
  OP_PLUS,           /* 26 the minimizing one second. */
  OP_MINPLUS,        /* 27 This first set applies to single characters */
  OP_QUERY,          /* 28 */
  OP_MINQUERY,       /* 29 */
  OP_UPTO,           /* 30 From 0 to n matches */
  OP_MINUPTO,        /* 31 */
  OP_EXACT,          /* 32 Exactly n matches */

  OP_NOTSTAR,        /* 33 The maximizing and minimizing versions of */
  OP_NOTMINSTAR,     /* 34 all these opcodes must come in pairs, with */
  OP_NOTPLUS,        /* 35 the minimizing one second. */
  OP_NOTMINPLUS,     /* 36 This set applies to "not" single characters */
  OP_NOTQUERY,       /* 37 */
  OP_NOTMINQUERY,    /* 38 */
  OP_NOTUPTO,        /* 39 From 0 to n matches */
  OP_NOTMINUPTO,     /* 40 */
  OP_NOTEXACT,       /* 41 Exactly n matches */

  OP_TYPESTAR,       /* 42 The maximizing and minimizing versions of */
  OP_TYPEMINSTAR,    /* 43 all these opcodes must come in pairs, with */
  OP_TYPEPLUS,       /* 44 the minimizing one second. These codes must */
  OP_TYPEMINPLUS,    /* 45 be in exactly the same order as those above. */
  OP_TYPEQUERY,      /* 46 This set applies to character types such as \d */
  OP_TYPEMINQUERY,   /* 47 */
  OP_TYPEUPTO,       /* 48 From 0 to n matches */
  OP_TYPEMINUPTO,    /* 49 */
  OP_TYPEEXACT,      /* 50 Exactly n matches */

  OP_CRSTAR,         /* 51 The maximizing and minimizing versions of */
  OP_CRMINSTAR,      /* 52 all these opcodes must come in pairs, with */
  OP_CRPLUS,         /* 53 the minimizing one second. These codes must */
  OP_CRMINPLUS,      /* 54 be in exactly the same order as those above. */
  OP_CRQUERY,        /* 55 These are for character classes and back refs */
  OP_CRMINQUERY,     /* 56 */
  OP_CRRANGE,        /* 57 These are different to the three sets above. */
  OP_CRMINRANGE,     /* 58 */

  OP_CLASS,          /* 59 Match a character class, chars < 256 only */
  OP_NCLASS,         /* 60 Same, but the bitmap was created from a negative
                           class - the difference is relevant only when a UTF-8
                           character > 255 is encountered. */

  OP_XCLASS,         /* 61 Extended class for handling UTF-8 chars within the
                           class. This does both positive and negative. */

  OP_REF,            /* 62 Match a back reference */
  OP_RECURSE,        /* 63 Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,        /* 64 Call out to external function if provided */

  OP_ALT,            /* 65 Start of alternation */
  OP_KET,            /* 66 End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,        /* 67 These two must remain together and in this */
  OP_KETRMIN,        /* 68 order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

  OP_ASSERT,         /* 69 Positive lookahead */
  OP_ASSERT_NOT,     /* 70 Negative lookahead */
  OP_ASSERTBACK,     /* 71 Positive lookbehind */
  OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
  OP_REVERSE,        /* 73 Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE,           /* 74 Once matched, don't back up into the subpattern */
  OP_COND,           /* 75 Conditional group */
  OP_CREF,           /* 76 Used to hold an extraction string number (cond ref) */

  OP_BRAZERO,        /* 77 These two must remain together and in this */
  OP_BRAMINZERO,     /* 78 order. */

  OP_BRANUMBER,      /* 79 Used for extracting brackets whose number is greater
                           than can fit into an opcode. */

  OP_BRA             /* 80 This and greater values are used for brackets that
                           extract substrings up to EXTRACT_BASIC_MAX. After
                           that, use is made of OP_BRANUMBER. */
};
450 | |
451 | /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and |
452 | study.c that all opcodes are less than 128 in value. This makes handling UTF-8 |
453 | character sequences easier. */ |
454 | |
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts higher than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */

#define EXTRACT_BASIC_MAX  100
462 | |
463 | |
/* This macro defines textual names for all the opcodes. These are used only
for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
macro is referenced only in printint.c. The list has one entry per opcode, in
opcode order, ending with the entry for OP_BRA. */

#define OP_NAME_LIST \
  "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte",                   \
  "notprop", "prop", "extuni",                                    \
  "\\Z", "\\z",                                                   \
  "Opt", "^", "$", "char", "charnc", "not",                       \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
  "class", "nclass", "xclass", "Ref", "Recurse", "Callout",       \
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",     \
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
  "Brazero", "Braminzero", "Branumber", "Bra"
482 | |
483 | |
/* This macro defines the length of fixed length operations in the compiled
regex. The lengths are used when searching for specific things, and also in the
debugging printing of a compiled regex. We use a macro so that it can be
incorporated both into pcre.c and pcretest.c without being publicly exposed.

As things have been extended, some of these are no longer fixed lengths, but are
minima instead. For example, the length of a single-character repeat may vary
in UTF-8 mode. The code that uses this table must know about such things.

There is one entry per opcode, in opcode order. Entries that contain a link
depend on LINK_SIZE. */

#define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
  1, 1,                          /* Any, Anybyte                           */ \
  2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
  2,                             /* Char  - the minimum length             */ \
  2,                             /* Charnc  - the minimum length           */ \
  2,                             /* not                                    */ \
  /* Positive single-char repeats                            ** These are  */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \
  4, 4, 4,                       /* upto, minupto, exact     ** UTF-8 mode */ \
  /* Negative single-char repeats - only for chars < 256                   */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
  4, 4, 4,                       /* NOT upto, minupto, exact               */ \
  /* Positive type repeats                                                 */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
  4, 4, 4,                       /* Type upto, minupto, exact              */ \
  /* Character class & ref repeats                                         */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
  5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
 33,                             /* CLASS                                  */ \
 33,                             /* NCLASS                                 */ \
  0,                             /* XCLASS - variable length               */ \
  3,                             /* REF                                    */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
  2+2*LINK_SIZE,                 /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
  1+LINK_SIZE,                   /* KetRmin                                */ \
  1+LINK_SIZE,                   /* Assert                                 */ \
  1+LINK_SIZE,                   /* Assert not                             */ \
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* Reverse                                */ \
  1+LINK_SIZE,                   /* Once                                   */ \
  1+LINK_SIZE,                   /* COND                                   */ \
  3,                             /* CREF                                   */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
  3,                             /* BRANUMBER                              */ \
  1+LINK_SIZE                    /* BRA                                    */
535 | |
536 | |
/* A magic value for OP_CREF to indicate the "in recursion" condition. */

#define CREF_RECURSE  0xffff
540 | |
/* The texts of compile-time error messages are defined as macros here so that
they can be accessed by the POSIX wrapper and converted into error codes. Yes,
I could have used error codes in the first place, but didn't feel like changing
just to accommodate the POSIX wrapper. */

#define ERR1  "\\ at end of pattern"
#define ERR2  "\\c at end of pattern"
#define ERR3  "unrecognized character follows \\"
#define ERR4  "numbers out of order in {} quantifier"
#define ERR5  "number too big in {} quantifier"
#define ERR6  "missing terminating ] for character class"
#define ERR7  "invalid escape sequence in character class"
#define ERR8  "range out of order in character class"
#define ERR9  "nothing to repeat"
#define ERR10 "operand of unlimited repeat could match the empty string"
#define ERR11 "internal error: unexpected repeat"
#define ERR12 "unrecognized character after (?"
#define ERR13 "POSIX named classes are supported only within a class"
#define ERR14 "missing )"
#define ERR15 "reference to non-existent subpattern"
#define ERR16 "erroffset passed as NULL"
#define ERR17 "unknown option bit(s) set"
#define ERR18 "missing ) after comment"
#define ERR19 "parentheses nested too deeply"
#define ERR20 "regular expression too large"
#define ERR21 "failed to get memory"
#define ERR22 "unmatched parentheses"
#define ERR23 "internal error: code overflow"
#define ERR24 "unrecognized character after (?<"
#define ERR25 "lookbehind assertion is not fixed length"
#define ERR26 "malformed number after (?("
#define ERR27 "conditional group contains more than two branches"
#define ERR28 "assertion expected after (?("
#define ERR29 "(?R or (?digits must be followed by )"
#define ERR30 "unknown POSIX class name"
#define ERR31 "POSIX collating elements are not supported"
#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33 "spare error"
#define ERR34 "character value in \\x{...} sequence is too large"
#define ERR35 "invalid condition (?(0)"
#define ERR36 "\\C not allowed in lookbehind assertion"
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38 "number after (?C is > 255"
#define ERR39 "closing ) for (?C expected"
#define ERR40 "recursive call could loop indefinitely"
#define ERR41 "unrecognized character after (?P"
#define ERR42 "syntax error after (?P"
#define ERR43 "two named groups have the same name"
#define ERR44 "invalid UTF-8 string"
#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
#define ERR46 "malformed \\P or \\p sequence"
#define ERR47 "unknown property name after \\P or \\p"
593 | |
594 | /* The real format of the start of the pcre block; the index of names and the |
595 | code vector run on as long as necessary after the end. We store an explicit |
596 | offset to the name table so that if a regex is compiled on one host, saved, and |
597 | then run on another where the size of pointers is different, all might still |
598 | be well. For the case of compiled-on-4 and run-on-8, we include an extra |
599 | pointer that is always NULL. For future-proofing, we also include a few dummy |
600 | fields - even though you can never get this planning right! |
601 | |
602 | NOTE NOTE NOTE: |
603 | Because people can now save and re-use compiled patterns, any additions to this |
604 | structure should be made at the end, and something earlier (e.g. a new |
605 | flag in the options or one of the dummy fields) should indicate that the new |
606 | fields are present. Currently PCRE always sets the dummy fields to zero. |
607 | NOTE NOTE NOTE: |
608 | */ |
609 | |
610 | typedef struct real_pcre { |
611 | pcre_uint32 magic_number; |
612 | pcre_uint32 size; /* Total that was malloced */ |
613 | pcre_uint32 options; |
614 | pcre_uint32 dummy1; /* For future use, maybe */ |
615 | |
616 | pcre_uint16 top_bracket; |
617 | pcre_uint16 top_backref; |
618 | pcre_uint16 first_byte; |
619 | pcre_uint16 req_byte; |
620 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
621 | pcre_uint16 name_entry_size; /* Size of any name items */ |
622 | pcre_uint16 name_count; /* Number of name items */ |
623 | pcre_uint16 dummy2; /* For future use, maybe */ |
624 | |
625 | const unsigned char *tables; /* Pointer to tables or NULL for std */ |
626 | const unsigned char *nullpad; /* NULL padding */ |
627 | } real_pcre; |
628 | |
629 | /* The format of the block used to store data from pcre_study(). The same |
630 | remark (see NOTE above) about extending this structure applies. */ |
631 | |
632 | typedef struct pcre_study_data { |
633 | pcre_uint32 size; /* Total that was malloced */ |
634 | pcre_uint32 options; |
635 | uschar start_bits[32]; |
636 | } pcre_study_data; |
637 | |
638 | /* Structure for passing "static" information around between the functions |
639 | doing the compiling, so that they are thread-safe. */ |
640 | |
641 | typedef struct compile_data { |
642 | const uschar *lcc; /* Points to lower casing table */ |
643 | const uschar *fcc; /* Points to case-flipping table */ |
644 | const uschar *cbits; /* Points to character type table */ |
645 | const uschar *ctypes; /* Points to table of type maps */ |
646 | const uschar *start_code; /* The start of the compiled code */ |
647 | const uschar *start_pattern; /* The start of the pattern */ |
648 | uschar *name_table; /* The name/number table */ |
649 | int names_found; /* Number of entries so far */ |
650 | int name_entry_size; /* Size of each entry */ |
651 | int top_backref; /* Maximum back reference */ |
652 | unsigned int backref_map; /* Bitmap of low back refs */ |
653 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
654 | BOOL nopartial; /* Set TRUE if partial won't work */ |
655 | } compile_data; |
656 | |
657 | /* Structure for maintaining a chain of pointers to the currently incomplete |
658 | branches, for testing for left recursion. */ |
659 | |
660 | typedef struct branch_chain { |
661 | struct branch_chain *outer; |
662 | uschar *current; |
663 | } branch_chain; |
664 | |
665 | /* Structure for items in a linked list that represents an explicit recursive |
666 | call within the pattern. */ |
667 | |
668 | typedef struct recursion_info { |
669 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
670 | int group_num; /* Number of group that was called */ |
671 | const uschar *after_call; /* "Return value": points after the call in the expr */ |
672 | const uschar *save_start; /* Old value of md->start_match */ |
673 | int *offset_save; /* Pointer to start of saved offsets */ |
674 | int saved_max; /* Number of saved offsets */ |
675 | } recursion_info; |
676 | |
/* When compiling in a mode that doesn't use recursive calls to match(),
a structure is used to remember local variables on the heap. It is defined in
pcre.c, close to the match() function, so that it is easy to keep it in step
with any changes of local variable. However, the pointer to the current frame
must be saved in some "static" place over a longjmp(). We declare the
structure here so that we can put a pointer in the match_data structure.
NOTE: This isn't used for a "normal" compilation of pcre. */

struct heapframe;
686 | |
687 | /* Structure for passing "static" information around between the functions |
688 | doing the matching, so that they are thread-safe. */ |
689 | |
690 | typedef struct match_data { |
691 | unsigned long int match_call_count; /* As it says */ |
692 | unsigned long int match_limit;/* As it says */ |
693 | int *offset_vector; /* Offset vector */ |
694 | int offset_end; /* One past the end */ |
695 | int offset_max; /* The maximum usable for return data */ |
696 | const uschar *lcc; /* Points to lower casing table */ |
697 | const uschar *ctypes; /* Points to table of type maps */ |
698 | BOOL offset_overflow; /* Set if too many extractions */ |
699 | BOOL notbol; /* NOTBOL flag */ |
700 | BOOL noteol; /* NOTEOL flag */ |
701 | BOOL utf8; /* UTF8 flag */ |
702 | BOOL endonly; /* Dollar not before final \n */ |
703 | BOOL notempty; /* Empty string match not wanted */ |
704 | BOOL partial; /* PARTIAL flag */ |
705 | BOOL hitend; /* Hit the end of the subject at some point */ |
706 | const uschar *start_code; /* For use when recursing */ |
707 | const uschar *start_subject; /* Start of the subject string */ |
708 | const uschar *end_subject; /* End of the subject string */ |
709 | const uschar *start_match; /* Start of this match attempt */ |
710 | const uschar *end_match_ptr; /* Subject position at end match */ |
711 | int end_offset_top; /* Highwater mark at end of match */ |
712 | int capture_last; /* Most recent capture number */ |
713 | int start_offset; /* The start offset value */ |
714 | recursion_info *recursive; /* Linked list of recursion data */ |
715 | void *callout_data; /* To pass back to callouts */ |
716 | struct heapframe *thisframe; /* Used only when compiling for no recursion */ |
717 | } match_data; |
718 | |
/* Bit definitions for entries in the pcre_ctypes table. */

#define ctype_space   0x01
#define ctype_letter  0x02
#define ctype_digit   0x04
#define ctype_xdigit  0x08
#define ctype_word    0x10   /* alphameric or '_' */
#define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
727 | |
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
of bits for a class map. Some classes are built by combining these tables.
Consecutive offsets are 32 bytes apart. */

#define cbit_space     0      /* [:space:] or \s */
#define cbit_xdigit   32      /* [:xdigit:] */
#define cbit_digit    64      /* [:digit:] or \d */
#define cbit_upper    96      /* [:upper:] */
#define cbit_lower   128      /* [:lower:] */
#define cbit_word    160      /* [:word:] or \w */
#define cbit_graph   192      /* [:graph:] */
#define cbit_print   224      /* [:print:] */
#define cbit_punct   256      /* [:punct:] */
#define cbit_cntrl   288      /* [:cntrl:] */
#define cbit_length  320      /* Length of the cbits table */
742 | |
/* Offsets of the various tables from the base tables pointer, and
total length. The ctypes table follows the cbits table, so its offset is
derived from cbit_length. */

#define lcc_offset      0
#define fcc_offset    256
#define cbits_offset  512
#define ctypes_offset (cbits_offset + cbit_length)
#define tables_length (ctypes_offset + 256)
751 | |
752 | /* End of internal.h */ |
753 | [EOF] |