| 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | |
| 6 | /* This is a library of functions to support regular expressions whose syntax |
| 7 | and semantics are as close as possible to those of the Perl 5 language. See |
| 8 | the file doc/Tech.Notes for some information on the internals. |
| 9 | |
| 10 | Written by: Philip Hazel <ph10@cam.ac.uk> |
| 11 | |
| 12 | Copyright (c) 1997-2004 University of Cambridge |
| 13 | |
| 14 | ----------------------------------------------------------------------------- |
| 15 | Redistribution and use in source and binary forms, with or without |
| 16 | modification, are permitted provided that the following conditions are met: |
| 17 | |
| 18 | * Redistributions of source code must retain the above copyright notice, |
| 19 | this list of conditions and the following disclaimer. |
| 20 | |
| 21 | * Redistributions in binary form must reproduce the above copyright |
| 22 | notice, this list of conditions and the following disclaimer in the |
| 23 | documentation and/or other materials provided with the distribution. |
| 24 | |
| 25 | * Neither the name of the University of Cambridge nor the names of its |
| 26 | contributors may be used to endorse or promote products derived from |
| 27 | this software without specific prior written permission. |
| 28 | |
| 29 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 30 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 31 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 32 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 33 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 34 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 35 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 36 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 37 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 38 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 39 | POSSIBILITY OF SUCH DAMAGE. |
| 40 | ----------------------------------------------------------------------------- |
| 41 | */ |
| 42 | |
| 43 | /* This header contains definitions that are shared between the different |
| 44 | modules, but which are not relevant to the outside. */ |
| 45 | |
| 46 | /* Get the definitions provided by running "configure" */ |
| 47 | |
| 48 | #include "config.h" |
| 49 | |
| 50 | /* Standard C headers plus the external interface definition. The only time |
| 51 | setjmp and stdarg are used is when NO_RECURSE is set. */ |
| 52 | |
| 53 | #include <ctype.h> |
| 54 | #include <limits.h> |
| 55 | #include <setjmp.h> |
| 56 | #include <stdarg.h> |
| 57 | #include <stddef.h> |
| 58 | #include <stdio.h> |
| 59 | #include <stdlib.h> |
| 60 | #include <string.h> |
| 61 | |
| 62 | #ifndef PCRE_SPY |
| 63 | #define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ |
| 64 | #endif |
| 65 | |
| 66 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We |
| 67 | cannot determine these outside the compilation (e.g. by running a program as |
| 68 | part of "configure") because PCRE is often cross-compiled for use on other |
| 69 | systems. Instead we make use of the maximum sizes that are available at |
| 70 | preprocessor time in standard C environments. */ |
| 71 | |
/* Select the unsigned 16-bit and 32-bit types purely from the <limits.h>
maxima, which the preprocessor can test even when cross-compiling. The 32-bit
constant carries a U suffix so that it is unsigned on every compiler. */

#if USHRT_MAX == 65535
  typedef unsigned short pcre_uint16;
#elif UINT_MAX == 65535
  typedef unsigned int pcre_uint16;
#else
#error Cannot determine a type for 16-bit unsigned integers
#endif

#if UINT_MAX == 4294967295U
  typedef unsigned int pcre_uint32;
#elif ULONG_MAX == 4294967295U
  typedef unsigned long int pcre_uint32;
#else
#error Cannot determine a type for 32-bit unsigned integers
#endif
| 87 | |
| 88 | /* All character handling must be done as unsigned characters. Otherwise there |
| 89 | are problems with top-bit-set characters and functions such as isspace(). |
| 90 | However, we leave the interface to the outside world as char *, because that |
| 91 | should make things easier for callers. We define a short type for unsigned char |
| 92 | to save lots of typing. I tried "uchar", but it causes problems on Digital |
| 93 | Unix, where it is defined in sys/types, so use "uschar" instead. */ |
| 94 | |
/* Short name for unsigned char, used for all internal character handling. */
typedef unsigned char uschar;
| 96 | |
| 97 | /* Include the public PCRE header */ |
| 98 | |
| 99 | #include "pcre.h" |
| 100 | |
| 101 | /* When compiling for use with the Virtual Pascal compiler, these functions |
| 102 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
| 103 | option on the command line. */ |
| 104 | |
| 105 | #ifdef VPCOMPAT |
| 106 | #define strncmp(s1,s2,m) _strncmp(s1,s2,m) |
| 107 | #define memcpy(d,s,n) _memcpy(d,s,n) |
| 108 | #define memmove(d,s,n) _memmove(d,s,n) |
| 109 | #define memset(s,c,n) _memset(s,c,n) |
| 110 | #else /* VPCOMPAT */ |
| 111 | |
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). This
assumes that all calls to memmove are moving strings upwards in store, which is
the case in PCRE. */
| 118 | |
| 119 | #if ! HAVE_MEMMOVE |
| 120 | #undef memmove /* some systems may have a macro */ |
| 121 | #if HAVE_BCOPY |
| 122 | #define memmove(a, b, c) bcopy(b, a, c) |
| 123 | #else /* HAVE_BCOPY */ |
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().
The copy runs from the highest address downwards, so overlapping regions are
handled correctly only when dest is at or above src.

Arguments:
  dest      where to copy to
  src       where to copy from
  n         number of bytes to copy

Returns:    the original value of dest, as memmove() does
*/

void *
pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
{
size_t i;                  /* same type as n, so no signed/unsigned compare */
dest += n;
src += n;
for (i = 0; i < n; ++i) *(--dest) = *(--src);
return dest;               /* n decrements have put dest back at its start */
}
| 132 | #define memmove(a, b, c) pcre_memmove(a, b, c) |
| 133 | #endif /* not HAVE_BCOPY */ |
| 134 | #endif /* not HAVE_MEMMOVE */ |
| 135 | #endif /* not VPCOMPAT */ |
| 136 | |
| 137 | |
| 138 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
| 139 | in big-endian order) by default. These are used, for example, to link from the |
| 140 | start of a subpattern to its alternatives and its end. The use of 2 bytes per |
| 141 | offset limits the size of the compiled regex to around 64K, which is big enough |
| 142 | for almost everybody. However, I received a request for an even bigger limit. |
| 143 | For this reason, and also to make the code easier to maintain, the storing and |
| 144 | loading of offsets from the byte string is now handled by the macros that are |
| 145 | defined here. |
| 146 | |
| 147 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
| 148 | the config.h file, but can be overridden by using -D on the command line. This |
| 149 | is automated on Unix systems via the "configure" command. */ |
| 150 | |
/* PUT(a,n,d) stores the offset d as LINK_SIZE bytes, most significant byte
first, starting at a[n]. GET(a,n) reads such a value back. MAX_PATTERN_SIZE is
the limit on compiled pattern size for the chosen LINK_SIZE. The macros
evaluate their arguments more than once, so arguments must not have side
effects. The array argument is parenthesized so that an expression such as
"code + 1" can be passed. */

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  ((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  ((a)[n] = (d) >> 16),    \
  ((a)[(n)+1] = (d) >> 8), \
  ((a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  ((a)[n] = (d) >> 24),     \
  ((a)[(n)+1] = (d) >> 16), \
  ((a)[(n)+2] = (d) >> 8),  \
  ((a)[(n)+3] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif
| 193 | |
| 194 | |
| 195 | /* Convenience macro defined in terms of the others */ |
| 196 | |
/* Store an offset with PUT and then advance the pointer a past it by
LINK_SIZE bytes. The first argument must be a modifiable pointer. */
#define PUTINC(a,n,d)   PUT(a,n,d), a += LINK_SIZE
| 198 | |
| 199 | |
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. These are used for repeat counts and for other things such as
capturing parenthesis numbers in back references. */
| 203 | |
/* PUT2(a,n,d) stores d as two bytes, most significant first, at a[n] and
a[n+1]; GET2(a,n) reads the pair back. PUT2 is a single parenthesized comma
expression so that it behaves as one statement, for example as the body of an
unbraced "if". PUT2INC additionally advances the pointer a by 2, so its first
argument must be a modifiable pointer. Arguments are evaluated more than once
and must not have side effects. */

#define PUT2(a,n,d)   \
  (((a)[n] = (d) >> 8), \
   ((a)[(n)+1] = (d) & 255))

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  PUT2(a,n,d), a += 2
| 212 | |
| 213 | |
| 214 | /* In case there is no definition of offsetof() provided - though any proper |
| 215 | Standard C system should have one. */ |
| 216 | |
/* Fallback used only when no offsetof macro is already defined: take the
address of the field within a structure placed at address zero. */
#ifndef offsetof
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
| 220 | |
| 221 | |
| 222 | /* These are the public options that can change during matching. */ |
| 223 | |
/* Mask of the caseless, multiline and dotall option bits. */
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
| 225 | |
| 226 | /* Private options flags start at the most significant end of the four bytes, |
| 227 | but skip the top bit so we can use ints for convenience without getting tangled |
| 228 | with negative values. The public options defined in pcre.h start at the least |
| 229 | significant end. Make sure they don't overlap, though now that we have expanded |
| 230 | to four bytes, there is plenty of space. */ |
| 231 | |
/* Private option bits, allocated downwards from bit 30 (the top bit is left
clear so that the values stay positive when held in an int). */

#define PCRE_FIRSTSET      0x40000000  /* first_byte is set */
#define PCRE_REQCHSET      0x20000000  /* req_byte is set */
#define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
#define PCRE_ICHANGED      0x08000000  /* i option changes within regex */
#define PCRE_NOPARTIAL     0x04000000  /* can't use partial with this regex */

/* Options for the "extra" block produced by pcre_study(). */

#define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
| 241 | |
| 242 | /* Masks for identifying the public options which are permitted at compile |
| 243 | time, run time or study time, respectively. */ |
| 244 | |
/* Masks of the public option bits accepted at compile time, at match time,
and at study time respectively. */

#define PUBLIC_OPTIONS \
  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
   PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT)

#define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
   PCRE_PARTIAL)

#define PUBLIC_STUDY_OPTIONS 0   /* None defined */
| 255 | |
| 256 | /* Magic number to provide a small check against being handed junk. */ |
| 257 | |
/* Magic number stored in a compiled pattern; its bytes spell 'PCRE'. */

#define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */

/* Negative values for the firstchar and reqchar variables */

#define REQ_UNSET (-2)
#define REQ_NONE  (-1)

/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
variable-length repeat, or anything other than literal characters. */

#define REQ_CASELESS 0x0100    /* indicates caselessness */
#define REQ_VARY     0x0200    /* reqbyte followed non-literal item */

/* Miscellaneous definitions */

typedef int BOOL;

#define FALSE   0
#define TRUE    1
| 277 | |
| 278 | /* Escape items that are just an encoding of a particular data value. Note that |
| 279 | ESC_n is defined as yet another macro, which is set in config.h to either \n |
| 280 | (the default) or \r (which some people want). */ |
| 281 | |
/* Character values for the simple escapes. Each one can be overridden by
defining it before this point. ESC_n expands to NEWLINE, which is not defined
in this file. */

#ifndef ESC_e
#define ESC_e 27
#endif

#ifndef ESC_f
#define ESC_f '\f'
#endif

#ifndef ESC_n
#define ESC_n NEWLINE
#endif

#ifndef ESC_r
#define ESC_r '\r'
#endif

/* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */

#ifndef ESC_tee
#define ESC_tee '\t'
#endif
| 304 | |
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape greater than ESC_b and less than ESC_Z to
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */
| 315 | |
/* Escape identifiers, numbered consecutively from 1; ESC_REF is the last. */
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
       ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
       ESC_Q, ESC_REF };
| 319 | |
| 320 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
| 321 | contain UTF-8 characters with values greater than 255. */ |
| 322 | |
/* Flag bits for an extended class */

#define XCL_NOT    0x01    /* Flag: this is a negative class */
#define XCL_MAP    0x02    /* Flag: a 32-byte map is present */

/* Item type codes within an extended class */

#define XCL_END       0    /* Marks end of individual items */
#define XCL_SINGLE    1    /* Single item (one multibyte char) follows */
#define XCL_RANGE     2    /* A range (two multibyte chars) follows */
#define XCL_PROP      3    /* Unicode property (one property code) follows */
#define XCL_NOTPROP   4    /* Unicode inverted property (ditto) */
| 331 | |
| 332 | |
| 333 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
| 334 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to |
| 335 | OP_EOD must correspond in order to the list of escapes immediately above. |
| 336 | Note that whenever this list is updated, the two macro definitions that follow |
| 337 | must also be updated to match. */ |
| 338 | |
/* Opcode numbers. The values are consecutive from 0; the number in each
comment is the value of that enumerator. OP_BRA is the last one. */

enum {
  OP_END,            /* 0 End of pattern */

  /* Values corresponding to backslashed metacharacters */

  OP_SOD,            /* 1 Start of data: \A */
  OP_SOM,            /* 2 Start of match (subject + offset): \G */
  OP_NOT_WORD_BOUNDARY,  /*  3 \B */
  OP_WORD_BOUNDARY,      /*  4 \b */
  OP_NOT_DIGIT,          /*  5 \D */
  OP_DIGIT,              /*  6 \d */
  OP_NOT_WHITESPACE,     /*  7 \S */
  OP_WHITESPACE,         /*  8 \s */
  OP_NOT_WORDCHAR,       /*  9 \W */
  OP_WORDCHAR,           /* 10 \w */
  OP_ANY,            /* 11 Match any character */
  OP_ANYBYTE,        /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
  OP_NOTPROP,        /* 13 \P (not Unicode property) */
  OP_PROP,           /* 14 \p (Unicode property) */
  OP_EXTUNI,         /* 15 \X (extended Unicode sequence) */
  OP_EODN,           /* 16 End of data or \n at end of data: \Z. */
  OP_EOD,            /* 17 End of data: \z */

  OP_OPT,            /* 18 Set runtime options */
  OP_CIRC,           /* 19 Start of line - varies with multiline switch */
  OP_DOLL,           /* 20 End of line - varies with multiline switch */
  OP_CHAR,           /* 21 Match one character, casefully */
  OP_CHARNC,         /* 22 Match one character, caselessly */
  OP_NOT,            /* 23 Match anything but the following char */

  OP_STAR,           /* 24 The maximizing and minimizing versions of */
  OP_MINSTAR,        /* 25 all these opcodes must come in pairs, with */
  OP_PLUS,           /* 26 the minimizing one second. */
  OP_MINPLUS,        /* 27 This first set applies to single characters */
  OP_QUERY,          /* 28 */
  OP_MINQUERY,       /* 29 */
  OP_UPTO,           /* 30 From 0 to n matches */
  OP_MINUPTO,        /* 31 */
  OP_EXACT,          /* 32 Exactly n matches */

  OP_NOTSTAR,        /* 33 The maximizing and minimizing versions of */
  OP_NOTMINSTAR,     /* 34 all these opcodes must come in pairs, with */
  OP_NOTPLUS,        /* 35 the minimizing one second. */
  OP_NOTMINPLUS,     /* 36 This set applies to "not" single characters */
  OP_NOTQUERY,       /* 37 */
  OP_NOTMINQUERY,    /* 38 */
  OP_NOTUPTO,        /* 39 From 0 to n matches */
  OP_NOTMINUPTO,     /* 40 */
  OP_NOTEXACT,       /* 41 Exactly n matches */

  OP_TYPESTAR,       /* 42 The maximizing and minimizing versions of */
  OP_TYPEMINSTAR,    /* 43 all these opcodes must come in pairs, with */
  OP_TYPEPLUS,       /* 44 the minimizing one second. These codes must */
  OP_TYPEMINPLUS,    /* 45 be in exactly the same order as those above. */
  OP_TYPEQUERY,      /* 46 This set applies to character types such as \d */
  OP_TYPEMINQUERY,   /* 47 */
  OP_TYPEUPTO,       /* 48 From 0 to n matches */
  OP_TYPEMINUPTO,    /* 49 */
  OP_TYPEEXACT,      /* 50 Exactly n matches */

  OP_CRSTAR,         /* 51 The maximizing and minimizing versions of */
  OP_CRMINSTAR,      /* 52 all these opcodes must come in pairs, with */
  OP_CRPLUS,         /* 53 the minimizing one second. These codes must */
  OP_CRMINPLUS,      /* 54 be in exactly the same order as those above. */
  OP_CRQUERY,        /* 55 These are for character classes and back refs */
  OP_CRMINQUERY,     /* 56 */
  OP_CRRANGE,        /* 57 These are different to the three sets above. */
  OP_CRMINRANGE,     /* 58 */

  OP_CLASS,          /* 59 Match a character class, chars < 256 only */
  OP_NCLASS,         /* 60 Same, but the bitmap was created from a negative
                           class - the difference is relevant only when a UTF-8
                           character > 255 is encountered. */

  OP_XCLASS,         /* 61 Extended class for handling UTF-8 chars within the
                           class. This does both positive and negative. */

  OP_REF,            /* 62 Match a back reference */
  OP_RECURSE,        /* 63 Match a numbered subpattern (possibly recursive) */
  OP_CALLOUT,        /* 64 Call out to external function if provided */

  OP_ALT,            /* 65 Start of alternation */
  OP_KET,            /* 66 End of group that doesn't have an unbounded repeat */
  OP_KETRMAX,        /* 67 These two must remain together and in this */
  OP_KETRMIN,        /* 68 order. They are for groups that repeat for ever. */

  /* The assertions must come before ONCE and COND */

  OP_ASSERT,         /* 69 Positive lookahead */
  OP_ASSERT_NOT,     /* 70 Negative lookahead */
  OP_ASSERTBACK,     /* 71 Positive lookbehind */
  OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
  OP_REVERSE,        /* 73 Move pointer back - used in lookbehind assertions */

  /* ONCE and COND must come after the assertions, with ONCE first, as there's
  a test for >= ONCE for a subpattern that isn't an assertion. */

  OP_ONCE,           /* 74 Once matched, don't back up into the subpattern */
  OP_COND,           /* 75 Conditional group */
  OP_CREF,           /* 76 Used to hold an extraction string number (cond ref) */

  OP_BRAZERO,        /* 77 These two must remain together and in this */
  OP_BRAMINZERO,     /* 78 order. */

  OP_BRANUMBER,      /* 79 Used for extracting brackets whose number is greater
                           than can fit into an opcode. */

  OP_BRA             /* 80 This and greater values are used for brackets that
                           extract substrings up to EXTRACT_BASIC_MAX. After
                           that, use is made of OP_BRANUMBER. */
};
| 450 | |
| 451 | /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and |
| 452 | study.c that all opcodes are less than 128 in value. This makes handling UTF-8 |
| 453 | character sequences easier. */ |
| 454 | |
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts higher than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */
| 460 | |
#define EXTRACT_BASIC_MAX  100
| 462 | |
| 463 | |
/* This macro defines textual names for all the opcodes. These are used only
for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
macro is referenced only in printint.c. */
| 467 | |
/* Comma-separated list of 81 string literals, one per opcode from OP_END (0)
to OP_BRA (80), in opcode order. Intended for use as an array initializer. */

#define OP_NAME_LIST \
  "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d",                \
  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte",                   \
  "notprop", "prop", "extuni",                                    \
  "\\Z", "\\z",                                                   \
  "Opt", "^", "$", "char", "charnc", "not",                       \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",                 \
  "*", "*?", "+", "+?", "?", "??", "{", "{",                      \
  "class", "nclass", "xclass", "Ref", "Recurse", "Callout",       \
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",     \
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
  "Brazero", "Braminzero", "Branumber", "Bra"
| 482 | |
| 483 | |
/* This macro defines the length of fixed length operations in the compiled
regex. The lengths are used when searching for specific things, and also in the
debugging printing of a compiled regex. We use a macro so that it can be
incorporated both into pcre.c and pcretest.c without being publicly exposed.

As things have been extended, some of these are no longer fixed lengths, but
are minima instead. For example, the length of a single-character repeat may
vary in UTF-8 mode. The code that uses this table must know about such things. */
| 492 | |
/* Comma-separated list of 81 lengths, one per opcode from OP_END (0) to
OP_BRA (80), in opcode order. Entries that contain a link depend on LINK_SIZE.
The last entry has no line continuation, so the macro ends on that line. */

#define OP_LENGTHS \
  1,                             /* End                                    */ \
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
  1, 1,                          /* Any, Anybyte                           */ \
  2, 2, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
  1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
  2,                             /* Char  - the minimum length             */ \
  2,                             /* Charnc  - the minimum length           */ \
  2,                             /* not                                    */ \
  /* Positive single-char repeats                            ** These are  */ \
  2, 2, 2, 2, 2, 2,              /* *, *?, +, +?, ?, ??      ** minima in  */ \
  4, 4, 4,                       /* upto, minupto, exact     ** UTF-8 mode */ \
  /* Negative single-char repeats - only for chars < 256                   */ \
  2, 2, 2, 2, 2, 2,              /* NOT *, *?, +, +?, ?, ??                */ \
  4, 4, 4,                       /* NOT upto, minupto, exact               */ \
  /* Positive type repeats                                                 */ \
  2, 2, 2, 2, 2, 2,              /* Type *, *?, +, +?, ?, ??               */ \
  4, 4, 4,                       /* Type upto, minupto, exact              */ \
  /* Character class & ref repeats                                         */ \
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */ \
  5, 5,                          /* CRRANGE, CRMINRANGE                    */ \
 33,                             /* CLASS                                  */ \
 33,                             /* NCLASS                                 */ \
  0,                             /* XCLASS - variable length               */ \
  3,                             /* REF                                    */ \
  1+LINK_SIZE,                   /* RECURSE                                */ \
  2+2*LINK_SIZE,                 /* CALLOUT                                */ \
  1+LINK_SIZE,                   /* Alt                                    */ \
  1+LINK_SIZE,                   /* Ket                                    */ \
  1+LINK_SIZE,                   /* KetRmax                                */ \
  1+LINK_SIZE,                   /* KetRmin                                */ \
  1+LINK_SIZE,                   /* Assert                                 */ \
  1+LINK_SIZE,                   /* Assert not                             */ \
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* Reverse                                */ \
  1+LINK_SIZE,                   /* Once                                   */ \
  1+LINK_SIZE,                   /* COND                                   */ \
  3,                             /* CREF                                   */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
  3,                             /* BRANUMBER                              */ \
  1+LINK_SIZE                    /* BRA                                    */
| 535 | |
| 536 | |
| 537 | /* A magic value for OP_CREF to indicate the "in recursion" condition. */ |
| 538 | |
#define CREF_RECURSE  0xffff
| 540 | |
| 541 | /* The texts of compile-time error messages are defined as macros here so that |
| 542 | they can be accessed by the POSIX wrapper and converted into error codes. Yes, |
| 543 | I could have used error codes in the first place, but didn't feel like changing |
| 544 | just to accommodate the POSIX wrapper. */ |
| 545 | |
/* Texts of the compile-time error messages, ERR1 to ERR47. */

#define ERR1  "\\ at end of pattern"
#define ERR2  "\\c at end of pattern"
#define ERR3  "unrecognized character follows \\"
#define ERR4  "numbers out of order in {} quantifier"
#define ERR5  "number too big in {} quantifier"
#define ERR6  "missing terminating ] for character class"
#define ERR7  "invalid escape sequence in character class"
#define ERR8  "range out of order in character class"
#define ERR9  "nothing to repeat"
#define ERR10 "operand of unlimited repeat could match the empty string"
#define ERR11 "internal error: unexpected repeat"
#define ERR12 "unrecognized character after (?"
#define ERR13 "POSIX named classes are supported only within a class"
#define ERR14 "missing )"
#define ERR15 "reference to non-existent subpattern"
#define ERR16 "erroffset passed as NULL"
#define ERR17 "unknown option bit(s) set"
#define ERR18 "missing ) after comment"
#define ERR19 "parentheses nested too deeply"
#define ERR20 "regular expression too large"
#define ERR21 "failed to get memory"
#define ERR22 "unmatched parentheses"
#define ERR23 "internal error: code overflow"
#define ERR24 "unrecognized character after (?<"
#define ERR25 "lookbehind assertion is not fixed length"
#define ERR26 "malformed number after (?("
#define ERR27 "conditional group contains more than two branches"
#define ERR28 "assertion expected after (?("
#define ERR29 "(?R or (?digits must be followed by )"
#define ERR30 "unknown POSIX class name"
#define ERR31 "POSIX collating elements are not supported"
#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support"
#define ERR33 "spare error"
#define ERR34 "character value in \\x{...} sequence is too large"
#define ERR35 "invalid condition (?(0)"
#define ERR36 "\\C not allowed in lookbehind assertion"
#define ERR37 "PCRE does not support \\L, \\l, \\N, \\U, or \\u"
#define ERR38 "number after (?C is > 255"
#define ERR39 "closing ) for (?C expected"
#define ERR40 "recursive call could loop indefinitely"
#define ERR41 "unrecognized character after (?P"
#define ERR42 "syntax error after (?P"
#define ERR43 "two named groups have the same name"
#define ERR44 "invalid UTF-8 string"
#define ERR45 "support for \\P, \\p, and \\X has not been compiled"
#define ERR46 "malformed \\P or \\p sequence"
#define ERR47 "unknown property name after \\P or \\p"
| 593 | |
| 594 | /* The real format of the start of the pcre block; the index of names and the |
| 595 | code vector run on as long as necessary after the end. We store an explicit |
| 596 | offset to the name table so that if a regex is compiled on one host, saved, and |
| 597 | then run on another where the size of pointers is different, all might still |
| 598 | be well. For the case of compiled-on-4 and run-on-8, we include an extra |
| 599 | pointer that is always NULL. For future-proofing, we also include a few dummy |
| 600 | fields - even though you can never get this planning right! |
| 601 | |
| 602 | NOTE NOTE NOTE: |
| 603 | Because people can now save and re-use compiled patterns, any additions to this |
| 604 | structure should be made at the end, and something earlier (e.g. a new |
| 605 | flag in the options or one of the dummy fields) should indicate that the new |
| 606 | fields are present. Currently PCRE always sets the dummy fields to zero. |
| 607 | NOTE NOTE NOTE: |
| 608 | */ |
| 609 | |
| 610 | typedef struct real_pcre { |
| 611 | pcre_uint32 magic_number; |
| 612 | pcre_uint32 size; /* Total that was malloced */ |
| 613 | pcre_uint32 options; |
| 614 | pcre_uint32 dummy1; /* For future use, maybe */ |
| 615 | |
| 616 | pcre_uint16 top_bracket; |
| 617 | pcre_uint16 top_backref; |
| 618 | pcre_uint16 first_byte; |
| 619 | pcre_uint16 req_byte; |
| 620 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
| 621 | pcre_uint16 name_entry_size; /* Size of any name items */ |
| 622 | pcre_uint16 name_count; /* Number of name items */ |
| 623 | pcre_uint16 dummy2; /* For future use, maybe */ |
| 624 | |
| 625 | const unsigned char *tables; /* Pointer to tables or NULL for std */ |
| 626 | const unsigned char *nullpad; /* NULL padding */ |
| 627 | } real_pcre; |
| 628 | |
| 629 | /* The format of the block used to store data from pcre_study(). The same |
| 630 | remark (see NOTE above) about extending this structure applies. */ |
| 631 | |
| 632 | typedef struct pcre_study_data { |
| 633 | pcre_uint32 size; /* Total that was malloced */ |
| 634 | pcre_uint32 options; |
| 635 | uschar start_bits[32]; |
| 636 | } pcre_study_data; |
| 637 | |
| 638 | /* Structure for passing "static" information around between the functions |
| 639 | doing the compiling, so that they are thread-safe. */ |
| 640 | |
| 641 | typedef struct compile_data { |
| 642 | const uschar *lcc; /* Points to lower casing table */ |
| 643 | const uschar *fcc; /* Points to case-flipping table */ |
| 644 | const uschar *cbits; /* Points to character type table */ |
| 645 | const uschar *ctypes; /* Points to table of type maps */ |
| 646 | const uschar *start_code; /* The start of the compiled code */ |
| 647 | const uschar *start_pattern; /* The start of the pattern */ |
| 648 | uschar *name_table; /* The name/number table */ |
| 649 | int names_found; /* Number of entries so far */ |
| 650 | int name_entry_size; /* Size of each entry */ |
| 651 | int top_backref; /* Maximum back reference */ |
| 652 | unsigned int backref_map; /* Bitmap of low back refs */ |
| 653 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
| 654 | BOOL nopartial; /* Set TRUE if partial won't work */ |
| 655 | } compile_data; |
| 656 | |
| 657 | /* Structure for maintaining a chain of pointers to the currently incomplete |
| 658 | branches, for testing for left recursion. */ |
| 659 | |
| 660 | typedef struct branch_chain { |
| 661 | struct branch_chain *outer; |
| 662 | uschar *current; |
| 663 | } branch_chain; |
| 664 | |
| 665 | /* Structure for items in a linked list that represents an explicit recursive |
| 666 | call within the pattern. */ |
| 667 | |
| 668 | typedef struct recursion_info { |
| 669 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
| 670 | int group_num; /* Number of group that was called */ |
| 671 | const uschar *after_call; /* "Return value": points after the call in the expr */ |
| 672 | const uschar *save_start; /* Old value of md->start_match */ |
| 673 | int *offset_save; /* Pointer to start of saved offsets */ |
| 674 | int saved_max; /* Number of saved offsets */ |
| 675 | } recursion_info; |
| 676 | |
/* When pcre is built in the mode that avoids recursive calls to match(), the
local variables are remembered in a structure on the heap. That structure is
defined in pcre.c, next to match(), so that it is easy to keep it in step with
any change to the local variables. The pointer to the current frame, however,
has to be kept in some "static" place across a longjmp(), so the structure is
declared here to allow a pointer to it in the match_data structure.
NOTE: A "normal" compilation of pcre does not use this. */

struct heapframe;
| 686 | |
| 687 | /* Structure for passing "static" information around between the functions |
| 688 | doing the matching, so that they are thread-safe. */ |
| 689 | |
| 690 | typedef struct match_data { |
| 691 | unsigned long int match_call_count; /* As it says */ |
| 692 | unsigned long int match_limit;/* As it says */ |
| 693 | int *offset_vector; /* Offset vector */ |
| 694 | int offset_end; /* One past the end */ |
| 695 | int offset_max; /* The maximum usable for return data */ |
| 696 | const uschar *lcc; /* Points to lower casing table */ |
| 697 | const uschar *ctypes; /* Points to table of type maps */ |
| 698 | BOOL offset_overflow; /* Set if too many extractions */ |
| 699 | BOOL notbol; /* NOTBOL flag */ |
| 700 | BOOL noteol; /* NOTEOL flag */ |
| 701 | BOOL utf8; /* UTF8 flag */ |
| 702 | BOOL endonly; /* Dollar not before final \n */ |
| 703 | BOOL notempty; /* Empty string match not wanted */ |
| 704 | BOOL partial; /* PARTIAL flag */ |
| 705 | BOOL hitend; /* Hit the end of the subject at some point */ |
| 706 | const uschar *start_code; /* For use when recursing */ |
| 707 | const uschar *start_subject; /* Start of the subject string */ |
| 708 | const uschar *end_subject; /* End of the subject string */ |
| 709 | const uschar *start_match; /* Start of this match attempt */ |
| 710 | const uschar *end_match_ptr; /* Subject position at end match */ |
| 711 | int end_offset_top; /* Highwater mark at end of match */ |
| 712 | int capture_last; /* Most recent capture number */ |
| 713 | int start_offset; /* The start offset value */ |
| 714 | recursion_info *recursive; /* Linked list of recursion data */ |
| 715 | void *callout_data; /* To pass back to callouts */ |
| 716 | struct heapframe *thisframe; /* Used only when compiling for no recursion */ |
| 717 | } match_data; |
| 718 | |
/* Bits that may be set in an entry of the pcre_ctypes table. Each value is a
distinct single bit, so they can be combined and tested with bitwise
operators. */

#define ctype_space    0x01
#define ctype_letter   0x02
#define ctype_digit    0x04
#define ctype_xdigit   0x08
#define ctype_word     0x10   /* alphanumeric or '_' */
#define ctype_meta     0x80   /* regexp meta char, or zero (end of pattern) */
| 727 | |
/* Where each bitmap table starts within pcre_cbits. Every table holds the set
of bits for one class map, and consecutive tables are 32 apart. Some classes
are built by combining several of these tables. */

#define cbit_space       0   /* [:space:] or \s */
#define cbit_xdigit     32   /* [:xdigit:] */
#define cbit_digit      64   /* [:digit:] or \d */
#define cbit_upper      96   /* [:upper:] */
#define cbit_lower     128   /* [:lower:] */
#define cbit_word      160   /* [:word:] or \w */
#define cbit_graph     192   /* [:graph:] */
#define cbit_print     224   /* [:print:] */
#define cbit_punct     256   /* [:punct:] */
#define cbit_cntrl     288   /* [:cntrl:] */
#define cbit_length    320   /* Total length of the cbits table */
| 742 | |
/* Where each table starts, measured from the base tables pointer, followed by
the total length of all the tables. The tables are laid out back to back:
lower-casing (256), case-flipping (256), the class bitmaps (cbit_length), and
finally the character types (256). */

#define lcc_offset      0
#define fcc_offset    256
#define cbits_offset  512
#define ctypes_offset (cbits_offset + cbit_length)   /* just past the bitmaps */
#define tables_length (ctypes_offset + 256)
| 751 | |
| 752 | /* End of internal.h */ |
| 753 | [EOF] |