Location | Tool | Test ID | Function | Issue |
---|---|---|---|---|
/builddir/build/BUILD/Python-2.7.3/Modules/_sre.c:2303:11 | clang-analyzer | Assigned value is garbage or undefined | ||
/builddir/build/BUILD/Python-2.7.3/Modules/_sre.c:2303:11 | clang-analyzer | Assigned value is garbage or undefined | ||
/builddir/build/BUILD/Python-2.7.3/Modules/_sre.c:2316:0 | cppcheck | uninitvar | Uninitialized variable: literal | |
/builddir/build/BUILD/Python-2.7.3/Modules/_sre.c:2316:0 | cppcheck | uninitvar | Uninitialized variable: literal |
1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 *
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 *
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
35 */
36
37 #ifndef SRE_RECURSIVE
38
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41
42 #define PY_SSIZE_T_CLEAN
43
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
46
47 #include "sre.h"
48
49 #include <ctype.h>
50
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
55
56 #define SRE_PY_MODULE "re"
57
58 /* defining this one enables tracing */
59 #undef VERBOSE
60
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
67
68 /* -------------------------------------------------------------------- */
69 /* optional features */
70
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
73
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
76
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
79
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
83
84 /* -------------------------------------------------------------------- */
85
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
96
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
102 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103
104 #if defined(VERBOSE)
105 #define TRACE(v) printf v
106 #else
107 #define TRACE(v)
108 #endif
109
110 /* -------------------------------------------------------------------- */
111 /* search engine state */
112
113 /* default character predicates (run sre_chars.py to regenerate tables) */
114
115 #define SRE_DIGIT_MASK 1
116 #define SRE_SPACE_MASK 2
117 #define SRE_LINEBREAK_MASK 4
118 #define SRE_ALNUM_MASK 8
119 #define SRE_WORD_MASK 16
120
121 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
123 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
124 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
127 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
128 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
129 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
131 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
132 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
133 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
134 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
135 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139 120, 121, 122, 123, 124, 125, 126, 127 };
140
141 #define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143 #define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145 #define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147 #define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149 #define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151
152 static unsigned int sre_lower(unsigned int ch)
153 {
154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155 }
156
157 /* locale-specific character predicates */
158 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
166 static unsigned int sre_lower_locale(unsigned int ch)
167 {
168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169 }
170
171 /* unicode-specific character predicates */
172
173 #if defined(HAVE_UNICODE)
174
175 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180
181 static unsigned int sre_lower_unicode(unsigned int ch)
182 {
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184 }
185
186 #endif
187
188 LOCAL(int)
189 sre_category(SRE_CODE category, unsigned int ch)
190 {
191 switch (category) {
192
193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
209
210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
214
215 #if defined(HAVE_UNICODE)
216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
232 #else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
249 #endif
250 }
251 return 0;
252 }
253
254 /* helpers */
255
256 static void
257 data_stack_dealloc(SRE_STATE* state)
258 {
259 if (state->data_stack) {
260 PyMem_FREE(state->data_stack);
261 state->data_stack = NULL;
262 }
263 state->data_stack_size = state->data_stack_base = 0;
264 }
265
266 static int
267 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268 {
269 Py_ssize_t minsize, cursize;
270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %d\n", cursize));
276 stack = PyMem_REALLOC(state->data_stack, cursize);
277 if (!stack) {
278 data_stack_dealloc(state);
279 return SRE_ERROR_MEMORY;
280 }
281 state->data_stack = (char *)stack;
282 state->data_stack_size = cursize;
283 }
284 return 0;
285 }
286
287 /* generate 8-bit version */
288
289 #define SRE_CHAR unsigned char
290 #define SRE_AT sre_at
291 #define SRE_COUNT sre_count
292 #define SRE_CHARSET sre_charset
293 #define SRE_INFO sre_info
294 #define SRE_MATCH sre_match
295 #define SRE_MATCH_CONTEXT sre_match_context
296 #define SRE_SEARCH sre_search
297 #define SRE_LITERAL_TEMPLATE sre_literal_template
298
299 #if defined(HAVE_UNICODE)
300
301 #define SRE_RECURSIVE
302 #include "_sre.c"
303 #undef SRE_RECURSIVE
304
305 #undef SRE_LITERAL_TEMPLATE
306 #undef SRE_SEARCH
307 #undef SRE_MATCH
308 #undef SRE_MATCH_CONTEXT
309 #undef SRE_INFO
310 #undef SRE_CHARSET
311 #undef SRE_COUNT
312 #undef SRE_AT
313 #undef SRE_CHAR
314
315 /* generate 16-bit unicode version */
316
317 #define SRE_CHAR Py_UNICODE
318 #define SRE_AT sre_uat
319 #define SRE_COUNT sre_ucount
320 #define SRE_CHARSET sre_ucharset
321 #define SRE_INFO sre_uinfo
322 #define SRE_MATCH sre_umatch
323 #define SRE_MATCH_CONTEXT sre_umatch_context
324 #define SRE_SEARCH sre_usearch
325 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
326 #endif
327
328 #endif /* SRE_RECURSIVE */
329
330 /* -------------------------------------------------------------------- */
331 /* String matching engine */
332
333 /* the following section is compiled twice, with different character
334 settings */
335
336 LOCAL(int)
337 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 {
339 /* check if pointer is at given position */
340
341 Py_ssize_t thisp, thatp;
342
343 switch (at) {
344
345 case SRE_AT_BEGINNING:
346 case SRE_AT_BEGINNING_STRING:
347 return ((void*) ptr == state->beginning);
348
349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
351 SRE_IS_LINEBREAK((int) ptr[-1]));
352
353 case SRE_AT_END:
354 return (((void*) (ptr+1) == state->end &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
357
358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
360 SRE_IS_LINEBREAK((int) ptr[0]));
361
362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
368 thatp = ((void*) ptr > state->beginning) ?
369 SRE_IS_WORD((int) ptr[-1]) : 0;
370 thisp = ((void*) ptr < state->end) ?
371 SRE_IS_WORD((int) ptr[0]) : 0;
372 return thisp != thatp;
373
374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
377 thatp = ((void*) ptr > state->beginning) ?
378 SRE_IS_WORD((int) ptr[-1]) : 0;
379 thisp = ((void*) ptr < state->end) ?
380 SRE_IS_WORD((int) ptr[0]) : 0;
381 return thisp == thatp;
382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
386 thatp = ((void*) ptr > state->beginning) ?
387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388 thisp = ((void*) ptr < state->end) ?
389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390 return thisp != thatp;
391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
395 thatp = ((void*) ptr > state->beginning) ?
396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397 thisp = ((void*) ptr < state->end) ?
398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399 return thisp == thatp;
400
401 #if defined(HAVE_UNICODE)
402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
405 thatp = ((void*) ptr > state->beginning) ?
406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407 thisp = ((void*) ptr < state->end) ?
408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409 return thisp != thatp;
410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
414 thatp = ((void*) ptr > state->beginning) ?
415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416 thisp = ((void*) ptr < state->end) ?
417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418 return thisp == thatp;
419 #endif
420
421 }
422
423 return 0;
424 }
425
426 LOCAL(int)
427 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 {
429 /* check if character is a member of the given set */
430
431 int ok = 1;
432
433 for (;;) {
434 switch (*set++) {
435
436 case SRE_OP_FAILURE:
437 return !ok;
438
439 case SRE_OP_LITERAL:
440 /* <LITERAL> <code> */
441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
445
446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
449 return ok;
450 set += 1;
451 break;
452
453 case SRE_OP_CHARSET:
454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
459 }
460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
462 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
463 return ok;
464 set += 8;
465 }
466 break;
467
468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
482 Py_ssize_t count, block;
483 count = *(set++);
484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
500 if (block >=0 &&
501 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
502 return ok;
503 set += count*8;
504 }
505 break;
506 }
507
508 default:
509 /* internal error -- there's not much we can do about it
510 here, so let's just pretend it didn't match... */
511 return 0;
512 }
513 }
514 }
515
516 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517
518 LOCAL(Py_ssize_t)
519 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520 {
521 SRE_CODE chr;
522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
524 Py_ssize_t i;
525
526 /* adjust end */
527 if (maxcount < end - ptr && maxcount != 65535)
528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
547 /* repeated dot wildcard. skip to the end of the target
548 string, and backtrack from there */
549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
576
577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
585 default:
586 /* repeated single character pattern */
587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588 while ((SRE_CHAR*) state->ptr < end) {
589 i = SRE_MATCH(state, pattern);
590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
600 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
601 return ptr - (SRE_CHAR*) state->ptr;
602 }
603
604 #if 0 /* not used in this release */
605 LOCAL(int)
606 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607 {
608 /* check if an SRE_OP_INFO block matches at the current position.
609 returns the number of SRE_CODE objects to skip if successful, 0
610 if no match */
611
612 SRE_CHAR* end = state->end;
613 SRE_CHAR* ptr = state->ptr;
614 Py_ssize_t i;
615
616 /* check minimal length */
617 if (pattern[3] && (end - ptr) < pattern[3])
618 return 0;
619
620 /* check known prefix */
621 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
622 /* <length> <skip> <prefix data> <overlap data> */
623 for (i = 0; i < pattern[5]; i++)
624 if ((SRE_CODE) ptr[i] != pattern[7 + i])
625 return 0;
626 return pattern[0] + 2 * pattern[6];
627 }
628 return pattern[0];
629 }
630 #endif
631
632 /* The macros below should be used to protect recursive SRE_MATCH()
633 * calls that *failed* and do *not* return immediately (IOW, those
634 * that will backtrack). Explaining:
635 *
636 * - Recursive SRE_MATCH() returned true: that's usually a success
637 * (besides atypical cases like ASSERT_NOT), therefore there's no
638 * reason to restore lastmark;
639 *
640 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
641 * is returning to the caller: If the current SRE_MATCH() is the
642 * top function of the recursion, returning false will be a matching
643 * failure, and it doesn't matter where lastmark is pointing to.
644 * If it's *not* the top function, it will be a recursive SRE_MATCH()
645 * failure by itself, and the calling SRE_MATCH() will have to deal
646 * with the failure by the same rules explained here (it will restore
647 * lastmark by itself if necessary);
648 *
649 * - Recursive SRE_MATCH() returned false, and will continue the
650 * outside 'for' loop: must be protected when breaking, since the next
651 * OP could potentially depend on lastmark;
652 *
653 * - Recursive SRE_MATCH() returned false, and will be called again
654 * inside a local for/while loop: must be protected between each
655 * loop iteration, since the recursive SRE_MATCH() could do anything,
656 * and could potentially depend on lastmark.
657 *
658 * For more information, check the discussion at SF patch #712900.
659 */
660 #define LASTMARK_SAVE() \
661 do { \
662 ctx->lastmark = state->lastmark; \
663 ctx->lastindex = state->lastindex; \
664 } while (0)
665 #define LASTMARK_RESTORE() \
666 do { \
667 state->lastmark = ctx->lastmark; \
668 state->lastindex = ctx->lastindex; \
669 } while (0)
670
671 #define RETURN_ERROR(i) do { return i; } while(0)
672 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
673 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674
675 #define RETURN_ON_ERROR(i) \
676 do { if (i < 0) RETURN_ERROR(i); } while (0)
677 #define RETURN_ON_SUCCESS(i) \
678 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
679 #define RETURN_ON_FAILURE(i) \
680 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681
682 #define SFY(x) #x
683
684 #define DATA_STACK_ALLOC(state, type, ptr) \
685 do { \
686 alloc_pos = state->data_stack_base; \
687 TRACE(("allocating %s in %d (%d)\n", \
688 SFY(type), alloc_pos, sizeof(type))); \
689 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
690 int j = data_stack_grow(state, sizeof(type)); \
691 if (j < 0) return j; \
692 if (ctx_pos != -1) \
693 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 } \
695 ptr = (type*)(state->data_stack+alloc_pos); \
696 state->data_stack_base += sizeof(type); \
697 } while (0)
698
699 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
700 do { \
701 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
702 ptr = (type*)(state->data_stack+pos); \
703 } while (0)
704
705 #define DATA_STACK_PUSH(state, data, size) \
706 do { \
707 TRACE(("copy data in %p to %d (%d)\n", \
708 data, state->data_stack_base, size)); \
709 if (state->data_stack_size < state->data_stack_base+size) { \
710 int j = data_stack_grow(state, size); \
711 if (j < 0) return j; \
712 if (ctx_pos != -1) \
713 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 } \
715 memcpy(state->data_stack+state->data_stack_base, data, size); \
716 state->data_stack_base += size; \
717 } while (0)
718
719 #define DATA_STACK_POP(state, data, size, discard) \
720 do { \
721 TRACE(("copy data to %p from %d (%d)\n", \
722 data, state->data_stack_base-size, size)); \
723 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
724 if (discard) \
725 state->data_stack_base -= size; \
726 } while (0)
727
728 #define DATA_STACK_POP_DISCARD(state, size) \
729 do { \
730 TRACE(("discard data from %d (%d)\n", \
731 state->data_stack_base-size, size)); \
732 state->data_stack_base -= size; \
733 } while(0)
734
735 #define DATA_PUSH(x) \
736 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
737 #define DATA_POP(x) \
738 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
739 #define DATA_POP_DISCARD(x) \
740 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
741 #define DATA_ALLOC(t,p) \
742 DATA_STACK_ALLOC(state, t, p)
743 #define DATA_LOOKUP_AT(t,p,pos) \
744 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745
746 #define MARK_PUSH(lastmark) \
747 do if (lastmark > 0) { \
748 i = lastmark; /* ctx->lastmark may change if reallocated */ \
749 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
750 } while (0)
751 #define MARK_POP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
754 } while (0)
755 #define MARK_POP_KEEP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
758 } while (0)
759 #define MARK_POP_DISCARD(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
762 } while (0)
763
764 #define JUMP_NONE 0
765 #define JUMP_MAX_UNTIL_1 1
766 #define JUMP_MAX_UNTIL_2 2
767 #define JUMP_MAX_UNTIL_3 3
768 #define JUMP_MIN_UNTIL_1 4
769 #define JUMP_MIN_UNTIL_2 5
770 #define JUMP_MIN_UNTIL_3 6
771 #define JUMP_REPEAT 7
772 #define JUMP_REPEAT_ONE_1 8
773 #define JUMP_REPEAT_ONE_2 9
774 #define JUMP_MIN_REPEAT_ONE 10
775 #define JUMP_BRANCH 11
776 #define JUMP_ASSERT 12
777 #define JUMP_ASSERT_NOT 13
778
779 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
780 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
781 nextctx->last_ctx_pos = ctx_pos; \
782 nextctx->jump = jumpvalue; \
783 nextctx->pattern = nextpattern; \
784 ctx_pos = alloc_pos; \
785 ctx = nextctx; \
786 goto entrance; \
787 jumplabel: \
788 while (0) /* gcc doesn't like labels at end of scopes */ \
789
790 typedef struct {
791 Py_ssize_t last_ctx_pos;
792 Py_ssize_t jump;
793 SRE_CHAR* ptr;
794 SRE_CODE* pattern;
795 Py_ssize_t count;
796 Py_ssize_t lastmark;
797 Py_ssize_t lastindex;
798 union {
799 SRE_CODE chr;
800 SRE_REPEAT* rep;
801 } u;
802 } SRE_MATCH_CONTEXT;
803
804 /* check if string matches the given pattern. returns <0 for
805 error, 0 for failure, and 1 for success */
806 LOCAL(Py_ssize_t)
807 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
808 {
809 SRE_CHAR* end = (SRE_CHAR *)state->end;
810 Py_ssize_t alloc_pos, ctx_pos = -1;
811 Py_ssize_t i, ret = 0;
812 Py_ssize_t jump;
813 unsigned int sigcount=0;
814
815 SRE_MATCH_CONTEXT* ctx;
816 SRE_MATCH_CONTEXT* nextctx;
817
818 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
819
820 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
821 ctx->last_ctx_pos = -1;
822 ctx->jump = JUMP_NONE;
823 ctx->pattern = pattern;
824 ctx_pos = alloc_pos;
825
826 entrance:
827
828 ctx->ptr = (SRE_CHAR *)state->ptr;
829
830 if (ctx->pattern[0] == SRE_OP_INFO) {
831 /* optimization info block */
832 /* <INFO> <1=skip> <2=flags> <3=min> ... */
833 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
834 TRACE(("reject (got %d chars, need %d)\n",
835 (end - ctx->ptr), ctx->pattern[3]));
836 RETURN_FAILURE;
837 }
838 ctx->pattern += ctx->pattern[1] + 1;
839 }
840
841 for (;;) {
842 ++sigcount;
843 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
844 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
845
846 switch (*ctx->pattern++) {
847
848 case SRE_OP_MARK:
849 /* set mark */
850 /* <MARK> <gid> */
851 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
852 ctx->ptr, ctx->pattern[0]));
853 i = ctx->pattern[0];
854 if (i & 1)
855 state->lastindex = i/2 + 1;
856 if (i > state->lastmark) {
857 /* state->lastmark is the highest valid index in the
858 state->mark array. If it is increased by more than 1,
859 the intervening marks must be set to NULL to signal
860 that these marks have not been encountered. */
861 Py_ssize_t j = state->lastmark + 1;
862 while (j < i)
863 state->mark[j++] = NULL;
864 state->lastmark = i;
865 }
866 state->mark[i] = ctx->ptr;
867 ctx->pattern++;
868 break;
869
870 case SRE_OP_LITERAL:
871 /* match literal string */
872 /* <LITERAL> <code> */
873 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
874 ctx->ptr, *ctx->pattern));
875 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
876 RETURN_FAILURE;
877 ctx->pattern++;
878 ctx->ptr++;
879 break;
880
881 case SRE_OP_NOT_LITERAL:
882 /* match anything that is not literal character */
883 /* <NOT_LITERAL> <code> */
884 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
885 ctx->ptr, *ctx->pattern));
886 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
887 RETURN_FAILURE;
888 ctx->pattern++;
889 ctx->ptr++;
890 break;
891
892 case SRE_OP_SUCCESS:
893 /* end of pattern */
894 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
895 state->ptr = ctx->ptr;
896 RETURN_SUCCESS;
897
898 case SRE_OP_AT:
899 /* match at given position */
900 /* <AT> <code> */
901 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
902 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
903 RETURN_FAILURE;
904 ctx->pattern++;
905 break;
906
907 case SRE_OP_CATEGORY:
908 /* match at given category */
909 /* <CATEGORY> <code> */
910 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
911 ctx->ptr, *ctx->pattern));
912 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
913 RETURN_FAILURE;
914 ctx->pattern++;
915 ctx->ptr++;
916 break;
917
918 case SRE_OP_ANY:
919 /* match anything (except a newline) */
920 /* <ANY> */
921 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
923 RETURN_FAILURE;
924 ctx->ptr++;
925 break;
926
927 case SRE_OP_ANY_ALL:
928 /* match anything */
929 /* <ANY_ALL> */
930 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end)
932 RETURN_FAILURE;
933 ctx->ptr++;
934 break;
935
936 case SRE_OP_IN:
937 /* match set member (or non_member) */
938 /* <IN> <skip> <set> */
939 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
941 RETURN_FAILURE;
942 ctx->pattern += ctx->pattern[0];
943 ctx->ptr++;
944 break;
945
946 case SRE_OP_LITERAL_IGNORE:
947 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
948 ctx->pattern, ctx->ptr, ctx->pattern[0]));
949 if (ctx->ptr >= end ||
950 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
951 RETURN_FAILURE;
952 ctx->pattern++;
953 ctx->ptr++;
954 break;
955
956 case SRE_OP_NOT_LITERAL_IGNORE:
957 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
958 ctx->pattern, ctx->ptr, *ctx->pattern));
959 if (ctx->ptr >= end ||
960 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
961 RETURN_FAILURE;
962 ctx->pattern++;
963 ctx->ptr++;
964 break;
965
966 case SRE_OP_IN_IGNORE:
967 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
968 if (ctx->ptr >= end
969 || !SRE_CHARSET(ctx->pattern+1,
970 (SRE_CODE)state->lower(*ctx->ptr)))
971 RETURN_FAILURE;
972 ctx->pattern += ctx->pattern[0];
973 ctx->ptr++;
974 break;
975
976 case SRE_OP_JUMP:
977 case SRE_OP_INFO:
978 /* jump forward */
979 /* <JUMP> <offset> */
980 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
981 ctx->ptr, ctx->pattern[0]));
982 ctx->pattern += ctx->pattern[0];
983 break;
984
985 case SRE_OP_BRANCH:
986 /* alternation */
987 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
988 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
989 LASTMARK_SAVE();
990 ctx->u.rep = state->repeat;
991 if (ctx->u.rep)
992 MARK_PUSH(ctx->lastmark);
993 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
994 if (ctx->pattern[1] == SRE_OP_LITERAL &&
995 (ctx->ptr >= end ||
996 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
997 continue;
998 if (ctx->pattern[1] == SRE_OP_IN &&
999 (ctx->ptr >= end ||
1000 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1001 continue;
1002 state->ptr = ctx->ptr;
1003 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1004 if (ret) {
1005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_ON_ERROR(ret);
1008 RETURN_SUCCESS;
1009 }
1010 if (ctx->u.rep)
1011 MARK_POP_KEEP(ctx->lastmark);
1012 LASTMARK_RESTORE();
1013 }
1014 if (ctx->u.rep)
1015 MARK_POP_DISCARD(ctx->lastmark);
1016 RETURN_FAILURE;
1017
1018 case SRE_OP_REPEAT_ONE:
1019 /* match repeated sequence (maximizing regexp) */
1020
1021 /* this operator only works if the repeated item is
1022 exactly one character wide, and we're not already
1023 collecting backtracking points. for other cases,
1024 use the MAX_REPEAT operator */
1025
1026 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1027
1028 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1029 ctx->pattern[1], ctx->pattern[2]));
1030
1031 if (ctx->ptr + ctx->pattern[1] > end)
1032 RETURN_FAILURE; /* cannot match */
1033
1034 state->ptr = ctx->ptr;
1035
1036 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1037 RETURN_ON_ERROR(ret);
1038 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1039 ctx->count = ret;
1040 ctx->ptr += ctx->count;
1041
1042 /* when we arrive here, count contains the number of
1043 matches, and ctx->ptr points to the tail of the target
1044 string. check if the rest of the pattern matches,
1045 and backtrack if not. */
1046
1047 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1048 RETURN_FAILURE;
1049
1050 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1051 /* tail is empty. we're finished */
1052 state->ptr = ctx->ptr;
1053 RETURN_SUCCESS;
1054 }
1055
1056 LASTMARK_SAVE();
1057
1058 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1059 /* tail starts with a literal. skip positions where
1060 the rest of the pattern cannot possibly match */
1061 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1062 for (;;) {
1063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1064 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1065 ctx->ptr--;
1066 ctx->count--;
1067 }
1068 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1069 break;
1070 state->ptr = ctx->ptr;
1071 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1072 ctx->pattern+ctx->pattern[0]);
1073 if (ret) {
1074 RETURN_ON_ERROR(ret);
1075 RETURN_SUCCESS;
1076 }
1077
1078 LASTMARK_RESTORE();
1079
1080 ctx->ptr--;
1081 ctx->count--;
1082 }
1083
1084 } else {
1085 /* general case */
1086 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1087 state->ptr = ctx->ptr;
1088 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1089 ctx->pattern+ctx->pattern[0]);
1090 if (ret) {
1091 RETURN_ON_ERROR(ret);
1092 RETURN_SUCCESS;
1093 }
1094 ctx->ptr--;
1095 ctx->count--;
1096 LASTMARK_RESTORE();
1097 }
1098 }
1099 RETURN_FAILURE;
1100
1101 case SRE_OP_MIN_REPEAT_ONE:
1102 /* match repeated sequence (minimizing regexp) */
1103
1104 /* this operator only works if the repeated item is
1105 exactly one character wide, and we're not already
1106 collecting backtracking points. for other cases,
1107 use the MIN_REPEAT operator */
1108
1109 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1110
1111 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1112 ctx->pattern[1], ctx->pattern[2]));
1113
1114 if (ctx->ptr + ctx->pattern[1] > end)
1115 RETURN_FAILURE; /* cannot match */
1116
1117 state->ptr = ctx->ptr;
1118
1119 if (ctx->pattern[1] == 0)
1120 ctx->count = 0;
1121 else {
1122 /* count using pattern min as the maximum */
1123 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1124 RETURN_ON_ERROR(ret);
1125 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1126 if (ret < (Py_ssize_t) ctx->pattern[1])
1127 /* didn't match minimum number of times */
1128 RETURN_FAILURE;
1129 /* advance past minimum matches of repeat */
1130 ctx->count = ret;
1131 ctx->ptr += ctx->count;
1132 }
1133
1134 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1135 /* tail is empty. we're finished */
1136 state->ptr = ctx->ptr;
1137 RETURN_SUCCESS;
1138
1139 } else {
1140 /* general case */
1141 LASTMARK_SAVE();
1142 while ((Py_ssize_t)ctx->pattern[2] == 65535
1143 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1144 state->ptr = ctx->ptr;
1145 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1146 ctx->pattern+ctx->pattern[0]);
1147 if (ret) {
1148 RETURN_ON_ERROR(ret);
1149 RETURN_SUCCESS;
1150 }
1151 state->ptr = ctx->ptr;
1152 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1153 RETURN_ON_ERROR(ret);
1154 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1155 if (ret == 0)
1156 break;
1157 assert(ret == 1);
1158 ctx->ptr++;
1159 ctx->count++;
1160 LASTMARK_RESTORE();
1161 }
1162 }
1163 RETURN_FAILURE;
1164
1165 case SRE_OP_REPEAT:
1166 /* create repeat context. all the hard work is done
1167 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1168 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1169 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1170 ctx->pattern[1], ctx->pattern[2]));
1171
1172 /* install new repeat context */
1173 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1174 if (!ctx->u.rep) {
1175 PyErr_NoMemory();
1176 RETURN_FAILURE;
1177 }
1178 ctx->u.rep->count = -1;
1179 ctx->u.rep->pattern = ctx->pattern;
1180 ctx->u.rep->prev = state->repeat;
1181 ctx->u.rep->last_ptr = NULL;
1182 state->repeat = ctx->u.rep;
1183
1184 state->ptr = ctx->ptr;
1185 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1186 state->repeat = ctx->u.rep->prev;
1187 PyObject_FREE(ctx->u.rep);
1188
1189 if (ret) {
1190 RETURN_ON_ERROR(ret);
1191 RETURN_SUCCESS;
1192 }
1193 RETURN_FAILURE;
1194
1195 case SRE_OP_MAX_UNTIL:
1196 /* maximizing repeat */
1197 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1198
1199 /* FIXME: we probably need to deal with zero-width
1200 matches in here... */
1201
1202 ctx->u.rep = state->repeat;
1203 if (!ctx->u.rep)
1204 RETURN_ERROR(SRE_ERROR_STATE);
1205
1206 state->ptr = ctx->ptr;
1207
1208 ctx->count = ctx->u.rep->count+1;
1209
1210 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1211 ctx->ptr, ctx->count));
1212
1213 if (ctx->count < ctx->u.rep->pattern[1]) {
1214 /* not enough matches */
1215 ctx->u.rep->count = ctx->count;
1216 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1217 ctx->u.rep->pattern+3);
1218 if (ret) {
1219 RETURN_ON_ERROR(ret);
1220 RETURN_SUCCESS;
1221 }
1222 ctx->u.rep->count = ctx->count-1;
1223 state->ptr = ctx->ptr;
1224 RETURN_FAILURE;
1225 }
1226
1227 if ((ctx->count < ctx->u.rep->pattern[2] ||
1228 ctx->u.rep->pattern[2] == 65535) &&
1229 state->ptr != ctx->u.rep->last_ptr) {
1230 /* we may have enough matches, but if we can
1231 match another item, do so */
1232 ctx->u.rep->count = ctx->count;
1233 LASTMARK_SAVE();
1234 MARK_PUSH(ctx->lastmark);
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
1238 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1239 ctx->u.rep->pattern+3);
1240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 MARK_POP_DISCARD(ctx->lastmark);
1243 RETURN_ON_ERROR(ret);
1244 RETURN_SUCCESS;
1245 }
1246 MARK_POP(ctx->lastmark);
1247 LASTMARK_RESTORE();
1248 ctx->u.rep->count = ctx->count-1;
1249 state->ptr = ctx->ptr;
1250 }
1251
1252 /* cannot match more repeated items here. make sure the
1253 tail matches */
1254 state->repeat = ctx->u.rep->prev;
1255 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1256 RETURN_ON_SUCCESS(ret);
1257 state->repeat = ctx->u.rep;
1258 state->ptr = ctx->ptr;
1259 RETURN_FAILURE;
1260
1261 case SRE_OP_MIN_UNTIL:
1262 /* minimizing repeat */
1263 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1264
1265 ctx->u.rep = state->repeat;
1266 if (!ctx->u.rep)
1267 RETURN_ERROR(SRE_ERROR_STATE);
1268
1269 state->ptr = ctx->ptr;
1270
1271 ctx->count = ctx->u.rep->count+1;
1272
1273 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1274 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1275
1276 if (ctx->count < ctx->u.rep->pattern[1]) {
1277 /* not enough matches */
1278 ctx->u.rep->count = ctx->count;
1279 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1280 ctx->u.rep->pattern+3);
1281 if (ret) {
1282 RETURN_ON_ERROR(ret);
1283 RETURN_SUCCESS;
1284 }
1285 ctx->u.rep->count = ctx->count-1;
1286 state->ptr = ctx->ptr;
1287 RETURN_FAILURE;
1288 }
1289
1290 LASTMARK_SAVE();
1291
1292 /* see if the tail matches */
1293 state->repeat = ctx->u.rep->prev;
1294 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1295 if (ret) {
1296 RETURN_ON_ERROR(ret);
1297 RETURN_SUCCESS;
1298 }
1299
1300 state->repeat = ctx->u.rep;
1301 state->ptr = ctx->ptr;
1302
1303 LASTMARK_RESTORE();
1304
1305 if (ctx->count >= ctx->u.rep->pattern[2]
1306 && ctx->u.rep->pattern[2] != 65535)
1307 RETURN_FAILURE;
1308
1309 ctx->u.rep->count = ctx->count;
1310 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1311 ctx->u.rep->pattern+3);
1312 if (ret) {
1313 RETURN_ON_ERROR(ret);
1314 RETURN_SUCCESS;
1315 }
1316 ctx->u.rep->count = ctx->count-1;
1317 state->ptr = ctx->ptr;
1318 RETURN_FAILURE;
1319
1320 case SRE_OP_GROUPREF:
1321 /* match backreference */
1322 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1323 ctx->ptr, ctx->pattern[0]));
1324 i = ctx->pattern[0];
1325 {
1326 Py_ssize_t groupref = i+i;
1327 if (groupref >= state->lastmark) {
1328 RETURN_FAILURE;
1329 } else {
1330 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1331 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1332 if (!p || !e || e < p)
1333 RETURN_FAILURE;
1334 while (p < e) {
1335 if (ctx->ptr >= end || *ctx->ptr != *p)
1336 RETURN_FAILURE;
1337 p++; ctx->ptr++;
1338 }
1339 }
1340 }
1341 ctx->pattern++;
1342 break;
1343
1344 case SRE_OP_GROUPREF_IGNORE:
1345 /* match backreference */
1346 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1347 ctx->ptr, ctx->pattern[0]));
1348 i = ctx->pattern[0];
1349 {
1350 Py_ssize_t groupref = i+i;
1351 if (groupref >= state->lastmark) {
1352 RETURN_FAILURE;
1353 } else {
1354 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1355 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1356 if (!p || !e || e < p)
1357 RETURN_FAILURE;
1358 while (p < e) {
1359 if (ctx->ptr >= end ||
1360 state->lower(*ctx->ptr) != state->lower(*p))
1361 RETURN_FAILURE;
1362 p++; ctx->ptr++;
1363 }
1364 }
1365 }
1366 ctx->pattern++;
1367 break;
1368
1369 case SRE_OP_GROUPREF_EXISTS:
1370 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1371 ctx->ptr, ctx->pattern[0]));
1372 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1373 i = ctx->pattern[0];
1374 {
1375 Py_ssize_t groupref = i+i;
1376 if (groupref >= state->lastmark) {
1377 ctx->pattern += ctx->pattern[1];
1378 break;
1379 } else {
1380 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1381 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1382 if (!p || !e || e < p) {
1383 ctx->pattern += ctx->pattern[1];
1384 break;
1385 }
1386 }
1387 }
1388 ctx->pattern += 2;
1389 break;
1390
1391 case SRE_OP_ASSERT:
1392 /* assert subpattern */
1393 /* <ASSERT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr < state->beginning)
1398 RETURN_FAILURE;
1399 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1400 RETURN_ON_FAILURE(ret);
1401 ctx->pattern += ctx->pattern[0];
1402 break;
1403
1404 case SRE_OP_ASSERT_NOT:
1405 /* assert not subpattern */
1406 /* <ASSERT_NOT> <skip> <back> <pattern> */
1407 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1408 ctx->ptr, ctx->pattern[1]));
1409 state->ptr = ctx->ptr - ctx->pattern[1];
1410 if (state->ptr >= state->beginning) {
1411 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1412 if (ret) {
1413 RETURN_ON_ERROR(ret);
1414 RETURN_FAILURE;
1415 }
1416 }
1417 ctx->pattern += ctx->pattern[0];
1418 break;
1419
1420 case SRE_OP_FAILURE:
1421 /* immediate failure */
1422 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1423 RETURN_FAILURE;
1424
1425 default:
1426 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1427 ctx->pattern[-1]));
1428 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1429 }
1430 }
1431
1432 exit:
1433 ctx_pos = ctx->last_ctx_pos;
1434 jump = ctx->jump;
1435 DATA_POP_DISCARD(ctx);
1436 if (ctx_pos == -1)
1437 return ret;
1438 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1439
1440 switch (jump) {
1441 case JUMP_MAX_UNTIL_2:
1442 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1443 goto jump_max_until_2;
1444 case JUMP_MAX_UNTIL_3:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1446 goto jump_max_until_3;
1447 case JUMP_MIN_UNTIL_2:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1449 goto jump_min_until_2;
1450 case JUMP_MIN_UNTIL_3:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_3;
1453 case JUMP_BRANCH:
1454 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1455 goto jump_branch;
1456 case JUMP_MAX_UNTIL_1:
1457 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_max_until_1;
1459 case JUMP_MIN_UNTIL_1:
1460 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1461 goto jump_min_until_1;
1462 case JUMP_REPEAT:
1463 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1464 goto jump_repeat;
1465 case JUMP_REPEAT_ONE_1:
1466 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1467 goto jump_repeat_one_1;
1468 case JUMP_REPEAT_ONE_2:
1469 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1470 goto jump_repeat_one_2;
1471 case JUMP_MIN_REPEAT_ONE:
1472 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1473 goto jump_min_repeat_one;
1474 case JUMP_ASSERT:
1475 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1476 goto jump_assert;
1477 case JUMP_ASSERT_NOT:
1478 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1479 goto jump_assert_not;
1480 case JUMP_NONE:
1481 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1482 break;
1483 }
1484
1485 return ret; /* should never get here */
1486 }
1487
1488 LOCAL(Py_ssize_t)
1489 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1490 {
1491 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1492 SRE_CHAR* end = (SRE_CHAR *)state->end;
1493 Py_ssize_t status = 0;
1494 Py_ssize_t prefix_len = 0;
1495 Py_ssize_t prefix_skip = 0;
1496 SRE_CODE* prefix = NULL;
1497 SRE_CODE* charset = NULL;
1498 SRE_CODE* overlap = NULL;
1499 int flags = 0;
1500
1501 if (pattern[0] == SRE_OP_INFO) {
1502 /* optimization info block */
1503 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1504
1505 flags = pattern[2];
1506
1507 if (pattern[3] > 1) {
1508 /* adjust end point (but make sure we leave at least one
1509 character in there, so literal search will work) */
1510 end -= pattern[3]-1;
1511 if (end <= ptr)
1512 end = ptr+1;
1513 }
1514
1515 if (flags & SRE_INFO_PREFIX) {
1516 /* pattern starts with a known prefix */
1517 /* <length> <skip> <prefix data> <overlap data> */
1518 prefix_len = pattern[5];
1519 prefix_skip = pattern[6];
1520 prefix = pattern + 7;
1521 overlap = prefix + prefix_len - 1;
1522 } else if (flags & SRE_INFO_CHARSET)
1523 /* pattern starts with a character from a known set */
1524 /* <charset> */
1525 charset = pattern + 5;
1526
1527 pattern += 1 + pattern[1];
1528 }
1529
1530 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1531 TRACE(("charset = %p\n", charset));
1532
1533 #if defined(USE_FAST_SEARCH)
1534 if (prefix_len > 1) {
1535 /* pattern starts with a known prefix. use the overlap
1536 table to skip forward as fast as we possibly can */
1537 Py_ssize_t i = 0;
1538 end = (SRE_CHAR *)state->end;
1539 while (ptr < end) {
1540 for (;;) {
1541 if ((SRE_CODE) ptr[0] != prefix[i]) {
1542 if (!i)
1543 break;
1544 else
1545 i = overlap[i];
1546 } else {
1547 if (++i == prefix_len) {
1548 /* found a potential match */
1549 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1550 state->start = ptr + 1 - prefix_len;
1551 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1552 if (flags & SRE_INFO_LITERAL)
1553 return 1; /* we got all of it */
1554 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1555 if (status != 0)
1556 return status;
1557 /* close but no cigar -- try again */
1558 i = overlap[i];
1559 }
1560 break;
1561 }
1562 }
1563 ptr++;
1564 }
1565 return 0;
1566 }
1567 #endif
1568
1569 if (pattern[0] == SRE_OP_LITERAL) {
1570 /* pattern starts with a literal character. this is used
1571 for short prefixes, and if fast search is disabled */
1572 SRE_CODE chr = pattern[1];
1573 end = (SRE_CHAR *)state->end;
1574 for (;;) {
1575 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1576 ptr++;
1577 if (ptr >= end)
1578 return 0;
1579 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1580 state->start = ptr;
1581 state->ptr = ++ptr;
1582 if (flags & SRE_INFO_LITERAL)
1583 return 1; /* we got all of it */
1584 status = SRE_MATCH(state, pattern + 2);
1585 if (status != 0)
1586 break;
1587 }
1588 } else if (charset) {
1589 /* pattern starts with a character from a known set */
1590 end = (SRE_CHAR *)state->end;
1591 for (;;) {
1592 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1593 ptr++;
1594 if (ptr >= end)
1595 return 0;
1596 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1597 state->start = ptr;
1598 state->ptr = ptr;
1599 status = SRE_MATCH(state, pattern);
1600 if (status != 0)
1601 break;
1602 ptr++;
1603 }
1604 } else
1605 /* general case */
1606 while (ptr <= end) {
1607 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1608 state->start = state->ptr = ptr++;
1609 status = SRE_MATCH(state, pattern);
1610 if (status != 0)
1611 break;
1612 }
1613
1614 return status;
1615 }
1616
1617 LOCAL(int)
1618 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1619 {
1620 /* check if given string is a literal template (i.e. no escapes) */
1621 while (len-- > 0)
1622 if (*ptr++ == '\\')
1623 return 0;
1624 return 1;
1625 }
1626
1627 #if !defined(SRE_RECURSIVE)
1628
1629 /* -------------------------------------------------------------------- */
1630 /* factories and destructors */
1631
1632 /* see sre.h for object declarations */
1633 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1634 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1635
1636 static PyObject *
1637 sre_codesize(PyObject* self, PyObject *unused)
1638 {
1639 return Py_BuildValue("l", sizeof(SRE_CODE));
1640 }
1641
1642 static PyObject *
1643 sre_getlower(PyObject* self, PyObject* args)
1644 {
1645 int character, flags;
1646 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1647 return NULL;
1648 if (flags & SRE_FLAG_LOCALE)
1649 return Py_BuildValue("i", sre_lower_locale(character));
1650 if (flags & SRE_FLAG_UNICODE)
1651 #if defined(HAVE_UNICODE)
1652 return Py_BuildValue("i", sre_lower_unicode(character));
1653 #else
1654 return Py_BuildValue("i", sre_lower_locale(character));
1655 #endif
1656 return Py_BuildValue("i", sre_lower(character));
1657 }
1658
1659 LOCAL(void)
1660 state_reset(SRE_STATE* state)
1661 {
1662 /* FIXME: dynamic! */
1663 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1664
1665 state->lastmark = -1;
1666 state->lastindex = -1;
1667
1668 state->repeat = NULL;
1669
1670 data_stack_dealloc(state);
1671 }
1672
1673 static void*
1674 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1675 {
1676 /* given a python object, return a data pointer, a length (in
1677 characters), and a character size. return NULL if the object
1678 is not a string (or not compatible) */
1679
1680 PyBufferProcs *buffer;
1681 Py_ssize_t size, bytes;
1682 int charsize;
1683 void* ptr;
1684
1685 #if defined(HAVE_UNICODE)
1686 if (PyUnicode_Check(string)) {
1687 /* unicode strings doesn't always support the buffer interface */
1688 ptr = (void*) PyUnicode_AS_DATA(string);
1689 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1690 size = PyUnicode_GET_SIZE(string);
1691 charsize = sizeof(Py_UNICODE);
1692
1693 } else {
1694 #endif
1695
1696 /* get pointer to string buffer */
1697 buffer = Py_TYPE(string)->tp_as_buffer;
1698 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1699 buffer->bf_getsegcount(string, NULL) != 1) {
1700 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1701 return NULL;
1702 }
1703
1704 /* determine buffer size */
1705 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1706 if (bytes < 0) {
1707 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1708 return NULL;
1709 }
1710
1711 /* determine character size */
1712 #if PY_VERSION_HEX >= 0x01060000
1713 size = PyObject_Size(string);
1714 #else
1715 size = PyObject_Length(string);
1716 #endif
1717
1718 if (PyString_Check(string) || bytes == size)
1719 charsize = 1;
1720 #if defined(HAVE_UNICODE)
1721 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1722 charsize = sizeof(Py_UNICODE);
1723 #endif
1724 else {
1725 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1726 return NULL;
1727 }
1728
1729 #if defined(HAVE_UNICODE)
1730 }
1731 #endif
1732
1733 *p_length = size;
1734 *p_charsize = charsize;
1735
1736 return ptr;
1737 }
1738
1739 LOCAL(PyObject*)
1740 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1741 Py_ssize_t start, Py_ssize_t end)
1742 {
1743 /* prepare state object */
1744
1745 Py_ssize_t length;
1746 int charsize;
1747 void* ptr;
1748
1749 memset(state, 0, sizeof(SRE_STATE));
1750
1751 state->lastmark = -1;
1752 state->lastindex = -1;
1753
1754 ptr = getstring(string, &length, &charsize);
1755 if (!ptr)
1756 return NULL;
1757
1758 /* adjust boundaries */
1759 if (start < 0)
1760 start = 0;
1761 else if (start > length)
1762 start = length;
1763
1764 if (end < 0)
1765 end = 0;
1766 else if (end > length)
1767 end = length;
1768
1769 state->charsize = charsize;
1770
1771 state->beginning = ptr;
1772
1773 state->start = (void*) ((char*) ptr + start * state->charsize);
1774 state->end = (void*) ((char*) ptr + end * state->charsize);
1775
1776 Py_INCREF(string);
1777 state->string = string;
1778 state->pos = start;
1779 state->endpos = end;
1780
1781 if (pattern->flags & SRE_FLAG_LOCALE)
1782 state->lower = sre_lower_locale;
1783 else if (pattern->flags & SRE_FLAG_UNICODE)
1784 #if defined(HAVE_UNICODE)
1785 state->lower = sre_lower_unicode;
1786 #else
1787 state->lower = sre_lower_locale;
1788 #endif
1789 else
1790 state->lower = sre_lower;
1791
1792 return string;
1793 }
1794
1795 LOCAL(void)
1796 state_fini(SRE_STATE* state)
1797 {
1798 Py_XDECREF(state->string);
1799 data_stack_dealloc(state);
1800 }
1801
1802 /* calculate offset from start of string */
1803 #define STATE_OFFSET(state, member)\
1804 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1805
1806 LOCAL(PyObject*)
1807 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1808 {
1809 Py_ssize_t i, j;
1810
1811 index = (index - 1) * 2;
1812
1813 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1814 if (empty)
1815 /* want empty string */
1816 i = j = 0;
1817 else {
1818 Py_INCREF(Py_None);
1819 return Py_None;
1820 }
1821 } else {
1822 i = STATE_OFFSET(state, state->mark[index]);
1823 j = STATE_OFFSET(state, state->mark[index+1]);
1824 }
1825
1826 return PySequence_GetSlice(string, i, j);
1827 }
1828
1829 static void
1830 pattern_error(int status)
1831 {
1832 switch (status) {
1833 case SRE_ERROR_RECURSION_LIMIT:
1834 PyErr_SetString(
1835 PyExc_RuntimeError,
1836 "maximum recursion limit exceeded"
1837 );
1838 break;
1839 case SRE_ERROR_MEMORY:
1840 PyErr_NoMemory();
1841 break;
1842 case SRE_ERROR_INTERRUPTED:
1843 /* An exception has already been raised, so let it fly */
1844 break;
1845 default:
1846 /* other error codes indicate compiler/engine bugs */
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "internal error in regular expression engine"
1850 );
1851 }
1852 }
1853
1854 static void
1855 pattern_dealloc(PatternObject* self)
1856 {
1857 if (self->weakreflist != NULL)
1858 PyObject_ClearWeakRefs((PyObject *) self);
1859 Py_XDECREF(self->pattern);
1860 Py_XDECREF(self->groupindex);
1861 Py_XDECREF(self->indexgroup);
1862 PyObject_DEL(self);
1863 }
1864
1865 static PyObject*
1866 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1867 {
1868 SRE_STATE state;
1869 int status;
1870
1871 PyObject* string;
1872 Py_ssize_t start = 0;
1873 Py_ssize_t end = PY_SSIZE_T_MAX;
1874 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1875 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1876 &string, &start, &end))
1877 return NULL;
1878
1879 string = state_init(&state, self, string, start, end);
1880 if (!string)
1881 return NULL;
1882
1883 state.ptr = state.start;
1884
1885 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1886
1887 if (state.charsize == 1) {
1888 status = sre_match(&state, PatternObject_GetCode(self));
1889 } else {
1890 #if defined(HAVE_UNICODE)
1891 status = sre_umatch(&state, PatternObject_GetCode(self));
1892 #endif
1893 }
1894
1895 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1896 if (PyErr_Occurred())
1897 return NULL;
1898
1899 state_fini(&state);
1900
1901 return pattern_new_match(self, &state, status);
1902 }
1903
1904 static PyObject*
1905 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1906 {
1907 SRE_STATE state;
1908 int status;
1909
1910 PyObject* string;
1911 Py_ssize_t start = 0;
1912 Py_ssize_t end = PY_SSIZE_T_MAX;
1913 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1914 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1915 &string, &start, &end))
1916 return NULL;
1917
1918 string = state_init(&state, self, string, start, end);
1919 if (!string)
1920 return NULL;
1921
1922 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1923
1924 if (state.charsize == 1) {
1925 status = sre_search(&state, PatternObject_GetCode(self));
1926 } else {
1927 #if defined(HAVE_UNICODE)
1928 status = sre_usearch(&state, PatternObject_GetCode(self));
1929 #endif
1930 }
1931
1932 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1933
1934 state_fini(&state);
1935
1936 if (PyErr_Occurred())
1937 return NULL;
1938
1939 return pattern_new_match(self, &state, status);
1940 }
1941
1942 static PyObject*
1943 call(char* module, char* function, PyObject* args)
1944 {
1945 PyObject* name;
1946 PyObject* mod;
1947 PyObject* func;
1948 PyObject* result;
1949
1950 if (!args)
1951 return NULL;
1952 name = PyString_FromString(module);
1953 if (!name)
1954 return NULL;
1955 mod = PyImport_Import(name);
1956 Py_DECREF(name);
1957 if (!mod)
1958 return NULL;
1959 func = PyObject_GetAttrString(mod, function);
1960 Py_DECREF(mod);
1961 if (!func)
1962 return NULL;
1963 result = PyObject_CallObject(func, args);
1964 Py_DECREF(func);
1965 Py_DECREF(args);
1966 return result;
1967 }
1968
1969 #ifdef USE_BUILTIN_COPY
1970 static int
1971 deepcopy(PyObject** object, PyObject* memo)
1972 {
1973 PyObject* copy;
1974
1975 copy = call(
1976 "copy", "deepcopy",
1977 PyTuple_Pack(2, *object, memo)
1978 );
1979 if (!copy)
1980 return 0;
1981
1982 Py_DECREF(*object);
1983 *object = copy;
1984
1985 return 1; /* success */
1986 }
1987 #endif
1988
1989 static PyObject*
1990 join_list(PyObject* list, PyObject* string)
1991 {
1992 /* join list elements */
1993
1994 PyObject* joiner;
1995 #if PY_VERSION_HEX >= 0x01060000
1996 PyObject* function;
1997 PyObject* args;
1998 #endif
1999 PyObject* result;
2000
2001 joiner = PySequence_GetSlice(string, 0, 0);
2002 if (!joiner)
2003 return NULL;
2004
2005 if (PyList_GET_SIZE(list) == 0) {
2006 Py_DECREF(list);
2007 return joiner;
2008 }
2009
2010 #if PY_VERSION_HEX >= 0x01060000
2011 function = PyObject_GetAttrString(joiner, "join");
2012 if (!function) {
2013 Py_DECREF(joiner);
2014 return NULL;
2015 }
2016 args = PyTuple_New(1);
2017 if (!args) {
2018 Py_DECREF(function);
2019 Py_DECREF(joiner);
2020 return NULL;
2021 }
2022 PyTuple_SET_ITEM(args, 0, list);
2023 result = PyObject_CallObject(function, args);
2024 Py_DECREF(args); /* also removes list */
2025 Py_DECREF(function);
2026 #else
2027 result = call(
2028 "string", "join",
2029 PyTuple_Pack(2, list, joiner)
2030 );
2031 #endif
2032 Py_DECREF(joiner);
2033
2034 return result;
2035 }
2036
2037 static PyObject*
2038 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2039 {
2040 SRE_STATE state;
2041 PyObject* list;
2042 int status;
2043 Py_ssize_t i, b, e;
2044
2045 PyObject* string;
2046 Py_ssize_t start = 0;
2047 Py_ssize_t end = PY_SSIZE_T_MAX;
2048 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2049 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2050 &string, &start, &end))
2051 return NULL;
2052
2053 string = state_init(&state, self, string, start, end);
2054 if (!string)
2055 return NULL;
2056
2057 list = PyList_New(0);
2058 if (!list) {
2059 state_fini(&state);
2060 return NULL;
2061 }
2062
2063 while (state.start <= state.end) {
2064
2065 PyObject* item;
2066
2067 state_reset(&state);
2068
2069 state.ptr = state.start;
2070
2071 if (state.charsize == 1) {
2072 status = sre_search(&state, PatternObject_GetCode(self));
2073 } else {
2074 #if defined(HAVE_UNICODE)
2075 status = sre_usearch(&state, PatternObject_GetCode(self));
2076 #endif
2077 }
2078
2079 if (PyErr_Occurred())
2080 goto error;
2081
2082 if (status <= 0) {
2083 if (status == 0)
2084 break;
2085 pattern_error(status);
2086 goto error;
2087 }
2088
2089 /* don't bother to build a match object */
2090 switch (self->groups) {
2091 case 0:
2092 b = STATE_OFFSET(&state, state.start);
2093 e = STATE_OFFSET(&state, state.ptr);
2094 item = PySequence_GetSlice(string, b, e);
2095 if (!item)
2096 goto error;
2097 break;
2098 case 1:
2099 item = state_getslice(&state, 1, string, 1);
2100 if (!item)
2101 goto error;
2102 break;
2103 default:
2104 item = PyTuple_New(self->groups);
2105 if (!item)
2106 goto error;
2107 for (i = 0; i < self->groups; i++) {
2108 PyObject* o = state_getslice(&state, i+1, string, 1);
2109 if (!o) {
2110 Py_DECREF(item);
2111 goto error;
2112 }
2113 PyTuple_SET_ITEM(item, i, o);
2114 }
2115 break;
2116 }
2117
2118 status = PyList_Append(list, item);
2119 Py_DECREF(item);
2120 if (status < 0)
2121 goto error;
2122
2123 if (state.ptr == state.start)
2124 state.start = (void*) ((char*) state.ptr + state.charsize);
2125 else
2126 state.start = state.ptr;
2127
2128 }
2129
2130 state_fini(&state);
2131 return list;
2132
2133 error:
2134 Py_DECREF(list);
2135 state_fini(&state);
2136 return NULL;
2137
2138 }
2139
2140 #if PY_VERSION_HEX >= 0x02020000
2141 static PyObject*
2142 pattern_finditer(PatternObject* pattern, PyObject* args)
2143 {
2144 PyObject* scanner;
2145 PyObject* search;
2146 PyObject* iterator;
2147
2148 scanner = pattern_scanner(pattern, args);
2149 if (!scanner)
2150 return NULL;
2151
2152 search = PyObject_GetAttrString(scanner, "search");
2153 Py_DECREF(scanner);
2154 if (!search)
2155 return NULL;
2156
2157 iterator = PyCallIter_New(search, Py_None);
2158 Py_DECREF(search);
2159
2160 return iterator;
2161 }
2162 #endif
2163
2164 static PyObject*
2165 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2166 {
2167 SRE_STATE state;
2168 PyObject* list;
2169 PyObject* item;
2170 int status;
2171 Py_ssize_t n;
2172 Py_ssize_t i;
2173 void* last;
2174
2175 PyObject* string;
2176 Py_ssize_t maxsplit = 0;
2177 static char* kwlist[] = { "source", "maxsplit", NULL };
2178 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2179 &string, &maxsplit))
2180 return NULL;
2181
2182 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2183 if (!string)
2184 return NULL;
2185
2186 list = PyList_New(0);
2187 if (!list) {
2188 state_fini(&state);
2189 return NULL;
2190 }
2191
2192 n = 0;
2193 last = state.start;
2194
2195 while (!maxsplit || n < maxsplit) {
2196
2197 state_reset(&state);
2198
2199 state.ptr = state.start;
2200
2201 if (state.charsize == 1) {
2202 status = sre_search(&state, PatternObject_GetCode(self));
2203 } else {
2204 #if defined(HAVE_UNICODE)
2205 status = sre_usearch(&state, PatternObject_GetCode(self));
2206 #endif
2207 }
2208
2209 if (PyErr_Occurred())
2210 goto error;
2211
2212 if (status <= 0) {
2213 if (status == 0)
2214 break;
2215 pattern_error(status);
2216 goto error;
2217 }
2218
2219 if (state.start == state.ptr) {
2220 if (last == state.end)
2221 break;
2222 /* skip one character */
2223 state.start = (void*) ((char*) state.ptr + state.charsize);
2224 continue;
2225 }
2226
2227 /* get segment before this match */
2228 item = PySequence_GetSlice(
2229 string, STATE_OFFSET(&state, last),
2230 STATE_OFFSET(&state, state.start)
2231 );
2232 if (!item)
2233 goto error;
2234 status = PyList_Append(list, item);
2235 Py_DECREF(item);
2236 if (status < 0)
2237 goto error;
2238
2239 /* add groups (if any) */
2240 for (i = 0; i < self->groups; i++) {
2241 item = state_getslice(&state, i+1, string, 0);
2242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
2248 }
2249
2250 n = n + 1;
2251
2252 last = state.start = state.ptr;
2253
2254 }
2255
2256 /* get segment following last match (even if empty) */
2257 item = PySequence_GetSlice(
2258 string, STATE_OFFSET(&state, last), state.endpos
2259 );
2260 if (!item)
2261 goto error;
2262 status = PyList_Append(list, item);
2263 Py_DECREF(item);
2264 if (status < 0)
2265 goto error;
2266
2267 state_fini(&state);
2268 return list;
2269
2270 error:
2271 Py_DECREF(list);
2272 state_fini(&state);
2273 return NULL;
2274
2275 }
2276
2277 static PyObject*
2278 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2279 Py_ssize_t count, Py_ssize_t subn)
2280 {
2281 SRE_STATE state;
2282 PyObject* list;
2283 PyObject* item;
2284 PyObject* filter;
2285 PyObject* args;
2286 PyObject* match;
2287 void* ptr;
2288 int status;
2289 Py_ssize_t n;
2290 Py_ssize_t i, b, e;
2291 int bint;
2292 int filter_is_callable;
2293
2294 if (PyCallable_Check(ptemplate)) {
2295 /* sub/subn takes either a function or a template */
2296 filter = ptemplate;
2297 Py_INCREF(filter);
2298 filter_is_callable = 1;
2299 } else {
2300 /* if not callable, check if it's a literal string */
2301 int literal;
2302 ptr = getstring(ptemplate, &n, &bint);
2303 b = bint;
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
2304 if (ptr) {
2305 if (b == 1) {
2306 literal = sre_literal_template((unsigned char *)ptr, n);
2307 } else {
2308 #if defined(HAVE_UNICODE)
2309 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2310 #endif
2311 }
2312 } else {
2313 PyErr_Clear();
2314 literal = 0;
2315 }
2316 if (literal) {
(emitted by cppcheck) (emitted by cppcheck) 2317 filter = ptemplate;
2318 Py_INCREF(filter);
2319 filter_is_callable = 0;
2320 } else {
2321 /* not a literal; hand it over to the template compiler */
2322 filter = call(
2323 SRE_PY_MODULE, "_subx",
2324 PyTuple_Pack(2, self, ptemplate)
2325 );
2326 if (!filter)
2327 return NULL;
2328 filter_is_callable = PyCallable_Check(filter);
2329 }
2330 }
2331
2332 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2333 if (!string) {
2334 Py_DECREF(filter);
2335 return NULL;
2336 }
2337
2338 list = PyList_New(0);
2339 if (!list) {
2340 Py_DECREF(filter);
2341 state_fini(&state);
2342 return NULL;
2343 }
2344
2345 n = i = 0;
2346
2347 while (!count || n < count) {
2348
2349 state_reset(&state);
2350
2351 state.ptr = state.start;
2352
2353 if (state.charsize == 1) {
2354 status = sre_search(&state, PatternObject_GetCode(self));
2355 } else {
2356 #if defined(HAVE_UNICODE)
2357 status = sre_usearch(&state, PatternObject_GetCode(self));
2358 #endif
2359 }
2360
2361 if (PyErr_Occurred())
2362 goto error;
2363
2364 if (status <= 0) {
2365 if (status == 0)
2366 break;
2367 pattern_error(status);
2368 goto error;
2369 }
2370
2371 b = STATE_OFFSET(&state, state.start);
2372 e = STATE_OFFSET(&state, state.ptr);
2373
2374 if (i < b) {
2375 /* get segment before this match */
2376 item = PySequence_GetSlice(string, i, b);
2377 if (!item)
2378 goto error;
2379 status = PyList_Append(list, item);
2380 Py_DECREF(item);
2381 if (status < 0)
2382 goto error;
2383
2384 } else if (i == b && i == e && n > 0)
2385 /* ignore empty match on latest position */
2386 goto next;
2387
2388 if (filter_is_callable) {
2389 /* pass match object through filter */
2390 match = pattern_new_match(self, &state, 1);
2391 if (!match)
2392 goto error;
2393 args = PyTuple_Pack(1, match);
2394 if (!args) {
2395 Py_DECREF(match);
2396 goto error;
2397 }
2398 item = PyObject_CallObject(filter, args);
2399 Py_DECREF(args);
2400 Py_DECREF(match);
2401 if (!item)
2402 goto error;
2403 } else {
2404 /* filter is literal string */
2405 item = filter;
2406 Py_INCREF(item);
2407 }
2408
2409 /* add to list */
2410 if (item != Py_None) {
2411 status = PyList_Append(list, item);
2412 Py_DECREF(item);
2413 if (status < 0)
2414 goto error;
2415 }
2416
2417 i = e;
2418 n = n + 1;
2419
2420 next:
2421 /* move on */
2422 if (state.ptr == state.start)
2423 state.start = (void*) ((char*) state.ptr + state.charsize);
2424 else
2425 state.start = state.ptr;
2426
2427 }
2428
2429 /* get segment following last match */
2430 if (i < state.endpos) {
2431 item = PySequence_GetSlice(string, i, state.endpos);
2432 if (!item)
2433 goto error;
2434 status = PyList_Append(list, item);
2435 Py_DECREF(item);
2436 if (status < 0)
2437 goto error;
2438 }
2439
2440 state_fini(&state);
2441
2442 Py_DECREF(filter);
2443
2444 /* convert list to single string (also removes list) */
2445 item = join_list(list, string);
2446
2447 if (!item)
2448 return NULL;
2449
2450 if (subn)
2451 return Py_BuildValue("Ni", item, n);
2452
2453 return item;
2454
2455 error:
2456 Py_DECREF(list);
2457 state_fini(&state);
2458 Py_DECREF(filter);
2459 return NULL;
2460
2461 }
2462
2463 static PyObject*
2464 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2465 {
2466 PyObject* ptemplate;
2467 PyObject* string;
2468 Py_ssize_t count = 0;
2469 static char* kwlist[] = { "repl", "string", "count", NULL };
2470 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2471 &ptemplate, &string, &count))
2472 return NULL;
2473
2474 return pattern_subx(self, ptemplate, string, count, 0);
2475 }
2476
2477 static PyObject*
2478 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2479 {
2480 PyObject* ptemplate;
2481 PyObject* string;
2482 Py_ssize_t count = 0;
2483 static char* kwlist[] = { "repl", "string", "count", NULL };
2484 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2485 &ptemplate, &string, &count))
2486 return NULL;
2487
2488 return pattern_subx(self, ptemplate, string, count, 1);
2489 }
2490
2491 static PyObject*
2492 pattern_copy(PatternObject* self, PyObject *unused)
2493 {
2494 #ifdef USE_BUILTIN_COPY
2495 PatternObject* copy;
2496 int offset;
2497
2498 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2499 if (!copy)
2500 return NULL;
2501
2502 offset = offsetof(PatternObject, groups);
2503
2504 Py_XINCREF(self->groupindex);
2505 Py_XINCREF(self->indexgroup);
2506 Py_XINCREF(self->pattern);
2507
2508 memcpy((char*) copy + offset, (char*) self + offset,
2509 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2510 copy->weakreflist = NULL;
2511
2512 return (PyObject*) copy;
2513 #else
2514 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2515 return NULL;
2516 #endif
2517 }
2518
2519 static PyObject*
2520 pattern_deepcopy(PatternObject* self, PyObject* memo)
2521 {
2522 #ifdef USE_BUILTIN_COPY
2523 PatternObject* copy;
2524
2525 copy = (PatternObject*) pattern_copy(self);
2526 if (!copy)
2527 return NULL;
2528
2529 if (!deepcopy(©->groupindex, memo) ||
2530 !deepcopy(©->indexgroup, memo) ||
2531 !deepcopy(©->pattern, memo)) {
2532 Py_DECREF(copy);
2533 return NULL;
2534 }
2535
2536 #else
2537 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2538 return NULL;
2539 #endif
2540 }
2541
2542 PyDoc_STRVAR(pattern_match_doc,
2543 "match(string[, pos[, endpos]]) --> match object or None.\n\
2544 Matches zero or more characters at the beginning of the string");
2545
2546 PyDoc_STRVAR(pattern_search_doc,
2547 "search(string[, pos[, endpos]]) --> match object or None.\n\
2548 Scan through string looking for a match, and return a corresponding\n\
2549 MatchObject instance. Return None if no position in the string matches.");
2550
2551 PyDoc_STRVAR(pattern_split_doc,
2552 "split(string[, maxsplit = 0]) --> list.\n\
2553 Split string by the occurrences of pattern.");
2554
2555 PyDoc_STRVAR(pattern_findall_doc,
2556 "findall(string[, pos[, endpos]]) --> list.\n\
2557 Return a list of all non-overlapping matches of pattern in string.");
2558
2559 PyDoc_STRVAR(pattern_finditer_doc,
2560 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2561 Return an iterator over all non-overlapping matches for the \n\
2562 RE pattern in string. For each match, the iterator returns a\n\
2563 match object.");
2564
2565 PyDoc_STRVAR(pattern_sub_doc,
2566 "sub(repl, string[, count = 0]) --> newstring\n\
2567 Return the string obtained by replacing the leftmost non-overlapping\n\
2568 occurrences of pattern in string by the replacement repl.");
2569
2570 PyDoc_STRVAR(pattern_subn_doc,
2571 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2572 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2573 the leftmost non-overlapping occurrences of pattern with the\n\
2574 replacement repl.");
2575
2576 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2577
2578 static PyMethodDef pattern_methods[] = {
2579 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2580 pattern_match_doc},
2581 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2582 pattern_search_doc},
2583 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2584 pattern_sub_doc},
2585 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2586 pattern_subn_doc},
2587 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2588 pattern_split_doc},
2589 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2590 pattern_findall_doc},
2591 #if PY_VERSION_HEX >= 0x02020000
2592 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2593 pattern_finditer_doc},
2594 #endif
2595 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2596 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2597 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2598 {NULL, NULL}
2599 };
2600
2601 #define PAT_OFF(x) offsetof(PatternObject, x)
2602 static PyMemberDef pattern_members[] = {
2603 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2604 {"flags", T_INT, PAT_OFF(flags), READONLY},
2605 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2606 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2607 {NULL} /* Sentinel */
2608 };
2609
2610 statichere PyTypeObject Pattern_Type = {
2611 PyObject_HEAD_INIT(NULL)
2612 0, "_" SRE_MODULE ".SRE_Pattern",
2613 sizeof(PatternObject), sizeof(SRE_CODE),
2614 (destructor)pattern_dealloc, /*tp_dealloc*/
2615 0, /* tp_print */
2616 0, /* tp_getattrn */
2617 0, /* tp_setattr */
2618 0, /* tp_compare */
2619 0, /* tp_repr */
2620 0, /* tp_as_number */
2621 0, /* tp_as_sequence */
2622 0, /* tp_as_mapping */
2623 0, /* tp_hash */
2624 0, /* tp_call */
2625 0, /* tp_str */
2626 0, /* tp_getattro */
2627 0, /* tp_setattro */
2628 0, /* tp_as_buffer */
2629 Py_TPFLAGS_DEFAULT, /* tp_flags */
2630 pattern_doc, /* tp_doc */
2631 0, /* tp_traverse */
2632 0, /* tp_clear */
2633 0, /* tp_richcompare */
2634 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2635 0, /* tp_iter */
2636 0, /* tp_iternext */
2637 pattern_methods, /* tp_methods */
2638 pattern_members, /* tp_members */
2639 };
2640
2641 static int _validate(PatternObject *self); /* Forward */
2642
2643 static PyObject *
2644 _compile(PyObject* self_, PyObject* args)
2645 {
2646 /* "compile" pattern descriptor to pattern object */
2647
2648 PatternObject* self;
2649 Py_ssize_t i, n;
2650
2651 PyObject* pattern;
2652 int flags = 0;
2653 PyObject* code;
2654 Py_ssize_t groups = 0;
2655 PyObject* groupindex = NULL;
2656 PyObject* indexgroup = NULL;
2657 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2658 &PyList_Type, &code, &groups,
2659 &groupindex, &indexgroup))
2660 return NULL;
2661
2662 n = PyList_GET_SIZE(code);
2663 /* coverity[ampersand_in_size] */
2664 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2665 if (!self)
2666 return NULL;
2667 self->weakreflist = NULL;
2668 self->pattern = NULL;
2669 self->groupindex = NULL;
2670 self->indexgroup = NULL;
2671
2672 self->codesize = n;
2673
2674 for (i = 0; i < n; i++) {
2675 PyObject *o = PyList_GET_ITEM(code, i);
2676 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2677 : PyLong_AsUnsignedLong(o);
2678 self->code[i] = (SRE_CODE) value;
2679 if ((unsigned long) self->code[i] != value) {
2680 PyErr_SetString(PyExc_OverflowError,
2681 "regular expression code size limit exceeded");
2682 break;
2683 }
2684 }
2685
2686 if (PyErr_Occurred()) {
2687 Py_DECREF(self);
2688 return NULL;
2689 }
2690
2691 Py_INCREF(pattern);
2692 self->pattern = pattern;
2693
2694 self->flags = flags;
2695
2696 self->groups = groups;
2697
2698 Py_XINCREF(groupindex);
2699 self->groupindex = groupindex;
2700
2701 Py_XINCREF(indexgroup);
2702 self->indexgroup = indexgroup;
2703
2704 self->weakreflist = NULL;
2705
2706 if (!_validate(self)) {
2707 Py_DECREF(self);
2708 return NULL;
2709 }
2710
2711 return (PyObject*) self;
2712 }
2713
2714 /* -------------------------------------------------------------------- */
2715 /* Code validation */
2716
2717 /* To learn more about this code, have a look at the _compile() function in
2718 Lib/sre_compile.py. The validation functions below checks the code array
2719 for conformance with the code patterns generated there.
2720
2721 The nice thing about the generated code is that it is position-independent:
2722 all jumps are relative jumps forward. Also, jumps don't cross each other:
2723 the target of a later jump is always earlier than the target of an earlier
2724 jump. IOW, this is okay:
2725
2726 J---------J-------T--------T
2727 \ \_____/ /
2728 \______________________/
2729
2730 but this is not:
2731
2732 J---------J-------T--------T
2733 \_________\_____/ /
2734 \____________/
2735
2736 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2737 bytes wide (the latter if Python is compiled for "wide" unicode support).
2738 */
2739
2740 /* Defining this one enables tracing of the validator */
2741 #undef VVERBOSE
2742
2743 /* Trace macro for the validator */
2744 #if defined(VVERBOSE)
2745 #define VTRACE(v) printf v
2746 #else
2747 #define VTRACE(v) do {} while(0) /* do nothing */
2748 #endif
2749
2750 /* Report failure */
2751 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2752
2753 /* Extract opcode, argument, or skip count from code array */
2754 #define GET_OP \
2755 do { \
2756 VTRACE(("%p: ", code)); \
2757 if (code >= end) FAIL; \
2758 op = *code++; \
2759 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2760 } while (0)
2761 #define GET_ARG \
2762 do { \
2763 VTRACE(("%p= ", code)); \
2764 if (code >= end) FAIL; \
2765 arg = *code++; \
2766 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2767 } while (0)
2768 #define GET_SKIP_ADJ(adj) \
2769 do { \
2770 VTRACE(("%p= ", code)); \
2771 if (code >= end) FAIL; \
2772 skip = *code; \
2773 VTRACE(("%lu (skip to %p)\n", \
2774 (unsigned long)skip, code+skip)); \
2775 if (code+skip-adj < code || code+skip-adj > end)\
2776 FAIL; \
2777 code++; \
2778 } while (0)
2779 #define GET_SKIP GET_SKIP_ADJ(0)
2780
2781 static int
2782 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2783 {
2784 /* Some variables are manipulated by the macros above */
2785 SRE_CODE op;
2786 SRE_CODE arg;
2787 SRE_CODE offset;
2788 int i;
2789
2790 while (code < end) {
2791 GET_OP;
2792 switch (op) {
2793
2794 case SRE_OP_NEGATE:
2795 break;
2796
2797 case SRE_OP_LITERAL:
2798 GET_ARG;
2799 break;
2800
2801 case SRE_OP_RANGE:
2802 GET_ARG;
2803 GET_ARG;
2804 break;
2805
2806 case SRE_OP_CHARSET:
2807 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2808 if (code+offset < code || code+offset > end)
2809 FAIL;
2810 code += offset;
2811 break;
2812
2813 case SRE_OP_BIGCHARSET:
2814 GET_ARG; /* Number of blocks */
2815 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2816 if (code+offset < code || code+offset > end)
2817 FAIL;
2818 /* Make sure that each byte points to a valid block */
2819 for (i = 0; i < 256; i++) {
2820 if (((unsigned char *)code)[i] >= arg)
2821 FAIL;
2822 }
2823 code += offset;
2824 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2825 if (code+offset < code || code+offset > end)
2826 FAIL;
2827 code += offset;
2828 break;
2829
2830 case SRE_OP_CATEGORY:
2831 GET_ARG;
2832 switch (arg) {
2833 case SRE_CATEGORY_DIGIT:
2834 case SRE_CATEGORY_NOT_DIGIT:
2835 case SRE_CATEGORY_SPACE:
2836 case SRE_CATEGORY_NOT_SPACE:
2837 case SRE_CATEGORY_WORD:
2838 case SRE_CATEGORY_NOT_WORD:
2839 case SRE_CATEGORY_LINEBREAK:
2840 case SRE_CATEGORY_NOT_LINEBREAK:
2841 case SRE_CATEGORY_LOC_WORD:
2842 case SRE_CATEGORY_LOC_NOT_WORD:
2843 case SRE_CATEGORY_UNI_DIGIT:
2844 case SRE_CATEGORY_UNI_NOT_DIGIT:
2845 case SRE_CATEGORY_UNI_SPACE:
2846 case SRE_CATEGORY_UNI_NOT_SPACE:
2847 case SRE_CATEGORY_UNI_WORD:
2848 case SRE_CATEGORY_UNI_NOT_WORD:
2849 case SRE_CATEGORY_UNI_LINEBREAK:
2850 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2851 break;
2852 default:
2853 FAIL;
2854 }
2855 break;
2856
2857 default:
2858 FAIL;
2859
2860 }
2861 }
2862
2863 return 1;
2864 }
2865
2866 static int
2867 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2868 {
2869 /* Some variables are manipulated by the macros above */
2870 SRE_CODE op;
2871 SRE_CODE arg;
2872 SRE_CODE skip;
2873
2874 VTRACE(("code=%p, end=%p\n", code, end));
2875
2876 if (code > end)
2877 FAIL;
2878
2879 while (code < end) {
2880 GET_OP;
2881 switch (op) {
2882
2883 case SRE_OP_MARK:
2884 /* We don't check whether marks are properly nested; the
2885 sre_match() code is robust even if they don't, and the worst
2886 you can get is nonsensical match results. */
2887 GET_ARG;
2888 if (arg > 2*groups+1) {
2889 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2890 FAIL;
2891 }
2892 break;
2893
2894 case SRE_OP_LITERAL:
2895 case SRE_OP_NOT_LITERAL:
2896 case SRE_OP_LITERAL_IGNORE:
2897 case SRE_OP_NOT_LITERAL_IGNORE:
2898 GET_ARG;
2899 /* The arg is just a character, nothing to check */
2900 break;
2901
2902 case SRE_OP_SUCCESS:
2903 case SRE_OP_FAILURE:
2904 /* Nothing to check; these normally end the matching process */
2905 break;
2906
2907 case SRE_OP_AT:
2908 GET_ARG;
2909 switch (arg) {
2910 case SRE_AT_BEGINNING:
2911 case SRE_AT_BEGINNING_STRING:
2912 case SRE_AT_BEGINNING_LINE:
2913 case SRE_AT_END:
2914 case SRE_AT_END_LINE:
2915 case SRE_AT_END_STRING:
2916 case SRE_AT_BOUNDARY:
2917 case SRE_AT_NON_BOUNDARY:
2918 case SRE_AT_LOC_BOUNDARY:
2919 case SRE_AT_LOC_NON_BOUNDARY:
2920 case SRE_AT_UNI_BOUNDARY:
2921 case SRE_AT_UNI_NON_BOUNDARY:
2922 break;
2923 default:
2924 FAIL;
2925 }
2926 break;
2927
2928 case SRE_OP_ANY:
2929 case SRE_OP_ANY_ALL:
2930 /* These have no operands */
2931 break;
2932
2933 case SRE_OP_IN:
2934 case SRE_OP_IN_IGNORE:
2935 GET_SKIP;
2936 /* Stop 1 before the end; we check the FAILURE below */
2937 if (!_validate_charset(code, code+skip-2))
2938 FAIL;
2939 if (code[skip-2] != SRE_OP_FAILURE)
2940 FAIL;
2941 code += skip-1;
2942 break;
2943
2944 case SRE_OP_INFO:
2945 {
2946 /* A minimal info field is
2947 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2948 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2949 more follows. */
2950 SRE_CODE flags, i;
2951 SRE_CODE *newcode;
2952 GET_SKIP;
2953 newcode = code+skip-1;
2954 GET_ARG; flags = arg;
2955 GET_ARG; /* min */
2956 GET_ARG; /* max */
2957 /* Check that only valid flags are present */
2958 if ((flags & ~(SRE_INFO_PREFIX |
2959 SRE_INFO_LITERAL |
2960 SRE_INFO_CHARSET)) != 0)
2961 FAIL;
2962 /* PREFIX and CHARSET are mutually exclusive */
2963 if ((flags & SRE_INFO_PREFIX) &&
2964 (flags & SRE_INFO_CHARSET))
2965 FAIL;
2966 /* LITERAL implies PREFIX */
2967 if ((flags & SRE_INFO_LITERAL) &&
2968 !(flags & SRE_INFO_PREFIX))
2969 FAIL;
2970 /* Validate the prefix */
2971 if (flags & SRE_INFO_PREFIX) {
2972 SRE_CODE prefix_len;
2973 GET_ARG; prefix_len = arg;
2974 GET_ARG; /* prefix skip */
2975 /* Here comes the prefix string */
2976 if (code+prefix_len < code || code+prefix_len > newcode)
2977 FAIL;
2978 code += prefix_len;
2979 /* And here comes the overlap table */
2980 if (code+prefix_len < code || code+prefix_len > newcode)
2981 FAIL;
2982 /* Each overlap value should be < prefix_len */
2983 for (i = 0; i < prefix_len; i++) {
2984 if (code[i] >= prefix_len)
2985 FAIL;
2986 }
2987 code += prefix_len;
2988 }
2989 /* Validate the charset */
2990 if (flags & SRE_INFO_CHARSET) {
2991 if (!_validate_charset(code, newcode-1))
2992 FAIL;
2993 if (newcode[-1] != SRE_OP_FAILURE)
2994 FAIL;
2995 code = newcode;
2996 }
2997 else if (code != newcode) {
2998 VTRACE(("code=%p, newcode=%p\n", code, newcode));
2999 FAIL;
3000 }
3001 }
3002 break;
3003
3004 case SRE_OP_BRANCH:
3005 {
3006 SRE_CODE *target = NULL;
3007 for (;;) {
3008 GET_SKIP;
3009 if (skip == 0)
3010 break;
3011 /* Stop 2 before the end; we check the JUMP below */
3012 if (!_validate_inner(code, code+skip-3, groups))
3013 FAIL;
3014 code += skip-3;
3015 /* Check that it ends with a JUMP, and that each JUMP
3016 has the same target */
3017 GET_OP;
3018 if (op != SRE_OP_JUMP)
3019 FAIL;
3020 GET_SKIP;
3021 if (target == NULL)
3022 target = code+skip-1;
3023 else if (code+skip-1 != target)
3024 FAIL;
3025 }
3026 }
3027 break;
3028
3029 case SRE_OP_REPEAT_ONE:
3030 case SRE_OP_MIN_REPEAT_ONE:
3031 {
3032 SRE_CODE min, max;
3033 GET_SKIP;
3034 GET_ARG; min = arg;
3035 GET_ARG; max = arg;
3036 if (min > max)
3037 FAIL;
3038 #ifdef Py_UNICODE_WIDE
3039 if (max > 65535)
3040 FAIL;
3041 #endif
3042 if (!_validate_inner(code, code+skip-4, groups))
3043 FAIL;
3044 code += skip-4;
3045 GET_OP;
3046 if (op != SRE_OP_SUCCESS)
3047 FAIL;
3048 }
3049 break;
3050
3051 case SRE_OP_REPEAT:
3052 {
3053 SRE_CODE min, max;
3054 GET_SKIP;
3055 GET_ARG; min = arg;
3056 GET_ARG; max = arg;
3057 if (min > max)
3058 FAIL;
3059 #ifdef Py_UNICODE_WIDE
3060 if (max > 65535)
3061 FAIL;
3062 #endif
3063 if (!_validate_inner(code, code+skip-3, groups))
3064 FAIL;
3065 code += skip-3;
3066 GET_OP;
3067 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3068 FAIL;
3069 }
3070 break;
3071
3072 case SRE_OP_GROUPREF:
3073 case SRE_OP_GROUPREF_IGNORE:
3074 GET_ARG;
3075 if (arg >= groups)
3076 FAIL;
3077 break;
3078
3079 case SRE_OP_GROUPREF_EXISTS:
3080 /* The regex syntax for this is: '(?(group)then|else)', where
3081 'group' is either an integer group number or a group name,
3082 'then' and 'else' are sub-regexes, and 'else' is optional. */
3083 GET_ARG;
3084 if (arg >= groups)
3085 FAIL;
3086 GET_SKIP_ADJ(1);
3087 code--; /* The skip is relative to the first arg! */
3088 /* There are two possibilities here: if there is both a 'then'
3089 part and an 'else' part, the generated code looks like:
3090
3091 GROUPREF_EXISTS
3092 <group>
3093 <skipyes>
3094 ...then part...
3095 JUMP
3096 <skipno>
3097 (<skipyes> jumps here)
3098 ...else part...
3099 (<skipno> jumps here)
3100
3101 If there is only a 'then' part, it looks like:
3102
3103 GROUPREF_EXISTS
3104 <group>
3105 <skip>
3106 ...then part...
3107 (<skip> jumps here)
3108
3109 There is no direct way to decide which it is, and we don't want
3110 to allow arbitrary jumps anywhere in the code; so we just look
3111 for a JUMP opcode preceding our skip target.
3112 */
3113 if (skip >= 3 && code+skip-3 >= code &&
3114 code[skip-3] == SRE_OP_JUMP)
3115 {
3116 VTRACE(("both then and else parts present\n"));
3117 if (!_validate_inner(code+1, code+skip-3, groups))
3118 FAIL;
3119 code += skip-2; /* Position after JUMP, at <skipno> */
3120 GET_SKIP;
3121 if (!_validate_inner(code, code+skip-1, groups))
3122 FAIL;
3123 code += skip-1;
3124 }
3125 else {
3126 VTRACE(("only a then part present\n"));
3127 if (!_validate_inner(code+1, code+skip-1, groups))
3128 FAIL;
3129 code += skip-1;
3130 }
3131 break;
3132
3133 case SRE_OP_ASSERT:
3134 case SRE_OP_ASSERT_NOT:
3135 GET_SKIP;
3136 GET_ARG; /* 0 for lookahead, width for lookbehind */
3137 code--; /* Back up over arg to simplify math below */
3138 if (arg & 0x80000000)
3139 FAIL; /* Width too large */
3140 /* Stop 1 before the end; we check the SUCCESS below */
3141 if (!_validate_inner(code+1, code+skip-2, groups))
3142 FAIL;
3143 code += skip-2;
3144 GET_OP;
3145 if (op != SRE_OP_SUCCESS)
3146 FAIL;
3147 break;
3148
3149 default:
3150 FAIL;
3151
3152 }
3153 }
3154
3155 VTRACE(("okay\n"));
3156 return 1;
3157 }
3158
3159 static int
3160 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3161 {
3162 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3163 FAIL;
3164 if (groups == 0) /* fix for simplejson */
3165 groups = 100; /* 100 groups should always be safe */
3166 return _validate_inner(code, end-1, groups);
3167 }
3168
3169 static int
3170 _validate(PatternObject *self)
3171 {
3172 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3173 {
3174 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3175 return 0;
3176 }
3177 else
3178 VTRACE(("Success!\n"));
3179 return 1;
3180 }
3181
3182 /* -------------------------------------------------------------------- */
3183 /* match methods */
3184
3185 static void
3186 match_dealloc(MatchObject* self)
3187 {
3188 Py_XDECREF(self->regs);
3189 Py_XDECREF(self->string);
3190 Py_DECREF(self->pattern);
3191 PyObject_DEL(self);
3192 }
3193
3194 static PyObject*
3195 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3196 {
3197 if (index < 0 || index >= self->groups) {
3198 /* raise IndexError if we were given a bad group number */
3199 PyErr_SetString(
3200 PyExc_IndexError,
3201 "no such group"
3202 );
3203 return NULL;
3204 }
3205
3206 index *= 2;
3207
3208 if (self->string == Py_None || self->mark[index] < 0) {
3209 /* return default value if the string or group is undefined */
3210 Py_INCREF(def);
3211 return def;
3212 }
3213
3214 return PySequence_GetSlice(
3215 self->string, self->mark[index], self->mark[index+1]
3216 );
3217 }
3218
3219 static Py_ssize_t
3220 match_getindex(MatchObject* self, PyObject* index)
3221 {
3222 Py_ssize_t i;
3223
3224 if (PyInt_Check(index))
3225 return PyInt_AsSsize_t(index);
3226
3227 i = -1;
3228
3229 if (self->pattern->groupindex) {
3230 index = PyObject_GetItem(self->pattern->groupindex, index);
3231 if (index) {
3232 if (PyInt_Check(index) || PyLong_Check(index))
3233 i = PyInt_AsSsize_t(index);
3234 Py_DECREF(index);
3235 } else
3236 PyErr_Clear();
3237 }
3238
3239 return i;
3240 }
3241
3242 static PyObject*
3243 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3244 {
3245 return match_getslice_by_index(self, match_getindex(self, index), def);
3246 }
3247
3248 static PyObject*
3249 match_expand(MatchObject* self, PyObject* ptemplate)
3250 {
3251 /* delegate to Python code */
3252 return call(
3253 SRE_PY_MODULE, "_expand",
3254 PyTuple_Pack(3, self->pattern, self, ptemplate)
3255 );
3256 }
3257
3258 static PyObject*
3259 match_group(MatchObject* self, PyObject* args)
3260 {
3261 PyObject* result;
3262 Py_ssize_t i, size;
3263
3264 size = PyTuple_GET_SIZE(args);
3265
3266 switch (size) {
3267 case 0:
3268 result = match_getslice(self, Py_False, Py_None);
3269 break;
3270 case 1:
3271 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3272 break;
3273 default:
3274 /* fetch multiple items */
3275 result = PyTuple_New(size);
3276 if (!result)
3277 return NULL;
3278 for (i = 0; i < size; i++) {
3279 PyObject* item = match_getslice(
3280 self, PyTuple_GET_ITEM(args, i), Py_None
3281 );
3282 if (!item) {
3283 Py_DECREF(result);
3284 return NULL;
3285 }
3286 PyTuple_SET_ITEM(result, i, item);
3287 }
3288 break;
3289 }
3290 return result;
3291 }
3292
3293 static PyObject*
3294 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3295 {
3296 PyObject* result;
3297 Py_ssize_t index;
3298
3299 PyObject* def = Py_None;
3300 static char* kwlist[] = { "default", NULL };
3301 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3302 return NULL;
3303
3304 result = PyTuple_New(self->groups-1);
3305 if (!result)
3306 return NULL;
3307
3308 for (index = 1; index < self->groups; index++) {
3309 PyObject* item;
3310 item = match_getslice_by_index(self, index, def);
3311 if (!item) {
3312 Py_DECREF(result);
3313 return NULL;
3314 }
3315 PyTuple_SET_ITEM(result, index-1, item);
3316 }
3317
3318 return result;
3319 }
3320
3321 static PyObject*
3322 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3323 {
3324 PyObject* result;
3325 PyObject* keys;
3326 Py_ssize_t index;
3327
3328 PyObject* def = Py_None;
3329 static char* kwlist[] = { "default", NULL };
3330 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3331 return NULL;
3332
3333 result = PyDict_New();
3334 if (!result || !self->pattern->groupindex)
3335 return result;
3336
3337 keys = PyMapping_Keys(self->pattern->groupindex);
3338 if (!keys)
3339 goto failed;
3340
3341 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3342 int status;
3343 PyObject* key;
3344 PyObject* value;
3345 key = PyList_GET_ITEM(keys, index);
3346 if (!key)
3347 goto failed;
3348 value = match_getslice(self, key, def);
3349 if (!value) {
3350 Py_DECREF(key);
3351 goto failed;
3352 }
3353 status = PyDict_SetItem(result, key, value);
3354 Py_DECREF(value);
3355 if (status < 0)
3356 goto failed;
3357 }
3358
3359 Py_DECREF(keys);
3360
3361 return result;
3362
3363 failed:
3364 Py_XDECREF(keys);
3365 Py_DECREF(result);
3366 return NULL;
3367 }
3368
3369 static PyObject*
3370 match_start(MatchObject* self, PyObject* args)
3371 {
3372 Py_ssize_t index;
3373
3374 PyObject* index_ = Py_False; /* zero */
3375 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3376 return NULL;
3377
3378 index = match_getindex(self, index_);
3379
3380 if (index < 0 || index >= self->groups) {
3381 PyErr_SetString(
3382 PyExc_IndexError,
3383 "no such group"
3384 );
3385 return NULL;
3386 }
3387
3388 /* mark is -1 if group is undefined */
3389 return Py_BuildValue("i", self->mark[index*2]);
3390 }
3391
3392 static PyObject*
3393 match_end(MatchObject* self, PyObject* args)
3394 {
3395 Py_ssize_t index;
3396
3397 PyObject* index_ = Py_False; /* zero */
3398 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3399 return NULL;
3400
3401 index = match_getindex(self, index_);
3402
3403 if (index < 0 || index >= self->groups) {
3404 PyErr_SetString(
3405 PyExc_IndexError,
3406 "no such group"
3407 );
3408 return NULL;
3409 }
3410
3411 /* mark is -1 if group is undefined */
3412 return Py_BuildValue("i", self->mark[index*2+1]);
3413 }
3414
3415 LOCAL(PyObject*)
3416 _pair(Py_ssize_t i1, Py_ssize_t i2)
3417 {
3418 PyObject* pair;
3419 PyObject* item;
3420
3421 pair = PyTuple_New(2);
3422 if (!pair)
3423 return NULL;
3424
3425 item = PyInt_FromSsize_t(i1);
3426 if (!item)
3427 goto error;
3428 PyTuple_SET_ITEM(pair, 0, item);
3429
3430 item = PyInt_FromSsize_t(i2);
3431 if (!item)
3432 goto error;
3433 PyTuple_SET_ITEM(pair, 1, item);
3434
3435 return pair;
3436
3437 error:
3438 Py_DECREF(pair);
3439 return NULL;
3440 }
3441
3442 static PyObject*
3443 match_span(MatchObject* self, PyObject* args)
3444 {
3445 Py_ssize_t index;
3446
3447 PyObject* index_ = Py_False; /* zero */
3448 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3449 return NULL;
3450
3451 index = match_getindex(self, index_);
3452
3453 if (index < 0 || index >= self->groups) {
3454 PyErr_SetString(
3455 PyExc_IndexError,
3456 "no such group"
3457 );
3458 return NULL;
3459 }
3460
3461 /* marks are -1 if group is undefined */
3462 return _pair(self->mark[index*2], self->mark[index*2+1]);
3463 }
3464
3465 static PyObject*
3466 match_regs(MatchObject* self)
3467 {
3468 PyObject* regs;
3469 PyObject* item;
3470 Py_ssize_t index;
3471
3472 regs = PyTuple_New(self->groups);
3473 if (!regs)
3474 return NULL;
3475
3476 for (index = 0; index < self->groups; index++) {
3477 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3478 if (!item) {
3479 Py_DECREF(regs);
3480 return NULL;
3481 }
3482 PyTuple_SET_ITEM(regs, index, item);
3483 }
3484
3485 Py_INCREF(regs);
3486 self->regs = regs;
3487
3488 return regs;
3489 }
3490
3491 static PyObject*
3492 match_copy(MatchObject* self, PyObject *unused)
3493 {
3494 #ifdef USE_BUILTIN_COPY
3495 MatchObject* copy;
3496 Py_ssize_t slots, offset;
3497
3498 slots = 2 * (self->pattern->groups+1);
3499
3500 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3501 if (!copy)
3502 return NULL;
3503
3504 /* this value a constant, but any compiler should be able to
3505 figure that out all by itself */
3506 offset = offsetof(MatchObject, string);
3507
3508 Py_XINCREF(self->pattern);
3509 Py_XINCREF(self->string);
3510 Py_XINCREF(self->regs);
3511
3512 memcpy((char*) copy + offset, (char*) self + offset,
3513 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3514
3515 return (PyObject*) copy;
3516 #else
3517 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3518 return NULL;
3519 #endif
3520 }
3521
3522 static PyObject*
3523 match_deepcopy(MatchObject* self, PyObject* memo)
3524 {
3525 #ifdef USE_BUILTIN_COPY
3526 MatchObject* copy;
3527
3528 copy = (MatchObject*) match_copy(self);
3529 if (!copy)
3530 return NULL;
3531
3532 if (!deepcopy((PyObject**) ©->pattern, memo) ||
3533 !deepcopy(©->string, memo) ||
3534 !deepcopy(©->regs, memo)) {
3535 Py_DECREF(copy);
3536 return NULL;
3537 }
3538
3539 #else
3540 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3541 return NULL;
3542 #endif
3543 }
3544
3545 static struct PyMethodDef match_methods[] = {
3546 {"group", (PyCFunction) match_group, METH_VARARGS},
3547 {"start", (PyCFunction) match_start, METH_VARARGS},
3548 {"end", (PyCFunction) match_end, METH_VARARGS},
3549 {"span", (PyCFunction) match_span, METH_VARARGS},
3550 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3551 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3552 {"expand", (PyCFunction) match_expand, METH_O},
3553 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3554 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3555 {NULL, NULL}
3556 };
3557
3558 static PyObject *
3559 match_lastindex_get(MatchObject *self)
3560 {
3561 if (self->lastindex >= 0)
3562 return Py_BuildValue("i", self->lastindex);
3563 Py_INCREF(Py_None);
3564 return Py_None;
3565 }
3566
3567 static PyObject *
3568 match_lastgroup_get(MatchObject *self)
3569 {
3570 if (self->pattern->indexgroup && self->lastindex >= 0) {
3571 PyObject* result = PySequence_GetItem(
3572 self->pattern->indexgroup, self->lastindex
3573 );
3574 if (result)
3575 return result;
3576 PyErr_Clear();
3577 }
3578 Py_INCREF(Py_None);
3579 return Py_None;
3580 }
3581
3582 static PyObject *
3583 match_regs_get(MatchObject *self)
3584 {
3585 if (self->regs) {
3586 Py_INCREF(self->regs);
3587 return self->regs;
3588 } else
3589 return match_regs(self);
3590 }
3591
3592 static PyGetSetDef match_getset[] = {
3593 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3594 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3595 {"regs", (getter)match_regs_get, (setter)NULL},
3596 {NULL}
3597 };
3598
3599 #define MATCH_OFF(x) offsetof(MatchObject, x)
3600 static PyMemberDef match_members[] = {
3601 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3602 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3603 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3604 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3605 {NULL}
3606 };
3607
3608
3609 /* FIXME: implement setattr("string", None) as a special case (to
3610 detach the associated string, if any */
3611
3612 static PyTypeObject Match_Type = {
3613 PyVarObject_HEAD_INIT(NULL, 0)
3614 "_" SRE_MODULE ".SRE_Match",
3615 sizeof(MatchObject), sizeof(Py_ssize_t),
3616 (destructor)match_dealloc, /* tp_dealloc */
3617 0, /* tp_print */
3618 0, /* tp_getattr */
3619 0, /* tp_setattr */
3620 0, /* tp_compare */
3621 0, /* tp_repr */
3622 0, /* tp_as_number */
3623 0, /* tp_as_sequence */
3624 0, /* tp_as_mapping */
3625 0, /* tp_hash */
3626 0, /* tp_call */
3627 0, /* tp_str */
3628 0, /* tp_getattro */
3629 0, /* tp_setattro */
3630 0, /* tp_as_buffer */
3631 Py_TPFLAGS_DEFAULT,
3632 0, /* tp_doc */
3633 0, /* tp_traverse */
3634 0, /* tp_clear */
3635 0, /* tp_richcompare */
3636 0, /* tp_weaklistoffset */
3637 0, /* tp_iter */
3638 0, /* tp_iternext */
3639 match_methods, /* tp_methods */
3640 match_members, /* tp_members */
3641 match_getset, /* tp_getset */
3642 };
3643
3644 static PyObject*
3645 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3646 {
3647 /* create match object (from state object) */
3648
3649 MatchObject* match;
3650 Py_ssize_t i, j;
3651 char* base;
3652 int n;
3653
3654 if (status > 0) {
3655
3656 /* create match object (with room for extra group marks) */
3657 /* coverity[ampersand_in_size] */
3658 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3659 2*(pattern->groups+1));
3660 if (!match)
3661 return NULL;
3662
3663 Py_INCREF(pattern);
3664 match->pattern = pattern;
3665
3666 Py_INCREF(state->string);
3667 match->string = state->string;
3668
3669 match->regs = NULL;
3670 match->groups = pattern->groups+1;
3671
3672 /* fill in group slices */
3673
3674 base = (char*) state->beginning;
3675 n = state->charsize;
3676
3677 match->mark[0] = ((char*) state->start - base) / n;
3678 match->mark[1] = ((char*) state->ptr - base) / n;
3679
3680 for (i = j = 0; i < pattern->groups; i++, j+=2)
3681 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3682 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3683 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3684 } else
3685 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3686
3687 match->pos = state->pos;
3688 match->endpos = state->endpos;
3689
3690 match->lastindex = state->lastindex;
3691
3692 return (PyObject*) match;
3693
3694 } else if (status == 0) {
3695
3696 /* no match */
3697 Py_INCREF(Py_None);
3698 return Py_None;
3699
3700 }
3701
3702 /* internal error */
3703 pattern_error(status);
3704 return NULL;
3705 }
3706
3707
3708 /* -------------------------------------------------------------------- */
3709 /* scanner methods (experimental) */
3710
3711 static void
3712 scanner_dealloc(ScannerObject* self)
3713 {
3714 state_fini(&self->state);
3715 Py_XDECREF(self->pattern);
3716 PyObject_DEL(self);
3717 }
3718
3719 static PyObject*
3720 scanner_match(ScannerObject* self, PyObject *unused)
3721 {
3722 SRE_STATE* state = &self->state;
3723 PyObject* match;
3724 int status;
3725
3726 state_reset(state);
3727
3728 state->ptr = state->start;
3729
3730 if (state->charsize == 1) {
3731 status = sre_match(state, PatternObject_GetCode(self->pattern));
3732 } else {
3733 #if defined(HAVE_UNICODE)
3734 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3735 #endif
3736 }
3737 if (PyErr_Occurred())
3738 return NULL;
3739
3740 match = pattern_new_match((PatternObject*) self->pattern,
3741 state, status);
3742
3743 if (status == 0 || state->ptr == state->start)
3744 state->start = (void*) ((char*) state->ptr + state->charsize);
3745 else
3746 state->start = state->ptr;
3747
3748 return match;
3749 }
3750
3751
3752 static PyObject*
3753 scanner_search(ScannerObject* self, PyObject *unused)
3754 {
3755 SRE_STATE* state = &self->state;
3756 PyObject* match;
3757 int status;
3758
3759 state_reset(state);
3760
3761 state->ptr = state->start;
3762
3763 if (state->charsize == 1) {
3764 status = sre_search(state, PatternObject_GetCode(self->pattern));
3765 } else {
3766 #if defined(HAVE_UNICODE)
3767 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3768 #endif
3769 }
3770 if (PyErr_Occurred())
3771 return NULL;
3772
3773 match = pattern_new_match((PatternObject*) self->pattern,
3774 state, status);
3775
3776 if (status == 0 || state->ptr == state->start)
3777 state->start = (void*) ((char*) state->ptr + state->charsize);
3778 else
3779 state->start = state->ptr;
3780
3781 return match;
3782 }
3783
3784 static PyMethodDef scanner_methods[] = {
3785 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3786 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3787 {NULL, NULL}
3788 };
3789
3790 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3791 static PyMemberDef scanner_members[] = {
3792 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3793 {NULL} /* Sentinel */
3794 };
3795
3796 statichere PyTypeObject Scanner_Type = {
3797 PyObject_HEAD_INIT(NULL)
3798 0, "_" SRE_MODULE ".SRE_Scanner",
3799 sizeof(ScannerObject), 0,
3800 (destructor)scanner_dealloc, /*tp_dealloc*/
3801 0, /* tp_print */
3802 0, /* tp_getattr */
3803 0, /* tp_setattr */
3804 0, /* tp_reserved */
3805 0, /* tp_repr */
3806 0, /* tp_as_number */
3807 0, /* tp_as_sequence */
3808 0, /* tp_as_mapping */
3809 0, /* tp_hash */
3810 0, /* tp_call */
3811 0, /* tp_str */
3812 0, /* tp_getattro */
3813 0, /* tp_setattro */
3814 0, /* tp_as_buffer */
3815 Py_TPFLAGS_DEFAULT, /* tp_flags */
3816 0, /* tp_doc */
3817 0, /* tp_traverse */
3818 0, /* tp_clear */
3819 0, /* tp_richcompare */
3820 0, /* tp_weaklistoffset */
3821 0, /* tp_iter */
3822 0, /* tp_iternext */
3823 scanner_methods, /* tp_methods */
3824 scanner_members, /* tp_members */
3825 0, /* tp_getset */
3826 };
3827
3828 static PyObject*
3829 pattern_scanner(PatternObject* pattern, PyObject* args)
3830 {
3831 /* create search state object */
3832
3833 ScannerObject* self;
3834
3835 PyObject* string;
3836 Py_ssize_t start = 0;
3837 Py_ssize_t end = PY_SSIZE_T_MAX;
3838 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3839 return NULL;
3840
3841 /* create scanner object */
3842 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3843 if (!self)
3844 return NULL;
3845 self->pattern = NULL;
3846
3847 string = state_init(&self->state, pattern, string, start, end);
3848 if (!string) {
3849 Py_DECREF(self);
3850 return NULL;
3851 }
3852
3853 Py_INCREF(pattern);
3854 self->pattern = (PyObject*) pattern;
3855
3856 return (PyObject*) self;
3857 }
3858
3859 static PyMethodDef _functions[] = {
3860 {"compile", _compile, METH_VARARGS},
3861 {"getcodesize", sre_codesize, METH_NOARGS},
3862 {"getlower", sre_getlower, METH_VARARGS},
3863 {NULL, NULL}
3864 };
3865
3866 #if PY_VERSION_HEX < 0x02030000
3867 DL_EXPORT(void) init_sre(void)
3868 #else
3869 PyMODINIT_FUNC init_sre(void)
3870 #endif
3871 {
3872 PyObject* m;
3873 PyObject* d;
3874 PyObject* x;
3875
3876 /* Patch object types */
3877 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3878 PyType_Ready(&Scanner_Type))
3879 return;
3880
3881 m = Py_InitModule("_" SRE_MODULE, _functions);
3882 if (m == NULL)
3883 return;
3884 d = PyModule_GetDict(m);
3885
3886 x = PyInt_FromLong(SRE_MAGIC);
3887 if (x) {
3888 PyDict_SetItemString(d, "MAGIC", x);
3889 Py_DECREF(x);
3890 }
3891
3892 x = PyInt_FromLong(sizeof(SRE_CODE));
3893 if (x) {
3894 PyDict_SetItemString(d, "CODESIZE", x);
3895 Py_DECREF(x);
3896 }
3897
3898 x = PyString_FromString(copyright);
3899 if (x) {
3900 PyDict_SetItemString(d, "copyright", x);
3901 Py_DECREF(x);
3902 }
3903 }
3904
3905 #endif /* !defined(SRE_RECURSIVE) */
3906
3907 /* vim:ts=4:sw=4:et
3908 */