Location | Tool | Test ID | Function | Issue |
---|---|---|---|---|
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17 | clang-analyzer | Dereference of null pointer (loaded from variable 'callresult') | ||
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17 | clang-analyzer | Dereference of null pointer (loaded from variable 'callresult') | ||
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25 | clang-analyzer | Dereference of null pointer (loaded from variable 'callresult') | ||
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25 | clang-analyzer | Dereference of null pointer (loaded from variable 'callresult') | ||
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:2094:9 | clang-analyzer | Value stored to 'nallocated' is never read | ||
1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10 Copyright (c) Corporation for National Research Initiatives.
11
12 --------------------------------------------------------------------
13 The original string type implementation is:
14
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
17
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39
40 */
41
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
52 /* Limit for the Unicode object free list */
53
54 #define PyUnicode_MAXFREELIST 1024
55
56 /* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
61
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
70
71 */
72
73 #define KEEPALIVE_SIZE_LIMIT 9
74
75 /* Endianness switches; defaults to little endian */
76
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
82
83 /* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88 */
89
90
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
94
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *free_list;
97 static int numfree;
98
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
101
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
105
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
111
112 */
113 static char unicode_default_encoding[100];
114
115 /* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 TAB, 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FS, 0x1D GS, 0x1E RS, 0x1F US */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
146 /* Same for linebreaks */
/* Lookup table for the ASCII range: entry is 1 when the code point is a
   line break, 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
175 Py_UNICODE
176 PyUnicode_GetMax(void)
177 {
178 #ifdef Py_UNICODE_WIDE
179 return 0x10FFFF;
180 #else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184 #endif
185 }
186
187 /* --- Bloom Filters ----------------------------------------------------- */
188
189 /* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193 /* the linebreak mask is set up by Unicode_Init below */
194
195 #if LONG_BIT >= 128
196 #define BLOOM_WIDTH 128
197 #elif LONG_BIT >= 64
198 #define BLOOM_WIDTH 64
199 #elif LONG_BIT >= 32
200 #define BLOOM_WIDTH 32
201 #else
202 #error "LONG_BIT is smaller than 32"
203 #endif
204
205 #define BLOOM_MASK unsigned long
206
207 static BLOOM_MASK bloom_linebreak;
208
209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
211
212 #define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215
216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217 {
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
220 BLOOM_MASK mask;
221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
225 BLOOM_ADD(mask, ptr[i]);
226
227 return mask;
228 }
229
230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231 {
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
238 return 0;
239 }
240
/* True when chr is a member of set[0..setlen): the bloom mask rejects
   most non-members cheaply before the exact scan.  The expansion is
   fully parenthesised so the macro can be negated or combined with
   other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
244 /* --- Unicode Object ----------------------------------------------------- */
245
246 static
247 int unicode_resize(register PyUnicodeObject *unicode,
248 Py_ssize_t length)
249 {
250 void *oldstr;
251
252 /* Shortcut if there's nothing much to do. */
253 if (unicode->length == length)
254 goto reset;
255
256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
259
260 if (unicode == unicode_empty ||
261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
264 PyErr_SetString(PyExc_SystemError,
265 "can't resize shared unicode objects");
266 return -1;
267 }
268
269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
272 it contains). */
273
274 oldstr = unicode->str;
275 unicode->str = PyObject_REALLOC(unicode->str,
276 sizeof(Py_UNICODE) * (length + 1));
277 if (!unicode->str) {
278 unicode->str = (Py_UNICODE *)oldstr;
279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
283 unicode->length = length;
284
285 reset:
286 /* Reset the object caches */
287 if (unicode->defenc) {
288 Py_CLEAR(unicode->defenc);
289 }
290 unicode->hash = -1;
291
292 return 0;
293 }
294
295 /* We allocate one more byte to make sure the string is
296 Ux0000 terminated; some code relies on that.
297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301 */
302
303 static
304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305 {
306 register PyUnicodeObject *unicode;
307
308 /* Optimization for empty strings */
309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
319 /* Unicode freelist & memory allocation */
320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
328 unicode_resize(unicode, length) < 0) {
329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
332 }
333 else {
334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
338 }
339 else {
340 size_t new_size;
341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
342 if (unicode == NULL)
343 return NULL;
344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
346 }
347
348 if (!unicode->str) {
349 PyErr_NoMemory();
350 goto onError;
351 }
352 /* Initialize the first element to guard against cases where
353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
359 unicode->str[0] = 0;
360 unicode->str[length] = 0;
361 unicode->length = length;
362 unicode->hash = -1;
363 unicode->defenc = NULL;
364 return unicode;
365
366 onError:
367 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL;
369 _Py_ForgetReference((PyObject *)unicode);
370 PyObject_Del(unicode);
371 return NULL;
372 }
373
374 static
375 void unicode_dealloc(register PyUnicodeObject *unicode)
376 {
377 if (PyUnicode_CheckExact(unicode) &&
378 numfree < PyUnicode_MAXFREELIST) {
379 /* Keep-Alive optimization */
380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str);
382 unicode->str = NULL;
383 unicode->length = 0;
384 }
385 if (unicode->defenc) {
386 Py_CLEAR(unicode->defenc);
387 }
388 /* Add to free list */
389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
392 }
393 else {
394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
397 }
398 }
399
400 static
401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
402 {
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
407 PyErr_BadInternalCall();
408 return -1;
409 }
410 v = *unicode;
411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
412 PyErr_BadInternalCall();
413 return -1;
414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
419 if (v->length != length &&
420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434 }
435
436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437 {
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439 }
440
441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
442 Py_ssize_t size)
443 {
444 PyUnicodeObject *unicode;
445
446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
454 }
455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
470 }
471
472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
478 Py_UNICODE_COPY(unicode->str, u, size);
479
480 return (PyObject *)unicode;
481 }
482
483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484 {
485 PyUnicodeObject *unicode;
486
487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
489 "Negative size passed to PyUnicode_FromStringAndSize");
490 return NULL;
491 }
492
493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
503 }
504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528 }
529
530 PyObject *PyUnicode_FromString(const char *u)
531 {
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539 }
540
541 #ifdef HAVE_WCHAR_H
542
543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544 # define CONVERT_WCHAR_TO_SURROGATES
545 #endif
546
547 #ifdef CONVERT_WCHAR_TO_SURROGATES
548
549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554 {
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593 }
594
595 #else
596
597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598 Py_ssize_t size)
599 {
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
603 PyErr_BadInternalCall();
604 return NULL;
605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
614 #else
615 {
616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
621 }
622 #endif
623
624 return (PyObject *)unicode;
625 }
626
627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629 #undef CONVERT_WCHAR_TO_SURROGATES
630
631 static void
632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633 {
634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
651 }
652
653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
654
655 PyObject *
656 PyUnicode_FromFormatV(const char *format, va_list vargs)
657 {
658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
678
679 #ifdef VA_LIST_IS_ARRAY
680 Py_MEMCPY(count, vargs, sizeof(va_list));
681 #else
682 #ifdef __va_copy
683 __va_copy(count, vargs);
684 #else
685 count = vargs;
686 #endif
687 #endif
688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
691 for (f = format; *f; f++) {
692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
695 if (*(f+1)=='S' || *(f+1)=='R')
696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
704 }
705 /* step 2: allocate memory for the results of
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
724
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
731
732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
755 const char *s = va_arg(count, const char*);
756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
831 expand:
832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
849
850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
852
853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
880
881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
989
990 end:
991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
997 fail:
998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
1009 }
1010
1011 #undef appendstring
1012
1013 PyObject *
1014 PyUnicode_FromFormat(const char *format, ...)
1015 {
1016 PyObject* ret;
1017 va_list vargs;
1018
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020 va_start(vargs, format);
1021 #else
1022 va_start(vargs);
1023 #endif
1024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
1027 }
1028
1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030 wchar_t *w,
1031 Py_ssize_t size)
1032 {
1033 if (unicode == NULL) {
1034 PyErr_BadInternalCall();
1035 return -1;
1036 }
1037
1038 /* If possible, try to copy the 0-termination as well */
1039 if (size > PyUnicode_GET_SIZE(unicode))
1040 size = PyUnicode_GET_SIZE(unicode) + 1;
1041
1042 #ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044 #else
1045 {
1046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
1051 }
1052 #endif
1053
1054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
1057 return size;
1058 }
1059
1060 #endif
1061
1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 {
1064 Py_UNICODE s[1];
1065
1066 #ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
1068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
1072 }
1073 #else
1074 if (ordinal < 0 || ordinal > 0xffff) {
1075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
1079 }
1080 #endif
1081
1082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
1084 }
1085
1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 {
1088 /* XXX Perhaps we should make this API an alias of
1089 PyObject_Unicode() instead ?! */
1090 if (PyUnicode_CheckExact(obj)) {
1091 Py_INCREF(obj);
1092 return obj;
1093 }
1094 if (PyUnicode_Check(obj)) {
1095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
1099 }
1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101 }
1102
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104 const char *encoding,
1105 const char *errors)
1106 {
1107 const char *s = NULL;
1108 Py_ssize_t len;
1109 PyObject *v;
1110
1111 if (obj == NULL) {
1112 PyErr_BadInternalCall();
1113 return NULL;
1114 }
1115
1116 #if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
1118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
1121
1122 NOTE: This API should really only be used for object which
1123 represent *encoded* Unicode !
1124
1125 */
1126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
1128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
1131 }
1132 return PyObject_Unicode(obj);
1133 }
1134 #else
1135 if (PyUnicode_Check(obj)) {
1136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
1139 }
1140 #endif
1141
1142 /* Coerce object */
1143 if (PyString_Check(obj)) {
1144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
1146 }
1147 else if (PyByteArray_Check(obj)) {
1148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
1162 }
1163
1164 /* Convert to Unicode */
1165 if (len == 0) {
1166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
1168 }
1169 else
1170 v = PyUnicode_Decode(s, len, encoding, errors);
1171
1172 return v;
1173
1174 onError:
1175 return NULL;
1176 }
1177
1178 PyObject *PyUnicode_Decode(const char *s,
1179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
1182 {
1183 PyObject *buffer = NULL, *unicode;
1184
1185 if (encoding == NULL)
1186 encoding = PyUnicode_GetDefaultEncoding();
1187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
1190 return PyUnicode_DecodeUTF8(s, size, errors);
1191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
1199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
1209 "decoder did not return an unicode object (type=%.400s)",
1210 Py_TYPE(unicode)->tp_name);
1211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
1216
1217 onError:
1218 Py_XDECREF(buffer);
1219 return NULL;
1220 }
1221
1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225 {
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
1234 encoding = PyUnicode_GetDefaultEncoding();
1235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
1242 onError:
1243 return NULL;
1244 }
1245
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
1250 {
1251 PyObject *v, *unicode;
1252
1253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
1255 return NULL;
1256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259 }
1260
1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264 {
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
1273 encoding = PyUnicode_GetDefaultEncoding();
1274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
1281 onError:
1282 return NULL;
1283 }
1284
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288 {
1289 PyObject *v;
1290
1291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
1295
1296 if (encoding == NULL)
1297 encoding = PyUnicode_GetDefaultEncoding();
1298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
1301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
1308 #endif
1309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
1311 }
1312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
1317 if (!PyString_Check(v)) {
1318 PyErr_Format(PyExc_TypeError,
1319 "encoder did not return a string object (type=%.400s)",
1320 Py_TYPE(v)->tp_name);
1321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
1325
1326 onError:
1327 return NULL;
1328 }
1329
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331 const char *errors)
1332 {
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341 }
1342
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 {
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
1351 onError:
1352 return NULL;
1353 }
1354
1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356 {
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
1363 onError:
1364 return -1;
1365 }
1366
1367 const char *PyUnicode_GetDefaultEncoding(void)
1368 {
1369 return unicode_default_encoding;
1370 }
1371
1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 {
1374 PyObject *v;
1375
1376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
1380 goto onError;
1381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
1383 encoding,
1384 sizeof(unicode_default_encoding));
1385 return 0;
1386
1387 onError:
1388 return -1;
1389 }
1390
1391 /* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
1393 if no exception occurred, copy the replacement to the output
1394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396 */
1397
1398 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404 {
1405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
1412 Py_UNICODE *repptr;
1413 Py_ssize_t repsize;
1414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
1417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
1420 }
1421
1422 if (*exceptionObject == NULL) {
1423 *exceptionObject = PyUnicodeDecodeError_Create(
1424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
1427 }
1428 else {
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
1435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
1439 goto onError;
1440 if (!PyTuple_Check(restuple)) {
1441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442 goto onError;
1443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445 goto onError;
1446 if (newpos<0)
1447 newpos = insize+newpos;
1448 if (newpos<0 || newpos>insize) {
1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
1451 }
1452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
1461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
1475 onError:
1476 Py_XDECREF(restuple);
1477 return res;
1478 }
1479
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1481
1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
1484 /* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?

   The test uses explicit ASCII ranges rather than isalnum(): isalnum()
   depends on the current locale and is undefined for arguments that do
   not fit in an unsigned char (other than EOF), so it could misclassify
   non-ASCII input.  Note that c is evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
1490
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62.  Every other input
   yields 63, so callers must have established that c is a base-64
   character first.  c is evaluated more than once. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
1498
/* Base-64 character for the low 6 bits of n (higher bits are masked
   off). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789" \
     "+/"[(n) & 0x3f])
1503
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1511
/* Classification of the 128 ASCII codes for the UTF-7 encoder (see
 * RFC2152).  Each entry is one of:
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08: bs  ht  nl  vt  np  cr  so  si  */
    3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18: can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   */
    2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28: (   )   *   +   ,   -   .   /   */
    0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38: 8   9   :   ;   <   =   >   ?   */
    0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48: H   I   J   K   L   M   N   O   */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58: X   Y   Z   [   \   ]   ^   _   */
    0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68: h   i   j   k   l   m   n   o   */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78: x   y   z   {   |   }   ~   del */
    0, 0, 0, 1, 1, 1, 3, 3,
};
1545
/* ENCODE_DIRECT: true when character c should be emitted as itself by
 * the UTF-7 encoder.  Only codes 1..127 can qualify.  Set D characters
 * always do; Set O characters only when directO is true; whitespace
 * only when directWS is true.  RFC2152 leaves those two choices to the
 * application, which is why they are parameters.  c is evaluated more
 * than once and indexes utf7_category. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) > 0 && (c) < 128 && \
     ((utf7_category[(c)] == 0) || \
      (directO && (utf7_category[(c)] == 1)) || \
      (directWS && (utf7_category[(c)] == 2))))
1557
1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559 Py_ssize_t size,
1560 const char *errors)
1561 {
1562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563 }
1564
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   - the UTF-7 encoded input bytes
 *   errors    - error handling mode, forwarded to
 *               unicode_decode_call_errorhandler()
 *   consumed  - if non-NULL, receives the number of input bytes that
 *               were fully processed; input ending inside a shift
 *               sequence is then not treated as an error
 *
 * Returns a new Unicode object, or NULL with an exception set. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;               /* non-zero while inside a base-64 section */
    Py_UNICODE *shiftOutStart;     /* output position where the current shift began */
    unsigned int base64bits = 0;   /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UNICODE surrogate = 0;      /* pending first surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start with one output unit per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* wide build: combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* narrow build: keep the pair as two units */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired first surrogate is emitted as-is */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    *p++ = surrogate;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the '+' was: used for error positions and
               as the restart point reported through `consumed`. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
      utf7Error:
        /* The error handler may resize `unicode`; it updates s and p
           through the pointers passed in. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
            (base64bits >= 6) ||
            (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Discard everything produced since the opening '+' and
               report the position of that '+' as the consumed count. */
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the result to the number of units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1752
1753
/* Encode `size` Py_UNICODE units at `s` as UTF-7 and return a new string
   object (NULL with an exception set on failure).

   base64SetO        - non-zero: Set O characters are base64 encoded
                       instead of being written directly
   base64WhiteSpace  - non-zero: whitespace is base64 encoded instead of
                       being written directly
   errors            - not referenced in this function

   The output buffer is allocated up front with 8 bytes per input unit
   and shrunk to the bytes actually written at the end. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               Py_ssize_t size,
                               int base64SetO,
                               int base64WhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    Py_ssize_t allocated = 8 * size;
    int inShift = 0;                 /* non-zero while a base64 section is open */
    Py_ssize_t i = 0;
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    /* Detect overflow of the 8 * size multiplication. */
    if (allocated / 8 != size)
        return PyErr_NoMemory();

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, allocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a base64 section and encode this character in it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
      encode_char:
#ifdef Py_UNICODE_WIDE
        if (ch >= 0x10000) {
            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
        }
#endif
        /* Append 16 bits and flush every complete 6-bit group. */
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush leftover bits, padded on the right to a full 6-bit group. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* Close a base64 section that is still open at the end of input. */
    if (inShift)
        *out++ = '-';

    if (_PyString_Resize(&v, out - start))
        return NULL;
    return v;
}
1849
1850 #undef IS_BASE64
1851 #undef FROM_BASE64
1852 #undef TO_BASE64
1853 #undef DECODE_DIRECT
1854 #undef ENCODE_DIRECT
1855
1856 /* --- UTF-8 Codec -------------------------------------------------------- */
1857
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 3629 for
   details. */
static
char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
1879
1880 PyObject *PyUnicode_DecodeUTF8(const char *s,
1881 Py_ssize_t size,
1882 const char *errors)
1883 {
1884 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1885 }
1886
/* Decode `size` bytes of UTF-8 at `s` into a new Unicode object.

   errors    - error handling mode, forwarded to
               unicode_decode_call_errorhandler()
   consumed  - if non-NULL, a sequence cut off by the end of the input
               stops decoding instead of raising an error, and the
               number of bytes processed is stored here

   Returns a new Unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;                  /* sequence length implied by the lead byte */
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII bytes are copied straight through. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the lead byte runs past the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes
                   that are present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* bytes below 0x80 were handled above, so length 1 is not
               expected here */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* Besides malformed continuation bytes, reject F0 followed
               by a byte below 0x90 and F4 followed by a byte above
               0x8F. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* extend the error range over the leading continuation
                   bytes that are well-formed */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      utf8Error:
        /* The error handler may resize `unicode`; it updates s and p
           through the pointers passed in. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2066
2067 /* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
2071 */
2072 PyObject *
2073 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2074 Py_ssize_t size,
2075 const char *errors)
2076 {
2077 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078
2079 Py_ssize_t i; /* index into s of next input byte */
2080 PyObject *v; /* result string object */
2081 char *p; /* next free byte in output buffer */
2082 Py_ssize_t nallocated; /* number of result bytes allocated */
2083 Py_ssize_t nneeded; /* number of result bytes needed */
2084 char stackbuf[MAX_SHORT_UNICHARS * 4];
2085
2086 assert(s != NULL);
2087 assert(size >= 0);
2088
2089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
2095 v = NULL; /* will allocate after we're done */
2096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
2103 v = PyString_FromStringAndSize(NULL, nallocated);
2104 if (v == NULL)
2105 return NULL;
2106 p = PyString_AS_STRING(v);
2107 }
2108
2109 for (i = 0; i < size;) {
2110 Py_UCS4 ch = s[i++];
2111
2112 if (ch < 0x80)
2113 /* Encode ASCII */
2114 *p++ = (char) ch;
2115
2116 else if (ch < 0x0800) {
2117 /* Encode Latin-1 */
2118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
2120 }
2121 else {
2122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2131 i++;
2132 goto encodeUCS4;
2133 }
2134 /* Fall through: handles isolated high surrogates */
2135 }
2136 *p++ = (char)(0xe0 | (ch >> 12));
2137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
2140 }
2141 encodeUCS4:
2142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
2148 }
2149
2150 if (v == NULL) {
2151 /* This was stack allocated. */
2152 nneeded = p - stackbuf;
2153 assert(nneeded <= nallocated);
2154 v = PyString_FromStringAndSize(stackbuf, nneeded);
2155 }
2156 else {
2157 /* Cut back to size actually needed. */
2158 nneeded = p - PyString_AS_STRING(v);
2159 assert(nneeded <= nallocated);
2160 if (_PyString_Resize(&v, nneeded))
2161 return NULL;
2162 }
2163 return v;
2164
2165 #undef MAX_SHORT_UNICHARS
2166 }
2167
2168 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2169 {
2170 if (!PyUnicode_Check(unicode)) {
2171 PyErr_BadArgument();
2172 return NULL;
2173 }
2174 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2175 PyUnicode_GET_SIZE(unicode),
2176 NULL);
2177 }
2178
2179 /* --- UTF-32 Codec ------------------------------------------------------- */
2180
2181 PyObject *
2182 PyUnicode_DecodeUTF32(const char *s,
2183 Py_ssize_t size,
2184 const char *errors,
2185 int *byteorder)
2186 {
2187 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188 }
2189
2190 PyObject *
2191 PyUnicode_DecodeUTF32Stateful(const char *s,
2192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder,
2195 Py_ssize_t *consumed)
2196 {
2197 const char *starts = s;
2198 Py_ssize_t startinpos;
2199 Py_ssize_t endinpos;
2200 Py_ssize_t outpos;
2201 PyUnicodeObject *unicode;
2202 Py_UNICODE *p;
2203 #ifndef Py_UNICODE_WIDE
2204 int pairs = 0;
2205 const unsigned char *qq;
2206 #else
2207 const int pairs = 0;
2208 #endif
2209 const unsigned char *q, *e;
2210 int bo = 0; /* assume native ordering by default */
2211 const char *errmsg = "";
2212 /* Offsets from q for retrieving bytes in the right order. */
2213 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2214 int iorder[] = {0, 1, 2, 3};
2215 #else
2216 int iorder[] = {3, 2, 1, 0};
2217 #endif
2218 PyObject *errorHandler = NULL;
2219 PyObject *exc = NULL;
2220
2221 q = (unsigned char *)s;
2222 e = q + size;
2223
2224 if (byteorder)
2225 bo = *byteorder;
2226
2227 /* Check for BOM marks (U+FEFF) in the input and adjust current
2228 byte order setting accordingly. In native mode, the leading BOM
2229 mark is skipped, in all other modes, it is copied to the output
2230 stream as-is (giving a ZWNBSP character). */
2231 if (bo == 0) {
2232 if (size >= 4) {
2233 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2234 (q[iorder[1]] << 8) | q[iorder[0]];
2235 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2236 if (bom == 0x0000FEFF) {
2237 q += 4;
2238 bo = -1;
2239 }
2240 else if (bom == 0xFFFE0000) {
2241 q += 4;
2242 bo = 1;
2243 }
2244 #else
2245 if (bom == 0x0000FEFF) {
2246 q += 4;
2247 bo = 1;
2248 }
2249 else if (bom == 0xFFFE0000) {
2250 q += 4;
2251 bo = -1;
2252 }
2253 #endif
2254 }
2255 }
2256
2257 if (bo == -1) {
2258 /* force LE */
2259 iorder[0] = 0;
2260 iorder[1] = 1;
2261 iorder[2] = 2;
2262 iorder[3] = 3;
2263 }
2264 else if (bo == 1) {
2265 /* force BE */
2266 iorder[0] = 3;
2267 iorder[1] = 2;
2268 iorder[2] = 1;
2269 iorder[3] = 0;
2270 }
2271
2272 /* On narrow builds we split characters outside the BMP into two
2273 codepoints => count how much extra space we need. */
2274 #ifndef Py_UNICODE_WIDE
2275 for (qq = q; qq < e; qq += 4)
2276 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2277 pairs++;
2278 #endif
2279
2280 /* This might be one to much, because of a BOM */
2281 unicode = _PyUnicode_New((size+3)/4+pairs);
2282 if (!unicode)
2283 return NULL;
2284 if (size == 0)
2285 return (PyObject *)unicode;
2286
2287 /* Unpack UTF-32 encoded data */
2288 p = unicode->str;
2289
2290 while (q < e) {
2291 Py_UCS4 ch;
2292 /* remaining bytes at the end? (size should be divisible by 4) */
2293 if (e-q<4) {
2294 if (consumed)
2295 break;
2296 errmsg = "truncated data";
2297 startinpos = ((const char *)q)-starts;
2298 endinpos = ((const char *)e)-starts;
2299 goto utf32Error;
2300 /* The remaining input chars are ignored if the callback
2301 chooses to skip the input */
2302 }
2303 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2304 (q[iorder[1]] << 8) | q[iorder[0]];
2305
2306 if (ch >= 0x110000)
2307 {
2308 errmsg = "codepoint not in range(0x110000)";
2309 startinpos = ((const char *)q)-starts;
2310 endinpos = startinpos+4;
2311 goto utf32Error;
2312 }
2313 #ifndef Py_UNICODE_WIDE
2314 if (ch >= 0x10000)
2315 {
2316 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2317 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318 }
2319 else
2320 #endif
2321 *p++ = ch;
2322 q += 4;
2323 continue;
2324 utf32Error:
2325 outpos = p-PyUnicode_AS_UNICODE(unicode);
2326 if (unicode_decode_call_errorhandler(
2327 errors, &errorHandler,
2328 "utf32", errmsg,
2329 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2330 &unicode, &outpos, &p))
2331 goto onError;
2332 }
2333
2334 if (byteorder)
2335 *byteorder = bo;
2336
2337 if (consumed)
2338 *consumed = (const char *)q-starts;
2339
2340 /* Adjust length */
2341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2342 goto onError;
2343
2344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
2346 return (PyObject *)unicode;
2347
2348 onError:
2349 Py_DECREF(unicode);
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return NULL;
2353 }
2354
2355 PyObject *
2356 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2357 Py_ssize_t size,
2358 const char *errors,
2359 int byteorder)
2360 {
2361 PyObject *v;
2362 unsigned char *p;
2363 Py_ssize_t nsize, bytesize;
2364 #ifndef Py_UNICODE_WIDE
2365 Py_ssize_t i, pairs;
2366 #else
2367 const int pairs = 0;
2368 #endif
2369 /* Offsets from p for storing byte pairs in the right order. */
2370 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2371 int iorder[] = {0, 1, 2, 3};
2372 #else
2373 int iorder[] = {3, 2, 1, 0};
2374 #endif
2375
2376 #define STORECHAR(CH) \
2377 do { \
2378 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2379 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2380 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2381 p[iorder[0]] = (CH) & 0xff; \
2382 p += 4; \
2383 } while(0)
2384
2385 /* In narrow builds we can output surrogate pairs as one codepoint,
2386 so we need less space. */
2387 #ifndef Py_UNICODE_WIDE
2388 for (i = pairs = 0; i < size-1; i++)
2389 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2390 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2391 pairs++;
2392 #endif
2393 nsize = (size - pairs + (byteorder == 0));
2394 bytesize = nsize * 4;
2395 if (bytesize / 4 != nsize)
2396 return PyErr_NoMemory();
2397 v = PyString_FromStringAndSize(NULL, bytesize);
2398 if (v == NULL)
2399 return NULL;
2400
2401 p = (unsigned char *)PyString_AS_STRING(v);
2402 if (byteorder == 0)
2403 STORECHAR(0xFEFF);
2404 if (size == 0)
2405 return v;
2406
2407 if (byteorder == -1) {
2408 /* force LE */
2409 iorder[0] = 0;
2410 iorder[1] = 1;
2411 iorder[2] = 2;
2412 iorder[3] = 3;
2413 }
2414 else if (byteorder == 1) {
2415 /* force BE */
2416 iorder[0] = 3;
2417 iorder[1] = 2;
2418 iorder[2] = 1;
2419 iorder[3] = 0;
2420 }
2421
2422 while (size-- > 0) {
2423 Py_UCS4 ch = *s++;
2424 #ifndef Py_UNICODE_WIDE
2425 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2426 Py_UCS4 ch2 = *s;
2427 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2428 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2429 s++;
2430 size--;
2431 }
2432 }
2433 #endif
2434 STORECHAR(ch);
2435 }
2436 return v;
2437 #undef STORECHAR
2438 }
2439
2440 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2441 {
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2445 }
2446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2447 PyUnicode_GET_SIZE(unicode),
2448 NULL,
2449 0);
2450 }
2451
2452 /* --- UTF-16 Codec ------------------------------------------------------- */
2453
2454 PyObject *
2455 PyUnicode_DecodeUTF16(const char *s,
2456 Py_ssize_t size,
2457 const char *errors,
2458 int *byteorder)
2459 {
2460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2461 }
2462
2463 PyObject *
2464 PyUnicode_DecodeUTF16Stateful(const char *s,
2465 Py_ssize_t size,
2466 const char *errors,
2467 int *byteorder,
2468 Py_ssize_t *consumed)
2469 {
2470 const char *starts = s;
2471 Py_ssize_t startinpos;
2472 Py_ssize_t endinpos;
2473 Py_ssize_t outpos;
2474 PyUnicodeObject *unicode;
2475 Py_UNICODE *p;
2476 const unsigned char *q, *e;
2477 int bo = 0; /* assume native ordering by default */
2478 const char *errmsg = "";
2479 /* Offsets from q for retrieving byte pairs in the right order. */
2480 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2481 int ihi = 1, ilo = 0;
2482 #else
2483 int ihi = 0, ilo = 1;
2484 #endif
2485 PyObject *errorHandler = NULL;
2486 PyObject *exc = NULL;
2487
2488 /* Note: size will always be longer than the resulting Unicode
2489 character count */
2490 unicode = _PyUnicode_New(size);
2491 if (!unicode)
2492 return NULL;
2493 if (size == 0)
2494 return (PyObject *)unicode;
2495
2496 /* Unpack UTF-16 encoded data */
2497 p = unicode->str;
2498 q = (unsigned char *)s;
2499 e = q + size;
2500
2501 if (byteorder)
2502 bo = *byteorder;
2503
2504 /* Check for BOM marks (U+FEFF) in the input and adjust current
2505 byte order setting accordingly. In native mode, the leading BOM
2506 mark is skipped, in all other modes, it is copied to the output
2507 stream as-is (giving a ZWNBSP character). */
2508 if (bo == 0) {
2509 if (size >= 2) {
2510 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2511 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2512 if (bom == 0xFEFF) {
2513 q += 2;
2514 bo = -1;
2515 }
2516 else if (bom == 0xFFFE) {
2517 q += 2;
2518 bo = 1;
2519 }
2520 #else
2521 if (bom == 0xFEFF) {
2522 q += 2;
2523 bo = 1;
2524 }
2525 else if (bom == 0xFFFE) {
2526 q += 2;
2527 bo = -1;
2528 }
2529 #endif
2530 }
2531 }
2532
2533 if (bo == -1) {
2534 /* force LE */
2535 ihi = 1;
2536 ilo = 0;
2537 }
2538 else if (bo == 1) {
2539 /* force BE */
2540 ihi = 0;
2541 ilo = 1;
2542 }
2543
2544 while (q < e) {
2545 Py_UNICODE ch;
2546 /* remaining bytes at the end? (size should be even) */
2547 if (e-q<2) {
2548 if (consumed)
2549 break;
2550 errmsg = "truncated data";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = ((const char *)e)-starts;
2553 goto utf16Error;
2554 /* The remaining input chars are ignored if the callback
2555 chooses to skip the input */
2556 }
2557 ch = (q[ihi] << 8) | q[ilo];
2558
2559 q += 2;
2560
2561 if (ch < 0xD800 || ch > 0xDFFF) {
2562 *p++ = ch;
2563 continue;
2564 }
2565
2566 /* UTF-16 code pair: */
2567 if (q >= e) {
2568 errmsg = "unexpected end of data";
2569 startinpos = (((const char *)q)-2)-starts;
2570 endinpos = ((const char *)e)-starts;
2571 goto utf16Error;
2572 }
2573 if (0xD800 <= ch && ch <= 0xDBFF) {
2574 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2575 q += 2;
2576 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2577 #ifndef Py_UNICODE_WIDE
2578 *p++ = ch;
2579 *p++ = ch2;
2580 #else
2581 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2582 #endif
2583 continue;
2584 }
2585 else {
2586 errmsg = "illegal UTF-16 surrogate";
2587 startinpos = (((const char *)q)-4)-starts;
2588 endinpos = startinpos+2;
2589 goto utf16Error;
2590 }
2591
2592 }
2593 errmsg = "illegal encoding";
2594 startinpos = (((const char *)q)-2)-starts;
2595 endinpos = startinpos+2;
2596 /* Fall through to report the error */
2597
2598 utf16Error:
2599 outpos = p-PyUnicode_AS_UNICODE(unicode);
2600 if (unicode_decode_call_errorhandler(
2601 errors, &errorHandler,
2602 "utf16", errmsg,
2603 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2604 &unicode, &outpos, &p))
2605 goto onError;
2606 }
2607
2608 if (byteorder)
2609 *byteorder = bo;
2610
2611 if (consumed)
2612 *consumed = (const char *)q-starts;
2613
2614 /* Adjust length */
2615 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2616 goto onError;
2617
2618 Py_XDECREF(errorHandler);
2619 Py_XDECREF(exc);
2620 return (PyObject *)unicode;
2621
2622 onError:
2623 Py_DECREF(unicode);
2624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
2626 return NULL;
2627 }
2628
2629 PyObject *
2630 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2631 Py_ssize_t size,
2632 const char *errors,
2633 int byteorder)
2634 {
2635 PyObject *v;
2636 unsigned char *p;
2637 Py_ssize_t nsize, bytesize;
2638 #ifdef Py_UNICODE_WIDE
2639 Py_ssize_t i, pairs;
2640 #else
2641 const int pairs = 0;
2642 #endif
2643 /* Offsets from p for storing byte pairs in the right order. */
2644 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2645 int ihi = 1, ilo = 0;
2646 #else
2647 int ihi = 0, ilo = 1;
2648 #endif
2649
2650 #define STORECHAR(CH) \
2651 do { \
2652 p[ihi] = ((CH) >> 8) & 0xff; \
2653 p[ilo] = (CH) & 0xff; \
2654 p += 2; \
2655 } while(0)
2656
2657 #ifdef Py_UNICODE_WIDE
2658 for (i = pairs = 0; i < size; i++)
2659 if (s[i] >= 0x10000)
2660 pairs++;
2661 #endif
2662 /* 2 * (size + pairs + (byteorder == 0)) */
2663 if (size > PY_SSIZE_T_MAX ||
2664 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2665 return PyErr_NoMemory();
2666 nsize = size + pairs + (byteorder == 0);
2667 bytesize = nsize * 2;
2668 if (bytesize / 2 != nsize)
2669 return PyErr_NoMemory();
2670 v = PyString_FromStringAndSize(NULL, bytesize);
2671 if (v == NULL)
2672 return NULL;
2673
2674 p = (unsigned char *)PyString_AS_STRING(v);
2675 if (byteorder == 0)
2676 STORECHAR(0xFEFF);
2677 if (size == 0)
2678 return v;
2679
2680 if (byteorder == -1) {
2681 /* force LE */
2682 ihi = 1;
2683 ilo = 0;
2684 }
2685 else if (byteorder == 1) {
2686 /* force BE */
2687 ihi = 0;
2688 ilo = 1;
2689 }
2690
2691 while (size-- > 0) {
2692 Py_UNICODE ch = *s++;
2693 Py_UNICODE ch2 = 0;
2694 #ifdef Py_UNICODE_WIDE
2695 if (ch >= 0x10000) {
2696 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2697 ch = 0xD800 | ((ch-0x10000) >> 10);
2698 }
2699 #endif
2700 STORECHAR(ch);
2701 if (ch2)
2702 STORECHAR(ch2);
2703 }
2704 return v;
2705 #undef STORECHAR
2706 }
2707
2708 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2709 {
2710 if (!PyUnicode_Check(unicode)) {
2711 PyErr_BadArgument();
2712 return NULL;
2713 }
2714 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2715 PyUnicode_GET_SIZE(unicode),
2716 NULL,
2717 0);
2718 }
2719
2720 /* --- Unicode Escape Codec ----------------------------------------------- */
2721
2722 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2723
2724 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2725 Py_ssize_t size,
2726 const char *errors)
2727 {
2728 const char *starts = s;
2729 Py_ssize_t startinpos;
2730 Py_ssize_t endinpos;
2731 Py_ssize_t outpos;
2732 int i;
2733 PyUnicodeObject *v;
2734 Py_UNICODE *p;
2735 const char *end;
2736 char* message;
2737 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2738 PyObject *errorHandler = NULL;
2739 PyObject *exc = NULL;
2740
2741 /* Escaped strings will always be longer than the resulting
2742 Unicode string, so we start with size here and then reduce the
2743 length after conversion to the true value.
2744 (but if the error callback returns a long replacement string
2745 we'll have to allocate more space) */
2746 v = _PyUnicode_New(size);
2747 if (v == NULL)
2748 goto onError;
2749 if (size == 0)
2750 return (PyObject *)v;
2751
2752 p = PyUnicode_AS_UNICODE(v);
2753 end = s + size;
2754
2755 while (s < end) {
2756 unsigned char c;
2757 Py_UNICODE x;
2758 int digits;
2759
2760 /* Non-escape characters are interpreted as Unicode ordinals */
2761 if (*s != '\\') {
2762 *p++ = (unsigned char) *s++;
2763 continue;
2764 }
2765
2766 startinpos = s-starts;
2767 /* \ - Escapes */
2768 s++;
2769 c = *s++;
2770 if (s > end)
2771 c = '\0'; /* Invalid after \ */
2772 switch (c) {
2773
2774 /* \x escapes */
2775 case '\n': break;
2776 case '\\': *p++ = '\\'; break;
2777 case '\'': *p++ = '\''; break;
2778 case '\"': *p++ = '\"'; break;
2779 case 'b': *p++ = '\b'; break;
2780 case 'f': *p++ = '\014'; break; /* FF */
2781 case 't': *p++ = '\t'; break;
2782 case 'n': *p++ = '\n'; break;
2783 case 'r': *p++ = '\r'; break;
2784 case 'v': *p++ = '\013'; break; /* VT */
2785 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2786
2787 /* \OOO (octal) escapes */
2788 case '0': case '1': case '2': case '3':
2789 case '4': case '5': case '6': case '7':
2790 x = s[-1] - '0';
2791 if (s < end && '0' <= *s && *s <= '7') {
2792 x = (x<<3) + *s++ - '0';
2793 if (s < end && '0' <= *s && *s <= '7')
2794 x = (x<<3) + *s++ - '0';
2795 }
2796 *p++ = x;
2797 break;
2798
2799 /* hex escapes */
2800 /* \xXX */
2801 case 'x':
2802 digits = 2;
2803 message = "truncated \\xXX escape";
2804 goto hexescape;
2805
2806 /* \uXXXX */
2807 case 'u':
2808 digits = 4;
2809 message = "truncated \\uXXXX escape";
2810 goto hexescape;
2811
2812 /* \UXXXXXXXX */
2813 case 'U':
2814 digits = 8;
2815 message = "truncated \\UXXXXXXXX escape";
2816 hexescape:
2817 chr = 0;
2818 outpos = p-PyUnicode_AS_UNICODE(v);
2819 if (s+digits>end) {
2820 endinpos = size;
2821 if (unicode_decode_call_errorhandler(
2822 errors, &errorHandler,
2823 "unicodeescape", "end of string in escape sequence",
2824 starts, size, &startinpos, &endinpos, &exc, &s,
2825 &v, &outpos, &p))
2826 goto onError;
2827 goto nextByte;
2828 }
2829 for (i = 0; i < digits; ++i) {
2830 c = (unsigned char) s[i];
2831 if (!isxdigit(c)) {
2832 endinpos = (s+i+1)-starts;
2833 if (unicode_decode_call_errorhandler(
2834 errors, &errorHandler,
2835 "unicodeescape", message,
2836 starts, size, &startinpos, &endinpos, &exc, &s,
2837 &v, &outpos, &p))
2838 goto onError;
2839 goto nextByte;
2840 }
2841 chr = (chr<<4) & ~0xF;
2842 if (c >= '0' && c <= '9')
2843 chr += c - '0';
2844 else if (c >= 'a' && c <= 'f')
2845 chr += 10 + c - 'a';
2846 else
2847 chr += 10 + c - 'A';
2848 }
2849 s += i;
2850 if (chr == 0xffffffff && PyErr_Occurred())
2851 /* _decoding_error will have already written into the
2852 target buffer. */
2853 break;
2854 store:
2855 /* when we get here, chr is a 32-bit unicode character */
2856 if (chr <= 0xffff)
2857 /* UCS-2 character */
2858 *p++ = (Py_UNICODE) chr;
2859 else if (chr <= 0x10ffff) {
2860 /* UCS-4 character. Either store directly, or as
2861 surrogate pair. */
2862 #ifdef Py_UNICODE_WIDE
2863 *p++ = chr;
2864 #else
2865 chr -= 0x10000L;
2866 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2867 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2868 #endif
2869 } else {
2870 endinpos = s-starts;
2871 outpos = p-PyUnicode_AS_UNICODE(v);
2872 if (unicode_decode_call_errorhandler(
2873 errors, &errorHandler,
2874 "unicodeescape", "illegal Unicode character",
2875 starts, size, &startinpos, &endinpos, &exc, &s,
2876 &v, &outpos, &p))
2877 goto onError;
2878 }
2879 break;
2880
2881 /* \N{name} */
2882 case 'N':
2883 message = "malformed \\N character escape";
2884 if (ucnhash_CAPI == NULL) {
2885 /* load the unicode data module */
2886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2887 if (ucnhash_CAPI == NULL)
2888 goto ucnhashError;
2889 }
2890 if (*s == '{') {
2891 const char *start = s+1;
2892 /* look for the closing brace */
2893 while (*s != '}' && s < end)
2894 s++;
2895 if (s > start && s < end && *s == '}') {
2896 /* found a name. look it up in the unicode database */
2897 message = "unknown Unicode character name";
2898 s++;
2899 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2900 goto store;
2901 }
2902 }
2903 endinpos = s-starts;
2904 outpos = p-PyUnicode_AS_UNICODE(v);
2905 if (unicode_decode_call_errorhandler(
2906 errors, &errorHandler,
2907 "unicodeescape", message,
2908 starts, size, &startinpos, &endinpos, &exc, &s,
2909 &v, &outpos, &p))
2910 goto onError;
2911 break;
2912
2913 default:
2914 if (s > end) {
2915 message = "\\ at end of string";
2916 s--;
2917 endinpos = s-starts;
2918 outpos = p-PyUnicode_AS_UNICODE(v);
2919 if (unicode_decode_call_errorhandler(
2920 errors, &errorHandler,
2921 "unicodeescape", message,
2922 starts, size, &startinpos, &endinpos, &exc, &s,
2923 &v, &outpos, &p))
2924 goto onError;
2925 }
2926 else {
2927 *p++ = '\\';
2928 *p++ = (unsigned char)s[-1];
2929 }
2930 break;
2931 }
2932 nextByte:
2933 ;
2934 }
2935 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2936 goto onError;
2937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
2939 return (PyObject *)v;
2940
2941 ucnhashError:
2942 PyErr_SetString(
2943 PyExc_UnicodeError,
2944 "\\N escapes not supported (can't load unicodedata module)"
2945 );
2946 Py_XDECREF(v);
2947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
2949 return NULL;
2950
2951 onError:
2952 Py_XDECREF(v);
2953 Py_XDECREF(errorHandler);
2954 Py_XDECREF(exc);
2955 return NULL;
2956 }
2957
2958 /* Return a Unicode-Escape string version of the Unicode object.
2959
2960 If quotes is true, the string is enclosed in u"" or u'' quotes as
2961 appropriate.
2962
2963 */
2964
2965 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2966 Py_ssize_t size,
2967 Py_UNICODE ch)
2968 {
2969 /* like wcschr, but doesn't stop at NULL characters */
2970
2971 while (size-- > 0) {
2972 if (*s == ch)
2973 return s;
2974 s++;
2975 }
2976
2977 return NULL;
2978 }
2979
2980 static
2981 PyObject *unicodeescape_string(const Py_UNICODE *s,
2982 Py_ssize_t size,
2983 int quotes)
2984 {
2985 PyObject *repr;
2986 char *p;
2987
2988 static const char *hexdigit = "0123456789abcdef";
2989 #ifdef Py_UNICODE_WIDE
2990 const Py_ssize_t expandsize = 10;
2991 #else
2992 const Py_ssize_t expandsize = 6;
2993 #endif
2994
2995 /* XXX(nnorwitz): rather than over-allocating, it would be
2996 better to choose a different scheme. Perhaps scan the
2997 first N-chars of the string and allocate based on that size.
2998 */
2999 /* Initial allocation is based on the longest-possible unichr
3000 escape.
3001
3002 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3003 unichr, so in this case it's the longest unichr escape. In
3004 narrow (UTF-16) builds this is five chars per source unichr
3005 since there are two unichrs in the surrogate pair, so in narrow
3006 (UTF-16) builds it's not the longest unichr escape.
3007
3008 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3009 so in the narrow (UTF-16) build case it's the longest unichr
3010 escape.
3011 */
3012
3013 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3014 return PyErr_NoMemory();
3015
3016 repr = PyString_FromStringAndSize(NULL,
3017 2
3018 + expandsize*size
3019 + 1);
3020 if (repr == NULL)
3021 return NULL;
3022
3023 p = PyString_AS_STRING(repr);
3024
3025 if (quotes) {
3026 *p++ = 'u';
3027 *p++ = (findchar(s, size, '\'') &&
3028 !findchar(s, size, '"')) ? '"' : '\'';
3029 }
3030 while (size-- > 0) {
3031 Py_UNICODE ch = *s++;
3032
3033 /* Escape quotes and backslashes */
3034 if ((quotes &&
3035 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3036 *p++ = '\\';
3037 *p++ = (char) ch;
3038 continue;
3039 }
3040
3041 #ifdef Py_UNICODE_WIDE
3042 /* Map 21-bit characters to '\U00xxxxxx' */
3043 else if (ch >= 0x10000) {
3044 *p++ = '\\';
3045 *p++ = 'U';
3046 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3047 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3048 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3049 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3050 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3053 *p++ = hexdigit[ch & 0x0000000F];
3054 continue;
3055 }
3056 #else
3057 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3058 else if (ch >= 0xD800 && ch < 0xDC00) {
3059 Py_UNICODE ch2;
3060 Py_UCS4 ucs;
3061
3062 ch2 = *s++;
3063 size--;
3064 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3065 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3066 *p++ = '\\';
3067 *p++ = 'U';
3068 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3069 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3070 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3071 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3072 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3075 *p++ = hexdigit[ucs & 0x0000000F];
3076 continue;
3077 }
3078 /* Fall through: isolated surrogates are copied as-is */
3079 s--;
3080 size++;
3081 }
3082 #endif
3083
3084 /* Map 16-bit characters to '\uxxxx' */
3085 if (ch >= 256) {
3086 *p++ = '\\';
3087 *p++ = 'u';
3088 *p++ = hexdigit[(ch >> 12) & 0x000F];
3089 *p++ = hexdigit[(ch >> 8) & 0x000F];
3090 *p++ = hexdigit[(ch >> 4) & 0x000F];
3091 *p++ = hexdigit[ch & 0x000F];
3092 }
3093
3094 /* Map special whitespace to '\t', \n', '\r' */
3095 else if (ch == '\t') {
3096 *p++ = '\\';
3097 *p++ = 't';
3098 }
3099 else if (ch == '\n') {
3100 *p++ = '\\';
3101 *p++ = 'n';
3102 }
3103 else if (ch == '\r') {
3104 *p++ = '\\';
3105 *p++ = 'r';
3106 }
3107
3108 /* Map non-printable US ASCII to '\xhh' */
3109 else if (ch < ' ' || ch >= 0x7F) {
3110 *p++ = '\\';
3111 *p++ = 'x';
3112 *p++ = hexdigit[(ch >> 4) & 0x000F];
3113 *p++ = hexdigit[ch & 0x000F];
3114 }
3115
3116 /* Copy everything else as-is */
3117 else
3118 *p++ = (char) ch;
3119 }
3120 if (quotes)
3121 *p++ = PyString_AS_STRING(repr)[1];
3122
3123 *p = '\0';
3124 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3125 return NULL;
3126 return repr;
3127 }
3128
3129 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3130 Py_ssize_t size)
3131 {
3132 return unicodeescape_string(s, size, 0);
3133 }
3134
3135 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3136 {
3137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
3139 return NULL;
3140 }
3141 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3142 PyUnicode_GET_SIZE(unicode));
3143 }
3144
3145 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3146
3147 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3148 Py_ssize_t size,
3149 const char *errors)
3150 {
3151 const char *starts = s;
3152 Py_ssize_t startinpos;
3153 Py_ssize_t endinpos;
3154 Py_ssize_t outpos;
3155 PyUnicodeObject *v;
3156 Py_UNICODE *p;
3157 const char *end;
3158 const char *bs;
3159 PyObject *errorHandler = NULL;
3160 PyObject *exc = NULL;
3161
3162 /* Escaped strings will always be longer than the resulting
3163 Unicode string, so we start with size here and then reduce the
3164 length after conversion to the true value. (But decoding error
3165 handler might have to resize the string) */
3166 v = _PyUnicode_New(size);
3167 if (v == NULL)
3168 goto onError;
3169 if (size == 0)
3170 return (PyObject *)v;
3171 p = PyUnicode_AS_UNICODE(v);
3172 end = s + size;
3173 while (s < end) {
3174 unsigned char c;
3175 Py_UCS4 x;
3176 int i;
3177 int count;
3178
3179 /* Non-escape characters are interpreted as Unicode ordinals */
3180 if (*s != '\\') {
3181 *p++ = (unsigned char)*s++;
3182 continue;
3183 }
3184 startinpos = s-starts;
3185
3186 /* \u-escapes are only interpreted iff the number of leading
3187 backslashes if odd */
3188 bs = s;
3189 for (;s < end;) {
3190 if (*s != '\\')
3191 break;
3192 *p++ = (unsigned char)*s++;
3193 }
3194 if (((s - bs) & 1) == 0 ||
3195 s >= end ||
3196 (*s != 'u' && *s != 'U')) {
3197 continue;
3198 }
3199 p--;
3200 count = *s=='u' ? 4 : 8;
3201 s++;
3202
3203 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3204 outpos = p-PyUnicode_AS_UNICODE(v);
3205 for (x = 0, i = 0; i < count; ++i, ++s) {
3206 c = (unsigned char)*s;
3207 if (!isxdigit(c)) {
3208 endinpos = s-starts;
3209 if (unicode_decode_call_errorhandler(
3210 errors, &errorHandler,
3211 "rawunicodeescape", "truncated \\uXXXX",
3212 starts, size, &startinpos, &endinpos, &exc, &s,
3213 &v, &outpos, &p))
3214 goto onError;
3215 goto nextByte;
3216 }
3217 x = (x<<4) & ~0xF;
3218 if (c >= '0' && c <= '9')
3219 x += c - '0';
3220 else if (c >= 'a' && c <= 'f')
3221 x += 10 + c - 'a';
3222 else
3223 x += 10 + c - 'A';
3224 }
3225 if (x <= 0xffff)
3226 /* UCS-2 character */
3227 *p++ = (Py_UNICODE) x;
3228 else if (x <= 0x10ffff) {
3229 /* UCS-4 character. Either store directly, or as
3230 surrogate pair. */
3231 #ifdef Py_UNICODE_WIDE
3232 *p++ = (Py_UNICODE) x;
3233 #else
3234 x -= 0x10000L;
3235 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3236 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3237 #endif
3238 } else {
3239 endinpos = s-starts;
3240 outpos = p-PyUnicode_AS_UNICODE(v);
3241 if (unicode_decode_call_errorhandler(
3242 errors, &errorHandler,
3243 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3244 starts, size, &startinpos, &endinpos, &exc, &s,
3245 &v, &outpos, &p))
3246 goto onError;
3247 }
3248 nextByte:
3249 ;
3250 }
3251 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3252 goto onError;
3253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
3255 return (PyObject *)v;
3256
3257 onError:
3258 Py_XDECREF(v);
3259 Py_XDECREF(errorHandler);
3260 Py_XDECREF(exc);
3261 return NULL;
3262 }
3263
3264 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3265 Py_ssize_t size)
3266 {
3267 PyObject *repr;
3268 char *p;
3269 char *q;
3270
3271 static const char *hexdigit = "0123456789abcdef";
3272 #ifdef Py_UNICODE_WIDE
3273 const Py_ssize_t expandsize = 10;
3274 #else
3275 const Py_ssize_t expandsize = 6;
3276 #endif
3277
3278 if (size > PY_SSIZE_T_MAX / expandsize)
3279 return PyErr_NoMemory();
3280
3281 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3282 if (repr == NULL)
3283 return NULL;
3284 if (size == 0)
3285 return repr;
3286
3287 p = q = PyString_AS_STRING(repr);
3288 while (size-- > 0) {
3289 Py_UNICODE ch = *s++;
3290 #ifdef Py_UNICODE_WIDE
3291 /* Map 32-bit characters to '\Uxxxxxxxx' */
3292 if (ch >= 0x10000) {
3293 *p++ = '\\';
3294 *p++ = 'U';
3295 *p++ = hexdigit[(ch >> 28) & 0xf];
3296 *p++ = hexdigit[(ch >> 24) & 0xf];
3297 *p++ = hexdigit[(ch >> 20) & 0xf];
3298 *p++ = hexdigit[(ch >> 16) & 0xf];
3299 *p++ = hexdigit[(ch >> 12) & 0xf];
3300 *p++ = hexdigit[(ch >> 8) & 0xf];
3301 *p++ = hexdigit[(ch >> 4) & 0xf];
3302 *p++ = hexdigit[ch & 15];
3303 }
3304 else
3305 #else
3306 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3307 if (ch >= 0xD800 && ch < 0xDC00) {
3308 Py_UNICODE ch2;
3309 Py_UCS4 ucs;
3310
3311 ch2 = *s++;
3312 size--;
3313 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3314 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3315 *p++ = '\\';
3316 *p++ = 'U';
3317 *p++ = hexdigit[(ucs >> 28) & 0xf];
3318 *p++ = hexdigit[(ucs >> 24) & 0xf];
3319 *p++ = hexdigit[(ucs >> 20) & 0xf];
3320 *p++ = hexdigit[(ucs >> 16) & 0xf];
3321 *p++ = hexdigit[(ucs >> 12) & 0xf];
3322 *p++ = hexdigit[(ucs >> 8) & 0xf];
3323 *p++ = hexdigit[(ucs >> 4) & 0xf];
3324 *p++ = hexdigit[ucs & 0xf];
3325 continue;
3326 }
3327 /* Fall through: isolated surrogates are copied as-is */
3328 s--;
3329 size++;
3330 }
3331 #endif
3332 /* Map 16-bit characters to '\uxxxx' */
3333 if (ch >= 256) {
3334 *p++ = '\\';
3335 *p++ = 'u';
3336 *p++ = hexdigit[(ch >> 12) & 0xf];
3337 *p++ = hexdigit[(ch >> 8) & 0xf];
3338 *p++ = hexdigit[(ch >> 4) & 0xf];
3339 *p++ = hexdigit[ch & 15];
3340 }
3341 /* Copy everything else as-is */
3342 else
3343 *p++ = (char) ch;
3344 }
3345 *p = '\0';
3346 if (_PyString_Resize(&repr, p - q))
3347 return NULL;
3348 return repr;
3349 }
3350
3351 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3352 {
3353 if (!PyUnicode_Check(unicode)) {
3354 PyErr_BadArgument();
3355 return NULL;
3356 }
3357 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3358 PyUnicode_GET_SIZE(unicode));
3359 }
3360
3361 /* --- Unicode Internal Codec ------------------------------------------- */
3362
3363 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3364 Py_ssize_t size,
3365 const char *errors)
3366 {
3367 const char *starts = s;
3368 Py_ssize_t startinpos;
3369 Py_ssize_t endinpos;
3370 Py_ssize_t outpos;
3371 PyUnicodeObject *v;
3372 Py_UNICODE *p;
3373 const char *end;
3374 const char *reason;
3375 PyObject *errorHandler = NULL;
3376 PyObject *exc = NULL;
3377
3378 #ifdef Py_UNICODE_WIDE
3379 Py_UNICODE unimax = PyUnicode_GetMax();
3380 #endif
3381
3382 /* XXX overflow detection missing */
3383 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3384 if (v == NULL)
3385 goto onError;
3386 if (PyUnicode_GetSize((PyObject *)v) == 0)
3387 return (PyObject *)v;
3388 p = PyUnicode_AS_UNICODE(v);
3389 end = s + size;
3390
3391 while (s < end) {
3392 memcpy(p, s, sizeof(Py_UNICODE));
3393 /* We have to sanity check the raw data, otherwise doom looms for
3394 some malformed UCS-4 data. */
3395 if (
3396 #ifdef Py_UNICODE_WIDE
3397 *p > unimax || *p < 0 ||
3398 #endif
3399 end-s < Py_UNICODE_SIZE
3400 )
3401 {
3402 startinpos = s - starts;
3403 if (end-s < Py_UNICODE_SIZE) {
3404 endinpos = end-starts;
3405 reason = "truncated input";
3406 }
3407 else {
3408 endinpos = s - starts + Py_UNICODE_SIZE;
3409 reason = "illegal code point (> 0x10FFFF)";
3410 }
3411 outpos = p - PyUnicode_AS_UNICODE(v);
3412 if (unicode_decode_call_errorhandler(
3413 errors, &errorHandler,
3414 "unicode_internal", reason,
3415 starts, size, &startinpos, &endinpos, &exc, &s,
3416 &v, &outpos, &p)) {
3417 goto onError;
3418 }
3419 }
3420 else {
3421 p++;
3422 s += Py_UNICODE_SIZE;
3423 }
3424 }
3425
3426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3427 goto onError;
3428 Py_XDECREF(errorHandler);
3429 Py_XDECREF(exc);
3430 return (PyObject *)v;
3431
3432 onError:
3433 Py_XDECREF(v);
3434 Py_XDECREF(errorHandler);
3435 Py_XDECREF(exc);
3436 return NULL;
3437 }
3438
3439 /* --- Latin-1 Codec ------------------------------------------------------ */
3440
3441 PyObject *PyUnicode_DecodeLatin1(const char *s,
3442 Py_ssize_t size,
3443 const char *errors)
3444 {
3445 PyUnicodeObject *v;
3446 Py_UNICODE *p;
3447
3448 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3449 if (size == 1) {
3450 Py_UNICODE r = *(unsigned char*)s;
3451 return PyUnicode_FromUnicode(&r, 1);
3452 }
3453
3454 v = _PyUnicode_New(size);
3455 if (v == NULL)
3456 goto onError;
3457 if (size == 0)
3458 return (PyObject *)v;
3459 p = PyUnicode_AS_UNICODE(v);
3460 while (size-- > 0)
3461 *p++ = (unsigned char)*s++;
3462 return (PyObject *)v;
3463
3464 onError:
3465 Py_XDECREF(v);
3466 return NULL;
3467 }
3468
3469 /* create or adjust a UnicodeEncodeError */
3470 static void make_encode_exception(PyObject **exceptionObject,
3471 const char *encoding,
3472 const Py_UNICODE *unicode, Py_ssize_t size,
3473 Py_ssize_t startpos, Py_ssize_t endpos,
3474 const char *reason)
3475 {
3476 if (*exceptionObject == NULL) {
3477 *exceptionObject = PyUnicodeEncodeError_Create(
3478 encoding, unicode, size, startpos, endpos, reason);
3479 }
3480 else {
3481 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3482 goto onError;
3483 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3486 goto onError;
3487 return;
3488 onError:
3489 Py_DECREF(*exceptionObject);
3490 *exceptionObject = NULL;
3491 }
3492 }
3493
3494 /* raises a UnicodeEncodeError */
3495 static void raise_encode_exception(PyObject **exceptionObject,
3496 const char *encoding,
3497 const Py_UNICODE *unicode, Py_ssize_t size,
3498 Py_ssize_t startpos, Py_ssize_t endpos,
3499 const char *reason)
3500 {
3501 make_encode_exception(exceptionObject,
3502 encoding, unicode, size, startpos, endpos, reason);
3503 if (*exceptionObject != NULL)
3504 PyCodec_StrictErrors(*exceptionObject);
3505 }
3506
3507 /* error handling callback helper:
3508 build arguments, call the callback and check the arguments,
3509 put the result into newpos and return the replacement string, which
3510 has to be freed by the caller */
3511 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3512 PyObject **errorHandler,
3513 const char *encoding, const char *reason,
3514 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3515 Py_ssize_t startpos, Py_ssize_t endpos,
3516 Py_ssize_t *newpos)
3517 {
3518 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3519
3520 PyObject *restuple;
3521 PyObject *resunicode;
3522
3523 if (*errorHandler == NULL) {
3524 *errorHandler = PyCodec_LookupError(errors);
3525 if (*errorHandler == NULL)
3526 return NULL;
3527 }
3528
3529 make_encode_exception(exceptionObject,
3530 encoding, unicode, size, startpos, endpos, reason);
3531 if (*exceptionObject == NULL)
3532 return NULL;
3533
3534 restuple = PyObject_CallFunctionObjArgs(
3535 *errorHandler, *exceptionObject, NULL);
3536 if (restuple == NULL)
3537 return NULL;
3538 if (!PyTuple_Check(restuple)) {
3539 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3540 Py_DECREF(restuple);
3541 return NULL;
3542 }
3543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3544 &resunicode, newpos)) {
3545 Py_DECREF(restuple);
3546 return NULL;
3547 }
3548 if (*newpos<0)
3549 *newpos = size+*newpos;
3550 if (*newpos<0 || *newpos>size) {
3551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3552 Py_DECREF(restuple);
3553 return NULL;
3554 }
3555 Py_INCREF(resunicode);
3556 Py_DECREF(restuple);
3557 return resunicode;
3558 }
3559
3560 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3561 Py_ssize_t size,
3562 const char *errors,
3563 int limit)
3564 {
3565 /* output object */
3566 PyObject *res;
3567 /* pointers to the beginning and end+1 of input */
3568 const Py_UNICODE *startp = p;
3569 const Py_UNICODE *endp = p + size;
3570 /* pointer to the beginning of the unencodable characters */
3571 /* const Py_UNICODE *badp = NULL; */
3572 /* pointer into the output */
3573 char *str;
3574 /* current output position */
3575 Py_ssize_t respos = 0;
3576 Py_ssize_t ressize;
3577 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3578 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3579 PyObject *errorHandler = NULL;
3580 PyObject *exc = NULL;
3581 /* the following variable is used for caching string comparisons
3582 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3583 int known_errorHandler = -1;
3584
3585 /* allocate enough for a simple encoding without
3586 replacements, if we need more, we'll resize */
3587 res = PyString_FromStringAndSize(NULL, size);
3588 if (res == NULL)
3589 goto onError;
3590 if (size == 0)
3591 return res;
3592 str = PyString_AS_STRING(res);
3593 ressize = size;
3594
3595 while (p<endp) {
3596 Py_UNICODE c = *p;
3597
3598 /* can we encode this? */
3599 if (c<limit) {
3600 /* no overflow check, because we know that the space is enough */
3601 *str++ = (char)c;
3602 ++p;
3603 }
3604 else {
3605 Py_ssize_t unicodepos = p-startp;
3606 Py_ssize_t requiredsize;
3607 PyObject *repunicode;
3608 Py_ssize_t repsize;
3609 Py_ssize_t newpos;
3610 Py_ssize_t respos;
3611 Py_UNICODE *uni2;
3612 /* startpos for collecting unencodable chars */
3613 const Py_UNICODE *collstart = p;
3614 const Py_UNICODE *collend = p;
3615 /* find all unecodable characters */
3616 while ((collend < endp) && ((*collend)>=limit))
3617 ++collend;
3618 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3619 if (known_errorHandler==-1) {
3620 if ((errors==NULL) || (!strcmp(errors, "strict")))
3621 known_errorHandler = 1;
3622 else if (!strcmp(errors, "replace"))
3623 known_errorHandler = 2;
3624 else if (!strcmp(errors, "ignore"))
3625 known_errorHandler = 3;
3626 else if (!strcmp(errors, "xmlcharrefreplace"))
3627 known_errorHandler = 4;
3628 else
3629 known_errorHandler = 0;
3630 }
3631 switch (known_errorHandler) {
3632 case 1: /* strict */
3633 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3634 goto onError;
3635 case 2: /* replace */
3636 while (collstart++<collend)
3637 *str++ = '?'; /* fall through */
3638 case 3: /* ignore */
3639 p = collend;
3640 break;
3641 case 4: /* xmlcharrefreplace */
3642 respos = str-PyString_AS_STRING(res);
3643 /* determine replacement size (temporarily (mis)uses p) */
3644 for (p = collstart, repsize = 0; p < collend; ++p) {
3645 if (*p<10)
3646 repsize += 2+1+1;
3647 else if (*p<100)
3648 repsize += 2+2+1;
3649 else if (*p<1000)
3650 repsize += 2+3+1;
3651 else if (*p<10000)
3652 repsize += 2+4+1;
3653 #ifndef Py_UNICODE_WIDE
3654 else
3655 repsize += 2+5+1;
3656 #else
3657 else if (*p<100000)
3658 repsize += 2+5+1;
3659 else if (*p<1000000)
3660 repsize += 2+6+1;
3661 else
3662 repsize += 2+7+1;
3663 #endif
3664 }
3665 requiredsize = respos+repsize+(endp-collend);
3666 if (requiredsize > ressize) {
3667 if (requiredsize<2*ressize)
3668 requiredsize = 2*ressize;
3669 if (_PyString_Resize(&res, requiredsize))
3670 goto onError;
3671 str = PyString_AS_STRING(res) + respos;
3672 ressize = requiredsize;
3673 }
3674 /* generate replacement (temporarily (mis)uses p) */
3675 for (p = collstart; p < collend; ++p) {
3676 str += sprintf(str, "&#%d;", (int)*p);
3677 }
3678 p = collend;
3679 break;
3680 default:
3681 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3682 encoding, reason, startp, size, &exc,
3683 collstart-startp, collend-startp, &newpos);
3684 if (repunicode == NULL)
3685 goto onError;
3686 /* need more space? (at least enough for what we have+the
3687 replacement+the rest of the string, so we won't have to
3688 check space for encodable characters) */
3689 respos = str-PyString_AS_STRING(res);
3690 repsize = PyUnicode_GET_SIZE(repunicode);
3691 requiredsize = respos+repsize+(endp-collend);
3692 if (requiredsize > ressize) {
3693 if (requiredsize<2*ressize)
3694 requiredsize = 2*ressize;
3695 if (_PyString_Resize(&res, requiredsize)) {
3696 Py_DECREF(repunicode);
3697 goto onError;
3698 }
3699 str = PyString_AS_STRING(res) + respos;
3700 ressize = requiredsize;
3701 }
3702 /* check if there is anything unencodable in the replacement
3703 and copy it to the output */
3704 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3705 c = *uni2;
3706 if (c >= limit) {
3707 raise_encode_exception(&exc, encoding, startp, size,
3708 unicodepos, unicodepos+1, reason);
3709 Py_DECREF(repunicode);
3710 goto onError;
3711 }
3712 *str = (char)c;
3713 }
3714 p = startp + newpos;
3715 Py_DECREF(repunicode);
3716 }
3717 }
3718 }
3719 /* Resize if we allocated to much */
3720 respos = str-PyString_AS_STRING(res);
3721 if (respos<ressize)
3722 /* If this falls res will be NULL */
3723 _PyString_Resize(&res, respos);
3724 Py_XDECREF(errorHandler);
3725 Py_XDECREF(exc);
3726 return res;
3727
3728 onError:
3729 Py_XDECREF(res);
3730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
3732 return NULL;
3733 }
3734
3735 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3736 Py_ssize_t size,
3737 const char *errors)
3738 {
3739 return unicode_encode_ucs1(p, size, errors, 256);
3740 }
3741
3742 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3743 {
3744 if (!PyUnicode_Check(unicode)) {
3745 PyErr_BadArgument();
3746 return NULL;
3747 }
3748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3749 PyUnicode_GET_SIZE(unicode),
3750 NULL);
3751 }
3752
3753 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3754
3755 PyObject *PyUnicode_DecodeASCII(const char *s,
3756 Py_ssize_t size,
3757 const char *errors)
3758 {
3759 const char *starts = s;
3760 PyUnicodeObject *v;
3761 Py_UNICODE *p;
3762 Py_ssize_t startinpos;
3763 Py_ssize_t endinpos;
3764 Py_ssize_t outpos;
3765 const char *e;
3766 PyObject *errorHandler = NULL;
3767 PyObject *exc = NULL;
3768
3769 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3770 if (size == 1 && *(unsigned char*)s < 128) {
3771 Py_UNICODE r = *(unsigned char*)s;
3772 return PyUnicode_FromUnicode(&r, 1);
3773 }
3774
3775 v = _PyUnicode_New(size);
3776 if (v == NULL)
3777 goto onError;
3778 if (size == 0)
3779 return (PyObject *)v;
3780 p = PyUnicode_AS_UNICODE(v);
3781 e = s + size;
3782 while (s < e) {
3783 register unsigned char c = (unsigned char)*s;
3784 if (c < 128) {
3785 *p++ = c;
3786 ++s;
3787 }
3788 else {
3789 startinpos = s-starts;
3790 endinpos = startinpos + 1;
3791 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3792 if (unicode_decode_call_errorhandler(
3793 errors, &errorHandler,
3794 "ascii", "ordinal not in range(128)",
3795 starts, size, &startinpos, &endinpos, &exc, &s,
3796 &v, &outpos, &p))
3797 goto onError;
3798 }
3799 }
3800 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3802 goto onError;
3803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
3805 return (PyObject *)v;
3806
3807 onError:
3808 Py_XDECREF(v);
3809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
3811 return NULL;
3812 }
3813
3814 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3815 Py_ssize_t size,
3816 const char *errors)
3817 {
3818 return unicode_encode_ucs1(p, size, errors, 128);
3819 }
3820
3821 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3822 {
3823 if (!PyUnicode_Check(unicode)) {
3824 PyErr_BadArgument();
3825 return NULL;
3826 }
3827 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3828 PyUnicode_GET_SIZE(unicode),
3829 NULL);
3830 }
3831
3832 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3833
3834 /* --- MBCS codecs for Windows -------------------------------------------- */
3835
3836 #if SIZEOF_INT < SIZEOF_SIZE_T
3837 #define NEED_RETRY
3838 #endif
3839
3840 /* XXX This code is limited to "true" double-byte encodings, as
3841 a) it assumes an incomplete character consists of a single byte, and
3842 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3843 encodings, see IsDBCSLeadByteEx documentation. */
3844
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.  The byte must satisfy IsDBCSLeadByte(); in addition the
   character found by CharPrev() must either be the byte itself, not be
   a lead byte, or lie exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3855
3856 /*
3857 * Decode MBCS string into unicode object. If 'final' is set, converts
3858 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3859 */
3860 static int decode_mbcs(PyUnicodeObject **v,
3861 const char *s, /* MBCS string */
3862 int size, /* sizeof MBCS string */
3863 int final)
3864 {
3865 Py_UNICODE *p;
3866 Py_ssize_t n = 0;
3867 int usize = 0;
3868
3869 assert(size >= 0);
3870
3871 /* Skip trailing lead-byte unless 'final' is set */
3872 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3873 --size;
3874
3875 /* First get the size of the result */
3876 if (size > 0) {
3877 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3878 if (usize == 0) {
3879 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3880 return -1;
3881 }
3882 }
3883
3884 if (*v == NULL) {
3885 /* Create unicode object */
3886 *v = _PyUnicode_New(usize);
3887 if (*v == NULL)
3888 return -1;
3889 }
3890 else {
3891 /* Extend unicode object */
3892 n = PyUnicode_GET_SIZE(*v);
3893 if (_PyUnicode_Resize(v, n + usize) < 0)
3894 return -1;
3895 }
3896
3897 /* Do the conversion */
3898 if (size > 0) {
3899 p = PyUnicode_AS_UNICODE(*v) + n;
3900 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3902 return -1;
3903 }
3904 }
3905
3906 return size;
3907 }
3908
3909 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3910 Py_ssize_t size,
3911 const char *errors,
3912 Py_ssize_t *consumed)
3913 {
3914 PyUnicodeObject *v = NULL;
3915 int done;
3916
3917 if (consumed)
3918 *consumed = 0;
3919
3920 #ifdef NEED_RETRY
3921 retry:
3922 if (size > INT_MAX)
3923 done = decode_mbcs(&v, s, INT_MAX, 0);
3924 else
3925 #endif
3926 done = decode_mbcs(&v, s, (int)size, !consumed);
3927
3928 if (done < 0) {
3929 Py_XDECREF(v);
3930 return NULL;
3931 }
3932
3933 if (consumed)
3934 *consumed += done;
3935
3936 #ifdef NEED_RETRY
3937 if (size > INT_MAX) {
3938 s += done;
3939 size -= done;
3940 goto retry;
3941 }
3942 #endif
3943
3944 return (PyObject *)v;
3945 }
3946
3947 PyObject *PyUnicode_DecodeMBCS(const char *s,
3948 Py_ssize_t size,
3949 const char *errors)
3950 {
3951 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3952 }
3953
3954 /*
3955 * Convert unicode into string object (MBCS).
3956 * Returns 0 if succeed, -1 otherwise.
3957 */
3958 static int encode_mbcs(PyObject **repr,
3959 const Py_UNICODE *p, /* unicode */
3960 int size) /* size of unicode */
3961 {
3962 int mbcssize = 0;
3963 Py_ssize_t n = 0;
3964
3965 assert(size >= 0);
3966
3967 /* First get the size of the result */
3968 if (size > 0) {
3969 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3970 if (mbcssize == 0) {
3971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3972 return -1;
3973 }
3974 }
3975
3976 if (*repr == NULL) {
3977 /* Create string object */
3978 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3979 if (*repr == NULL)
3980 return -1;
3981 }
3982 else {
3983 /* Extend string object */
3984 n = PyString_Size(*repr);
3985 if (_PyString_Resize(repr, n + mbcssize) < 0)
3986 return -1;
3987 }
3988
3989 /* Do the conversion */
3990 if (size > 0) {
3991 char *s = PyString_AS_STRING(*repr) + n;
3992 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3993 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3994 return -1;
3995 }
3996 }
3997
3998 return 0;
3999 }
4000
4001 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4002 Py_ssize_t size,
4003 const char *errors)
4004 {
4005 PyObject *repr = NULL;
4006 int ret;
4007
4008 #ifdef NEED_RETRY
4009 retry:
4010 if (size > INT_MAX)
4011 ret = encode_mbcs(&repr, p, INT_MAX);
4012 else
4013 #endif
4014 ret = encode_mbcs(&repr, p, (int)size);
4015
4016 if (ret < 0) {
4017 Py_XDECREF(repr);
4018 return NULL;
4019 }
4020
4021 #ifdef NEED_RETRY
4022 if (size > INT_MAX) {
4023 p += INT_MAX;
4024 size -= INT_MAX;
4025 goto retry;
4026 }
4027 #endif
4028
4029 return repr;
4030 }
4031
4032 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4033 {
4034 if (!PyUnicode_Check(unicode)) {
4035 PyErr_BadArgument();
4036 return NULL;
4037 }
4038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4039 PyUnicode_GET_SIZE(unicode),
4040 NULL);
4041 }
4042
4043 #undef NEED_RETRY
4044
4045 #endif /* MS_WINDOWS */
4046
4047 /* --- Character Mapping Codec -------------------------------------------- */
4048
4049 PyObject *PyUnicode_DecodeCharmap(const char *s,
4050 Py_ssize_t size,
4051 PyObject *mapping,
4052 const char *errors)
4053 {
4054 const char *starts = s;
4055 Py_ssize_t startinpos;
4056 Py_ssize_t endinpos;
4057 Py_ssize_t outpos;
4058 const char *e;
4059 PyUnicodeObject *v;
4060 Py_UNICODE *p;
4061 Py_ssize_t extrachars = 0;
4062 PyObject *errorHandler = NULL;
4063 PyObject *exc = NULL;
4064 Py_UNICODE *mapstring = NULL;
4065 Py_ssize_t maplen = 0;
4066
4067 /* Default to Latin-1 */
4068 if (mapping == NULL)
4069 return PyUnicode_DecodeLatin1(s, size, errors);
4070
4071 v = _PyUnicode_New(size);
4072 if (v == NULL)
4073 goto onError;
4074 if (size == 0)
4075 return (PyObject *)v;
4076 p = PyUnicode_AS_UNICODE(v);
4077 e = s + size;
4078 if (PyUnicode_CheckExact(mapping)) {
4079 mapstring = PyUnicode_AS_UNICODE(mapping);
4080 maplen = PyUnicode_GET_SIZE(mapping);
4081 while (s < e) {
4082 unsigned char ch = *s;
4083 Py_UNICODE x = 0xfffe; /* illegal value */
4084
4085 if (ch < maplen)
4086 x = mapstring[ch];
4087
4088 if (x == 0xfffe) {
4089 /* undefined mapping */
4090 outpos = p-PyUnicode_AS_UNICODE(v);
4091 startinpos = s-starts;
4092 endinpos = startinpos+1;
4093 if (unicode_decode_call_errorhandler(
4094 errors, &errorHandler,
4095 "charmap", "character maps to <undefined>",
4096 starts, size, &startinpos, &endinpos, &exc, &s,
4097 &v, &outpos, &p)) {
4098 goto onError;
4099 }
4100 continue;
4101 }
4102 *p++ = x;
4103 ++s;
4104 }
4105 }
4106 else {
4107 while (s < e) {
4108 unsigned char ch = *s;
4109 PyObject *w, *x;
4110
4111 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4112 w = PyInt_FromLong((long)ch);
4113 if (w == NULL)
4114 goto onError;
4115 x = PyObject_GetItem(mapping, w);
4116 Py_DECREF(w);
4117 if (x == NULL) {
4118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4119 /* No mapping found means: mapping is undefined. */
4120 PyErr_Clear();
4121 x = Py_None;
4122 Py_INCREF(x);
4123 } else
4124 goto onError;
4125 }
4126
4127 /* Apply mapping */
4128 if (PyInt_Check(x)) {
4129 long value = PyInt_AS_LONG(x);
4130 if (value < 0 || value > 65535) {
4131 PyErr_SetString(PyExc_TypeError,
4132 "character mapping must be in range(65536)");
4133 Py_DECREF(x);
4134 goto onError;
4135 }
4136 *p++ = (Py_UNICODE)value;
4137 }
4138 else if (x == Py_None) {
4139 /* undefined mapping */
4140 outpos = p-PyUnicode_AS_UNICODE(v);
4141 startinpos = s-starts;
4142 endinpos = startinpos+1;
4143 if (unicode_decode_call_errorhandler(
4144 errors, &errorHandler,
4145 "charmap", "character maps to <undefined>",
4146 starts, size, &startinpos, &endinpos, &exc, &s,
4147 &v, &outpos, &p)) {
4148 Py_DECREF(x);
4149 goto onError;
4150 }
4151 Py_DECREF(x);
4152 continue;
4153 }
4154 else if (PyUnicode_Check(x)) {
4155 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4156
4157 if (targetsize == 1)
4158 /* 1-1 mapping */
4159 *p++ = *PyUnicode_AS_UNICODE(x);
4160
4161 else if (targetsize > 1) {
4162 /* 1-n mapping */
4163 if (targetsize > extrachars) {
4164 /* resize first */
4165 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4166 Py_ssize_t needed = (targetsize - extrachars) + \
4167 (targetsize << 2);
4168 extrachars += needed;
4169 /* XXX overflow detection missing */
4170 if (_PyUnicode_Resize(&v,
4171 PyUnicode_GET_SIZE(v) + needed) < 0) {
4172 Py_DECREF(x);
4173 goto onError;
4174 }
4175 p = PyUnicode_AS_UNICODE(v) + oldpos;
4176 }
4177 Py_UNICODE_COPY(p,
4178 PyUnicode_AS_UNICODE(x),
4179 targetsize);
4180 p += targetsize;
4181 extrachars -= targetsize;
4182 }
4183 /* 1-0 mapping: skip the character */
4184 }
4185 else {
4186 /* wrong return value */
4187 PyErr_SetString(PyExc_TypeError,
4188 "character mapping must return integer, None or unicode");
4189 Py_DECREF(x);
4190 goto onError;
4191 }
4192 Py_DECREF(x);
4193 ++s;
4194 }
4195 }
4196 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4197 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4198 goto onError;
4199 Py_XDECREF(errorHandler);
4200 Py_XDECREF(exc);
4201 return (PyObject *)v;
4202
4203 onError:
4204 Py_XDECREF(errorHandler);
4205 Py_XDECREF(exc);
4206 Py_XDECREF(v);
4207 return NULL;
4208 }
4209
4210 /* Charmap encoding: the lookup table */
4211
4212 struct encoding_map{
4213 PyObject_HEAD
4214 unsigned char level1[32];
4215 int count2, count3;
4216 unsigned char level23[1];
4217 };
4218
4219 static PyObject*
4220 encoding_map_size(PyObject *obj, PyObject* args)
4221 {
4222 struct encoding_map *map = (struct encoding_map*)obj;
4223 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4224 128*map->count3);
4225 }
4226
4227 static PyMethodDef encoding_map_methods[] = {
4228 {"size", encoding_map_size, METH_NOARGS,
4229 PyDoc_STR("Return the size (in bytes) of this object") },
4230 { 0 }
4231 };
4232
4233 static void
4234 encoding_map_dealloc(PyObject* o)
4235 {
4236 PyObject_FREE(o);
4237 }
4238
4239 static PyTypeObject EncodingMapType = {
4240 PyVarObject_HEAD_INIT(NULL, 0)
4241 "EncodingMap", /*tp_name*/
4242 sizeof(struct encoding_map), /*tp_basicsize*/
4243 0, /*tp_itemsize*/
4244 /* methods */
4245 encoding_map_dealloc, /*tp_dealloc*/
4246 0, /*tp_print*/
4247 0, /*tp_getattr*/
4248 0, /*tp_setattr*/
4249 0, /*tp_compare*/
4250 0, /*tp_repr*/
4251 0, /*tp_as_number*/
4252 0, /*tp_as_sequence*/
4253 0, /*tp_as_mapping*/
4254 0, /*tp_hash*/
4255 0, /*tp_call*/
4256 0, /*tp_str*/
4257 0, /*tp_getattro*/
4258 0, /*tp_setattro*/
4259 0, /*tp_as_buffer*/
4260 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4261 0, /*tp_doc*/
4262 0, /*tp_traverse*/
4263 0, /*tp_clear*/
4264 0, /*tp_richcompare*/
4265 0, /*tp_weaklistoffset*/
4266 0, /*tp_iter*/
4267 0, /*tp_iternext*/
4268 encoding_map_methods, /*tp_methods*/
4269 0, /*tp_members*/
4270 0, /*tp_getset*/
4271 0, /*tp_base*/
4272 0, /*tp_dict*/
4273 0, /*tp_descr_get*/
4274 0, /*tp_descr_set*/
4275 0, /*tp_dictoffset*/
4276 0, /*tp_init*/
4277 0, /*tp_alloc*/
4278 0, /*tp_new*/
4279 0, /*tp_free*/
4280 0, /*tp_is_gc*/
4281 };
4282
4283 PyObject*
4284 PyUnicode_BuildEncodingMap(PyObject* string)
4285 {
4286 Py_UNICODE *decode;
4287 PyObject *result;
4288 struct encoding_map *mresult;
4289 int i;
4290 int need_dict = 0;
4291 unsigned char level1[32];
4292 unsigned char level2[512];
4293 unsigned char *mlevel1, *mlevel2, *mlevel3;
4294 int count2 = 0, count3 = 0;
4295
4296 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4297 PyErr_BadArgument();
4298 return NULL;
4299 }
4300 decode = PyUnicode_AS_UNICODE(string);
4301 memset(level1, 0xFF, sizeof level1);
4302 memset(level2, 0xFF, sizeof level2);
4303
4304 /* If there isn't a one-to-one mapping of NULL to \0,
4305 or if there are non-BMP characters, we need to use
4306 a mapping dictionary. */
4307 if (decode[0] != 0)
4308 need_dict = 1;
4309 for (i = 1; i < 256; i++) {
4310 int l1, l2;
4311 if (decode[i] == 0
4312 #ifdef Py_UNICODE_WIDE
4313 || decode[i] > 0xFFFF
4314 #endif
4315 ) {
4316 need_dict = 1;
4317 break;
4318 }
4319 if (decode[i] == 0xFFFE)
4320 /* unmapped character */
4321 continue;
4322 l1 = decode[i] >> 11;
4323 l2 = decode[i] >> 7;
4324 if (level1[l1] == 0xFF)
4325 level1[l1] = count2++;
4326 if (level2[l2] == 0xFF)
4327 level2[l2] = count3++;
4328 }
4329
4330 if (count2 >= 0xFF || count3 >= 0xFF)
4331 need_dict = 1;
4332
4333 if (need_dict) {
4334 PyObject *result = PyDict_New();
4335 PyObject *key, *value;
4336 if (!result)
4337 return NULL;
4338 for (i = 0; i < 256; i++) {
4339 value = NULL;
4340 key = PyInt_FromLong(decode[i]);
4341 value = PyInt_FromLong(i);
4342 if (!key || !value)
4343 goto failed1;
4344 if (PyDict_SetItem(result, key, value) == -1)
4345 goto failed1;
4346 Py_DECREF(key);
4347 Py_DECREF(value);
4348 }
4349 return result;
4350 failed1:
4351 Py_XDECREF(key);
4352 Py_XDECREF(value);
4353 Py_DECREF(result);
4354 return NULL;
4355 }
4356
4357 /* Create a three-level trie */
4358 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4359 16*count2 + 128*count3 - 1);
4360 if (!result)
4361 return PyErr_NoMemory();
4362 PyObject_Init(result, &EncodingMapType);
4363 mresult = (struct encoding_map*)result;
4364 mresult->count2 = count2;
4365 mresult->count3 = count3;
4366 mlevel1 = mresult->level1;
4367 mlevel2 = mresult->level23;
4368 mlevel3 = mresult->level23 + 16*count2;
4369 memcpy(mlevel1, level1, 32);
4370 memset(mlevel2, 0xFF, 16*count2);
4371 memset(mlevel3, 0, 128*count3);
4372 count3 = 0;
4373 for (i = 1; i < 256; i++) {
4374 int o1, o2, o3, i2, i3;
4375 if (decode[i] == 0xFFFE)
4376 /* unmapped character */
4377 continue;
4378 o1 = decode[i]>>11;
4379 o2 = (decode[i]>>7) & 0xF;
4380 i2 = 16*mlevel1[o1] + o2;
4381 if (mlevel2[i2] == 0xFF)
4382 mlevel2[i2] = count3++;
4383 o3 = decode[i] & 0x7F;
4384 i3 = 128*mlevel2[i2] + o3;
4385 mlevel3[i3] = i;
4386 }
4387 return result;
4388 }
4389
4390 static int
4391 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4392 {
4393 struct encoding_map *map = (struct encoding_map*)mapping;
4394 int l1 = c>>11;
4395 int l2 = (c>>7) & 0xF;
4396 int l3 = c & 0x7F;
4397 int i;
4398
4399 #ifdef Py_UNICODE_WIDE
4400 if (c > 0xFFFF) {
4401 return -1;
4402 }
4403 #endif
4404 if (c == 0)
4405 return 0;
4406 /* level 1*/
4407 i = map->level1[l1];
4408 if (i == 0xFF) {
4409 return -1;
4410 }
4411 /* level 2*/
4412 i = map->level23[16*i+l2];
4413 if (i == 0xFF) {
4414 return -1;
4415 }
4416 /* level 3 */
4417 i = map->level23[16*map->count2 + 128*i + l3];
4418 if (i == 0) {
4419 return -1;
4420 }
4421 return i;
4422 }
4423
4424 /* Lookup the character ch in the mapping. If the character
4425 can't be found, Py_None is returned (or NULL, if another
4426 error occurred). */
4427 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4428 {
4429 PyObject *w = PyInt_FromLong((long)c);
4430 PyObject *x;
4431
4432 if (w == NULL)
4433 return NULL;
4434 x = PyObject_GetItem(mapping, w);
4435 Py_DECREF(w);
4436 if (x == NULL) {
4437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4438 /* No mapping found means: mapping is undefined. */
4439 PyErr_Clear();
4440 x = Py_None;
4441 Py_INCREF(x);
4442 return x;
4443 } else
4444 return NULL;
4445 }
4446 else if (x == Py_None)
4447 return x;
4448 else if (PyInt_Check(x)) {
4449 long value = PyInt_AS_LONG(x);
4450 if (value < 0 || value > 255) {
4451 PyErr_SetString(PyExc_TypeError,
4452 "character mapping must be in range(256)");
4453 Py_DECREF(x);
4454 return NULL;
4455 }
4456 return x;
4457 }
4458 else if (PyString_Check(x))
4459 return x;
4460 else {
4461 /* wrong return value */
4462 PyErr_SetString(PyExc_TypeError,
4463 "character mapping must return integer, None or str");
4464 Py_DECREF(x);
4465 return NULL;
4466 }
4467 }
4468
4469 static int
4470 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4471 {
4472 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4473 /* exponentially overallocate to minimize reallocations */
4474 if (requiredsize < 2*outsize)
4475 requiredsize = 2*outsize;
4476 if (_PyString_Resize(outobj, requiredsize)) {
4477 return 0;
4478 }
4479 return 1;
4480 }
4481
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a reallocation failed */
} charmapencode_result;
4485 /* lookup the character, put the result in the output string and adjust
4486 various state variables. Reallocate the output string if not enough
4487 space is available. Return a new reference to the object that
4488 was put in the output buffer, or Py_None, if the mapping was undefined
4489 (in which case no character was written) or NULL, if a
4490 reallocation error occurred. The caller must decref the result */
4491 static
4492 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4493 PyObject **outobj, Py_ssize_t *outpos)
4494 {
4495 PyObject *rep;
4496 char *outstart;
4497 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4498
4499 if (Py_TYPE(mapping) == &EncodingMapType) {
4500 int res = encoding_map_lookup(c, mapping);
4501 Py_ssize_t requiredsize = *outpos+1;
4502 if (res == -1)
4503 return enc_FAILED;
4504 if (outsize<requiredsize)
4505 if (!charmapencode_resize(outobj, outpos, requiredsize))
4506 return enc_EXCEPTION;
4507 outstart = PyString_AS_STRING(*outobj);
4508 outstart[(*outpos)++] = (char)res;
4509 return enc_SUCCESS;
4510 }
4511
4512 rep = charmapencode_lookup(c, mapping);
4513 if (rep==NULL)
4514 return enc_EXCEPTION;
4515 else if (rep==Py_None) {
4516 Py_DECREF(rep);
4517 return enc_FAILED;
4518 } else {
4519 if (PyInt_Check(rep)) {
4520 Py_ssize_t requiredsize = *outpos+1;
4521 if (outsize<requiredsize)
4522 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4523 Py_DECREF(rep);
4524 return enc_EXCEPTION;
4525 }
4526 outstart = PyString_AS_STRING(*outobj);
4527 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4528 }
4529 else {
4530 const char *repchars = PyString_AS_STRING(rep);
4531 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4532 Py_ssize_t requiredsize = *outpos+repsize;
4533 if (outsize<requiredsize)
4534 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4535 Py_DECREF(rep);
4536 return enc_EXCEPTION;
4537 }
4538 outstart = PyString_AS_STRING(*outobj);
4539 memcpy(outstart + *outpos, repchars, repsize);
4540 *outpos += repsize;
4541 }
4542 }
4543 Py_DECREF(rep);
4544 return enc_SUCCESS;
4545 }
4546
4547 /* handle an error in PyUnicode_EncodeCharmap
4548 Return 0 on success, -1 on error */
4549 static
4550 int charmap_encoding_error(
4551 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4552 PyObject **exceptionObject,
4553 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4554 PyObject **res, Py_ssize_t *respos)
4555 {
4556 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4557 Py_ssize_t repsize;
4558 Py_ssize_t newpos;
4559 Py_UNICODE *uni2;
4560 /* startpos for collecting unencodable chars */
4561 Py_ssize_t collstartpos = *inpos;
4562 Py_ssize_t collendpos = *inpos+1;
4563 Py_ssize_t collpos;
4564 char *encoding = "charmap";
4565 char *reason = "character maps to <undefined>";
4566 charmapencode_result x;
4567
4568 /* find all unencodable characters */
4569 while (collendpos < size) {
4570 PyObject *rep;
4571 if (Py_TYPE(mapping) == &EncodingMapType) {
4572 int res = encoding_map_lookup(p[collendpos], mapping);
4573 if (res != -1)
4574 break;
4575 ++collendpos;
4576 continue;
4577 }
4578
4579 rep = charmapencode_lookup(p[collendpos], mapping);
4580 if (rep==NULL)
4581 return -1;
4582 else if (rep!=Py_None) {
4583 Py_DECREF(rep);
4584 break;
4585 }
4586 Py_DECREF(rep);
4587 ++collendpos;
4588 }
4589 /* cache callback name lookup
4590 * (if not done yet, i.e. it's the first error) */
4591 if (*known_errorHandler==-1) {
4592 if ((errors==NULL) || (!strcmp(errors, "strict")))
4593 *known_errorHandler = 1;
4594 else if (!strcmp(errors, "replace"))
4595 *known_errorHandler = 2;
4596 else if (!strcmp(errors, "ignore"))
4597 *known_errorHandler = 3;
4598 else if (!strcmp(errors, "xmlcharrefreplace"))
4599 *known_errorHandler = 4;
4600 else
4601 *known_errorHandler = 0;
4602 }
4603 switch (*known_errorHandler) {
4604 case 1: /* strict */
4605 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4606 return -1;
4607 case 2: /* replace */
4608 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4609 x = charmapencode_output('?', mapping, res, respos);
4610 if (x==enc_EXCEPTION) {
4611 return -1;
4612 }
4613 else if (x==enc_FAILED) {
4614 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4615 return -1;
4616 }
4617 }
4618 /* fall through */
4619 case 3: /* ignore */
4620 *inpos = collendpos;
4621 break;
4622 case 4: /* xmlcharrefreplace */
4623 /* generate replacement (temporarily (mis)uses p) */
4624 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4625 char buffer[2+29+1+1];
4626 char *cp;
4627 sprintf(buffer, "&#%d;", (int)p[collpos]);
4628 for (cp = buffer; *cp; ++cp) {
4629 x = charmapencode_output(*cp, mapping, res, respos);
4630 if (x==enc_EXCEPTION)
4631 return -1;
4632 else if (x==enc_FAILED) {
4633 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4634 return -1;
4635 }
4636 }
4637 }
4638 *inpos = collendpos;
4639 break;
4640 default:
4641 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4642 encoding, reason, p, size, exceptionObject,
4643 collstartpos, collendpos, &newpos);
4644 if (repunicode == NULL)
4645 return -1;
4646 /* generate replacement */
4647 repsize = PyUnicode_GET_SIZE(repunicode);
4648 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4649 x = charmapencode_output(*uni2, mapping, res, respos);
4650 if (x==enc_EXCEPTION) {
4651 return -1;
4652 }
4653 else if (x==enc_FAILED) {
4654 Py_DECREF(repunicode);
4655 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4656 return -1;
4657 }
4658 }
4659 *inpos = newpos;
4660 Py_DECREF(repunicode);
4661 }
4662 return 0;
4663 }
4664
4665 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4666 Py_ssize_t size,
4667 PyObject *mapping,
4668 const char *errors)
4669 {
4670 /* output object */
4671 PyObject *res = NULL;
4672 /* current input position */
4673 Py_ssize_t inpos = 0;
4674 /* current output position */
4675 Py_ssize_t respos = 0;
4676 PyObject *errorHandler = NULL;
4677 PyObject *exc = NULL;
4678 /* the following variable is used for caching string comparisons
4679 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4680 * 3=ignore, 4=xmlcharrefreplace */
4681 int known_errorHandler = -1;
4682
4683 /* Default to Latin-1 */
4684 if (mapping == NULL)
4685 return PyUnicode_EncodeLatin1(p, size, errors);
4686
4687 /* allocate enough for a simple encoding without
4688 replacements, if we need more, we'll resize */
4689 res = PyString_FromStringAndSize(NULL, size);
4690 if (res == NULL)
4691 goto onError;
4692 if (size == 0)
4693 return res;
4694
4695 while (inpos<size) {
4696 /* try to encode it */
4697 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4698 if (x==enc_EXCEPTION) /* error */
4699 goto onError;
4700 if (x==enc_FAILED) { /* unencodable character */
4701 if (charmap_encoding_error(p, size, &inpos, mapping,
4702 &exc,
4703 &known_errorHandler, &errorHandler, errors,
4704 &res, &respos)) {
4705 goto onError;
4706 }
4707 }
4708 else
4709 /* done with this character => adjust input position */
4710 ++inpos;
4711 }
4712
4713 /* Resize if we allocated to much */
4714 if (respos<PyString_GET_SIZE(res)) {
4715 if (_PyString_Resize(&res, respos))
4716 goto onError;
4717 }
4718 Py_XDECREF(exc);
4719 Py_XDECREF(errorHandler);
4720 return res;
4721
4722 onError:
4723 Py_XDECREF(res);
4724 Py_XDECREF(exc);
4725 Py_XDECREF(errorHandler);
4726 return NULL;
4727 }
4728
4729 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4730 PyObject *mapping)
4731 {
4732 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4733 PyErr_BadArgument();
4734 return NULL;
4735 }
4736 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4737 PyUnicode_GET_SIZE(unicode),
4738 mapping,
4739 NULL);
4740 }
4741
4742 /* create or adjust a UnicodeTranslateError */
4743 static void make_translate_exception(PyObject **exceptionObject,
4744 const Py_UNICODE *unicode, Py_ssize_t size,
4745 Py_ssize_t startpos, Py_ssize_t endpos,
4746 const char *reason)
4747 {
4748 if (*exceptionObject == NULL) {
4749 *exceptionObject = PyUnicodeTranslateError_Create(
4750 unicode, size, startpos, endpos, reason);
4751 }
4752 else {
4753 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4754 goto onError;
4755 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4756 goto onError;
4757 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4758 goto onError;
4759 return;
4760 onError:
4761 Py_DECREF(*exceptionObject);
4762 *exceptionObject = NULL;
4763 }
4764 }
4765
4766 /* raises a UnicodeTranslateError */
4767 static void raise_translate_exception(PyObject **exceptionObject,
4768 const Py_UNICODE *unicode, Py_ssize_t size,
4769 Py_ssize_t startpos, Py_ssize_t endpos,
4770 const char *reason)
4771 {
4772 make_translate_exception(exceptionObject,
4773 unicode, size, startpos, endpos, reason);
4774 if (*exceptionObject != NULL)
4775 PyCodec_StrictErrors(*exceptionObject);
4776 }
4777
4778 /* error handling callback helper:
4779 build arguments, call the callback and check the arguments,
4780 put the result into newpos and return the replacement string, which
4781 has to be freed by the caller */
4782 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4783 PyObject **errorHandler,
4784 const char *reason,
4785 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4786 Py_ssize_t startpos, Py_ssize_t endpos,
4787 Py_ssize_t *newpos)
4788 {
4789 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4790
4791 Py_ssize_t i_newpos;
4792 PyObject *restuple;
4793 PyObject *resunicode;
4794
4795 if (*errorHandler == NULL) {
4796 *errorHandler = PyCodec_LookupError(errors);
4797 if (*errorHandler == NULL)
4798 return NULL;
4799 }
4800
4801 make_translate_exception(exceptionObject,
4802 unicode, size, startpos, endpos, reason);
4803 if (*exceptionObject == NULL)
4804 return NULL;
4805
4806 restuple = PyObject_CallFunctionObjArgs(
4807 *errorHandler, *exceptionObject, NULL);
4808 if (restuple == NULL)
4809 return NULL;
4810 if (!PyTuple_Check(restuple)) {
4811 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4812 Py_DECREF(restuple);
4813 return NULL;
4814 }
4815 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4816 &resunicode, &i_newpos)) {
4817 Py_DECREF(restuple);
4818 return NULL;
4819 }
4820 if (i_newpos<0)
4821 *newpos = size+i_newpos;
4822 else
4823 *newpos = i_newpos;
4824 if (*newpos<0 || *newpos>size) {
4825 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4826 Py_DECREF(restuple);
4827 return NULL;
4828 }
4829 Py_INCREF(resunicode);
4830 Py_DECREF(restuple);
4831 return resunicode;
4832 }
4833
4834 /* Lookup the character ch in the mapping and put the result in result,
4835 which must be decrefed by the caller.
4836 Return 0 on success, -1 on error */
4837 static
4838 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4839 {
4840 PyObject *w = PyInt_FromLong((long)c);
4841 PyObject *x;
4842
4843 if (w == NULL)
4844 return -1;
4845 x = PyObject_GetItem(mapping, w);
4846 Py_DECREF(w);
4847 if (x == NULL) {
4848 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4849 /* No mapping found means: use 1:1 mapping. */
4850 PyErr_Clear();
4851 *result = NULL;
4852 return 0;
4853 } else
4854 return -1;
4855 }
4856 else if (x == Py_None) {
4857 *result = x;
4858 return 0;
4859 }
4860 else if (PyInt_Check(x)) {
4861 long value = PyInt_AS_LONG(x);
4862 long max = PyUnicode_GetMax();
4863 if (value < 0 || value > max) {
4864 PyErr_Format(PyExc_TypeError,
4865 "character mapping must be in range(0x%lx)", max+1);
4866 Py_DECREF(x);
4867 return -1;
4868 }
4869 *result = x;
4870 return 0;
4871 }
4872 else if (PyUnicode_Check(x)) {
4873 *result = x;
4874 return 0;
4875 }
4876 else {
4877 /* wrong return value */
4878 PyErr_SetString(PyExc_TypeError,
4879 "character mapping must return integer, None or unicode");
4880 Py_DECREF(x);
4881 return -1;
4882 }
4883 }
4884 /* ensure that *outobj is at least requiredsize characters long,
4885 if not reallocate and adjust various state variables.
4886 Return 0 on success, -1 on error */
4887 static
4888 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4889 Py_ssize_t requiredsize)
4890 {
4891 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4892 if (requiredsize > oldsize) {
4893 /* remember old output position */
4894 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4895 /* exponentially overallocate to minimize reallocations */
4896 if (requiredsize < 2 * oldsize)
4897 requiredsize = 2 * oldsize;
4898 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4899 return -1;
4900 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4901 }
4902 return 0;
4903 }
4904 /* lookup the character, put the result in the output string and adjust
4905 various state variables. Return a new reference to the object that
4906 was put in the output buffer in *result, or Py_None, if the mapping was
4907 undefined (in which case no character was written).
4908 The called must decref result.
4909 Return 0 on success, -1 on error. */
4910 static
4911 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4912 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4913 PyObject **res)
4914 {
4915 if (charmaptranslate_lookup(*curinp, mapping, res))
4916 return -1;
4917 if (*res==NULL) {
4918 /* not found => default to 1:1 mapping */
4919 *(*outp)++ = *curinp;
4920 }
4921 else if (*res==Py_None)
4922 ;
4923 else if (PyInt_Check(*res)) {
4924 /* no overflow check, because we know that the space is enough */
4925 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4926 }
4927 else if (PyUnicode_Check(*res)) {
4928 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4929 if (repsize==1) {
4930 /* no overflow check, because we know that the space is enough */
4931 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4932 }
4933 else if (repsize!=0) {
4934 /* more than one character */
4935 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4936 (insize - (curinp-startinp)) +
4937 repsize - 1;
4938 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4939 return -1;
4940 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4941 *outp += repsize;
4942 }
4943 }
4944 else
4945 return -1;
4946 return 0;
4947 }
4948
/* Translate the `size` Py_UNICODE units starting at `p` through `mapping`
   and return the result as a new unicode object, or NULL on error.

   Each character is run through charmaptranslate_output().  A character
   whose mapping result is Py_None counts as untranslatable; the run of
   consecutive untranslatable characters is then handled according to
   `errors`: NULL/"strict" raises, "replace" writes one '?' per character,
   "ignore" drops them, "xmlcharrefreplace" writes "&#N;" references, and
   any other name is called as a registered error handler.
   A NULL `mapping` is reported through PyErr_BadArgument(). */
4949 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4950 Py_ssize_t size,
4951 PyObject *mapping,
4952 const char *errors)
4953 {
4954 /* output object */
4955 PyObject *res = NULL;
4956 /* pointers to the beginning and end+1 of input */
4957 const Py_UNICODE *startp = p;
4958 const Py_UNICODE *endp = p + size;
4959 /* pointer into the output */
4960 Py_UNICODE *str;
4961 /* current output position */
4962 Py_ssize_t respos = 0;
4963 char *reason = "character maps to <undefined>";
4964 PyObject *errorHandler = NULL;
4965 PyObject *exc = NULL;
4966 /* the following variable is used for caching string comparisons
4967 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4968 * 3=ignore, 4=xmlcharrefreplace */
4969 int known_errorHandler = -1;
4970
4971 if (mapping == NULL) {
4972 PyErr_BadArgument();
4973 return NULL;
4974 }
4975
4976 /* allocate enough for a simple 1:1 translation without
4977 replacements, if we need more, we'll resize */
4978 res = PyUnicode_FromUnicode(NULL, size);
4979 if (res == NULL)
4980 goto onError;
4981 if (size == 0)
4982 return res;
4983 str = PyUnicode_AS_UNICODE(res);
4984
4985 while (p<endp) {
4986 /* try to translate it */
4987 PyObject *x = NULL;
4988 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
     /* x may already hold the lookup result if growing the output failed */
4989 Py_XDECREF(x);
4990 goto onError;
4991 }
     /* x is only compared by address below, never dereferenced again */
4992 Py_XDECREF(x);
4993 if (x!=Py_None) /* it worked => adjust input pointer */
4994 ++p;
4995 else { /* untranslatable character */
4996 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4997 Py_ssize_t repsize;
4998 Py_ssize_t newpos;
4999 Py_UNICODE *uni2;
5000 /* startpos for collecting untranslatable chars */
5001 const Py_UNICODE *collstart = p;
5002 const Py_UNICODE *collend = p+1;
5003 const Py_UNICODE *coll;
5004
5005 /* find all untranslatable characters */
5006 while (collend < endp) {
5007 if (charmaptranslate_lookup(*collend, mapping, &x))
5008 goto onError;
5009 Py_XDECREF(x);
5010 if (x!=Py_None)
5011 break;
5012 ++collend;
5013 }
5014 /* cache callback name lookup
5015 * (if not done yet, i.e. it's the first error) */
5016 if (known_errorHandler==-1) {
5017 if ((errors==NULL) || (!strcmp(errors, "strict")))
5018 known_errorHandler = 1;
5019 else if (!strcmp(errors, "replace"))
5020 known_errorHandler = 2;
5021 else if (!strcmp(errors, "ignore"))
5022 known_errorHandler = 3;
5023 else if (!strcmp(errors, "xmlcharrefreplace"))
5024 known_errorHandler = 4;
5025 else
5026 known_errorHandler = 0;
5027 }
5028 switch (known_errorHandler) {
5029 case 1: /* strict */
5030 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5031 goto onError;
5032 case 2: /* replace */
5033 /* No need to check for space, this is a 1:1 replacement */
5034 for (coll = collstart; coll<collend; ++coll)
5035 *str++ = '?';
5036 /* fall through */
5037 case 3: /* ignore */
5038 p = collend;
5039 break;
5040 case 4: /* xmlcharrefreplace */
5041 /* generate replacement (temporarily (mis)uses p) */
5042 for (p = collstart; p < collend; ++p) {
5043 char buffer[2+29+1+1];
5044 char *cp;
5045 sprintf(buffer, "&#%d;", (int)*p);
     /* need room for: output so far + this reference + input after the run */
5046 if (charmaptranslate_makespace(&res, &str,
5047 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5048 goto onError;
5049 for (cp = buffer; *cp; ++cp)
5050 *str++ = *cp;
5051 }
5052 p = collend;
5053 break;
5054 default:
     /* custom handler: it returns the replacement and the resume position */
5055 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5056 reason, startp, size, &exc,
5057 collstart-startp, collend-startp, &newpos);
5058 if (repunicode == NULL)
5059 goto onError;
5060 /* generate replacement */
5061 repsize = PyUnicode_GET_SIZE(repunicode);
5062 if (charmaptranslate_makespace(&res, &str,
5063 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5064 Py_DECREF(repunicode);
5065 goto onError;
5066 }
5067 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5068 *str++ = *uni2;
     /* continue where the handler asked us to (already range-checked) */
5069 p = startp + newpos;
5070 Py_DECREF(repunicode);
5071 }
5072 }
5073 }
5074 /* Resize if we allocated too much */
5075 respos = str-PyUnicode_AS_UNICODE(res);
5076 if (respos<PyUnicode_GET_SIZE(res)) {
5077 if (PyUnicode_Resize(&res, respos) < 0)
5078 goto onError;
5079 }
5080 Py_XDECREF(exc);
5081 Py_XDECREF(errorHandler);
5082 return res;
5083
5084 onError:
5085 Py_XDECREF(res);
5086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
5088 return NULL;
5089 }
5090
5091 PyObject *PyUnicode_Translate(PyObject *str,
5092 PyObject *mapping,
5093 const char *errors)
5094 {
5095 PyObject *result;
5096
5097 str = PyUnicode_FromObject(str);
5098 if (str == NULL)
5099 goto onError;
5100 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5101 PyUnicode_GET_SIZE(str),
5102 mapping,
5103 errors);
5104 Py_DECREF(str);
5105 return result;
5106
5107 onError:
5108 Py_XDECREF(str);
5109 return NULL;
5110 }
5111
5112 /* --- Decimal Encoder ---------------------------------------------------- */
5113
/* Convert the `length` characters at `s` into a 0-terminated char string
   written to `output`.

   Whitespace characters become ' ', characters with a decimal digit value
   become '0' + value, and any other character in the range 1..255 is
   copied as a single char.  Everything else is unencodable and handled
   according to `errors` (strict / replace / ignore / xmlcharrefreplace /
   a registered handler).

   Returns 0 on success and -1 on error.  A NULL `output` is reported via
   PyErr_BadArgument().
   NOTE(review): nothing here checks the capacity of `output`; presumably
   the caller must provide a large enough buffer - confirm against the
   public API documentation. */
5114 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5115 Py_ssize_t length,
5116 char *output,
5117 const char *errors)
5118 {
5119 Py_UNICODE *p, *end;
5120 PyObject *errorHandler = NULL;
5121 PyObject *exc = NULL;
5122 const char *encoding = "decimal";
5123 const char *reason = "invalid decimal Unicode string";
5124 /* the following variable is used for caching string comparisons
5125 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5126 int known_errorHandler = -1;
5127
5128 if (output == NULL) {
5129 PyErr_BadArgument();
5130 return -1;
5131 }
5132
5133 p = s;
5134 end = s + length;
5135 while (p < end) {
5136 register Py_UNICODE ch = *p;
5137 int decimal;
5138 PyObject *repunicode;
5139 Py_ssize_t repsize;
5140 Py_ssize_t newpos;
5141 Py_UNICODE *uni2;
5142 Py_UNICODE *collstart;
5143 Py_UNICODE *collend;
5144
     /* fast paths: whitespace, decimal digit, then any other char in 1..255 */
5145 if (Py_UNICODE_ISSPACE(ch)) {
5146 *output++ = ' ';
5147 ++p;
5148 continue;
5149 }
5150 decimal = Py_UNICODE_TODECIMAL(ch);
5151 if (decimal >= 0) {
5152 *output++ = '0' + decimal;
5153 ++p;
5154 continue;
5155 }
5156 if (0 < ch && ch < 256) {
5157 *output++ = (char)ch;
5158 ++p;
5159 continue;
5160 }
5161 /* All other characters are considered unencodable */
     /* extend the run up to the next character one of the fast paths accepts */
5162 collstart = p;
5163 for (collend = p+1; collend < end; collend++) {
5164 if ((0 < *collend && *collend < 256) ||
5165 Py_UNICODE_ISSPACE(*collend) ||
5166 0 <= Py_UNICODE_TODECIMAL(*collend))
5167 break;
5168 }
5169 /* cache callback name lookup
5170 * (if not done yet, i.e. it's the first error) */
5171 if (known_errorHandler==-1) {
5172 if ((errors==NULL) || (!strcmp(errors, "strict")))
5173 known_errorHandler = 1;
5174 else if (!strcmp(errors, "replace"))
5175 known_errorHandler = 2;
5176 else if (!strcmp(errors, "ignore"))
5177 known_errorHandler = 3;
5178 else if (!strcmp(errors, "xmlcharrefreplace"))
5179 known_errorHandler = 4;
5180 else
5181 known_errorHandler = 0;
5182 }
5183 switch (known_errorHandler) {
5184 case 1: /* strict */
5185 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5186 goto onError;
5187 case 2: /* replace */
5188 for (p = collstart; p < collend; ++p)
5189 *output++ = '?';
5190 /* fall through */
5191 case 3: /* ignore */
5192 p = collend;
5193 break;
5194 case 4: /* xmlcharrefreplace */
5195 /* generate replacement (temporarily (mis)uses p) */
5196 for (p = collstart; p < collend; ++p)
5197 output += sprintf(output, "&#%d;", (int)*p);
5198 p = collend;
5199 break;
5200 default:
5201 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5202 encoding, reason, s, length, &exc,
5203 collstart-s, collend-s, &newpos);
5204 if (repunicode == NULL)
5205 goto onError;
5206 /* generate replacement */
     /* the replacement is filtered through the same three rules as the input */
5207 repsize = PyUnicode_GET_SIZE(repunicode);
5208 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
     /* note: this `ch` shadows the loop-level `ch` declared above */
5209 Py_UNICODE ch = *uni2;
5210 if (Py_UNICODE_ISSPACE(ch))
5211 *output++ = ' ';
5212 else {
5213 decimal = Py_UNICODE_TODECIMAL(ch);
5214 if (decimal >= 0)
5215 *output++ = '0' + decimal;
5216 else if (0 < ch && ch < 256)
5217 *output++ = (char)ch;
5218 else {
     /* the handler's replacement is itself unencodable: give up */
5219 Py_DECREF(repunicode);
5220 raise_encode_exception(&exc, encoding,
5221 s, length, collstart-s, collend-s, reason);
5222 goto onError;
5223 }
5224 }
5225 }
     /* resume at the position chosen by the handler */
5226 p = s + newpos;
5227 Py_DECREF(repunicode);
5228 }
5229 }
5230 /* 0-terminate the output string */
5231 *output++ = '\0';
5232 Py_XDECREF(exc);
5233 Py_XDECREF(errorHandler);
5234 return 0;
5235
5236 onError:
5237 Py_XDECREF(exc);
5238 Py_XDECREF(errorHandler);
5239 return -1;
5240 }
5241
5242 /* --- Helpers ------------------------------------------------------------ */
5243
5244 #include "stringlib/unicodedefs.h"
5245 #include "stringlib/fastsearch.h"
5246
5247 #include "stringlib/count.h"
5248 #include "stringlib/find.h"
5249 #include "stringlib/partition.h"
5250 #include "stringlib/split.h"
5251
/* Helper macro to fix up start/end slice values against a length `len`.

   `start` and `end` must be modifiable lvalues; they are updated in place:
     - end greater than len is clamped to len;
     - a negative end is taken relative to len, then clamped to 0;
     - a negative start is taken relative to len, then clamped to 0.
   A start that is larger than len is left unchanged.

   The body is wrapped in do { ... } while (0) so the macro behaves as a
   single statement (safe inside an unbraced if/else); invoke it with a
   trailing semicolon.  Arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5266
5267 Py_ssize_t PyUnicode_Count(PyObject *str,
5268 PyObject *substr,
5269 Py_ssize_t start,
5270 Py_ssize_t end)
5271 {
5272 Py_ssize_t result;
5273 PyUnicodeObject* str_obj;
5274 PyUnicodeObject* sub_obj;
5275
5276 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5277 if (!str_obj)
5278 return -1;
5279 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5280 if (!sub_obj) {
5281 Py_DECREF(str_obj);
5282 return -1;
5283 }
5284
5285 ADJUST_INDICES(start, end, str_obj->length);
5286 result = stringlib_count(
5287 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5288 PY_SSIZE_T_MAX
5289 );
5290
5291 Py_DECREF(sub_obj);
5292 Py_DECREF(str_obj);
5293
5294 return result;
5295 }
5296
5297 Py_ssize_t PyUnicode_Find(PyObject *str,
5298 PyObject *sub,
5299 Py_ssize_t start,
5300 Py_ssize_t end,
5301 int direction)
5302 {
5303 Py_ssize_t result;
5304
5305 str = PyUnicode_FromObject(str);
5306 if (!str)
5307 return -2;
5308 sub = PyUnicode_FromObject(sub);
5309 if (!sub) {
5310 Py_DECREF(str);
5311 return -2;
5312 }
5313
5314 if (direction > 0)
5315 result = stringlib_find_slice(
5316 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5317 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5318 start, end
5319 );
5320 else
5321 result = stringlib_rfind_slice(
5322 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5323 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5324 start, end
5325 );
5326
5327 Py_DECREF(str);
5328 Py_DECREF(sub);
5329
5330 return result;
5331 }
5332
5333 static
5334 int tailmatch(PyUnicodeObject *self,
5335 PyUnicodeObject *substring,
5336 Py_ssize_t start,
5337 Py_ssize_t end,
5338 int direction)
5339 {
5340 if (substring->length == 0)
5341 return 1;
5342
5343 ADJUST_INDICES(start, end, self->length);
5344 end -= substring->length;
5345 if (end < start)
5346 return 0;
5347
5348 if (direction > 0) {
5349 if (Py_UNICODE_MATCH(self, end, substring))
5350 return 1;
5351 } else {
5352 if (Py_UNICODE_MATCH(self, start, substring))
5353 return 1;
5354 }
5355
5356 return 0;
5357 }
5358
5359 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5360 PyObject *substr,
5361 Py_ssize_t start,
5362 Py_ssize_t end,
5363 int direction)
5364 {
5365 Py_ssize_t result;
5366
5367 str = PyUnicode_FromObject(str);
5368 if (str == NULL)
5369 return -1;
5370 substr = PyUnicode_FromObject(substr);
5371 if (substr == NULL) {
5372 Py_DECREF(str);
5373 return -1;
5374 }
5375
5376 result = tailmatch((PyUnicodeObject *)str,
5377 (PyUnicodeObject *)substr,
5378 start, end, direction);
5379 Py_DECREF(str);
5380 Py_DECREF(substr);
5381 return result;
5382 }
5383
5384 /* Apply fixfct filter to the Unicode object self and return a
5385 reference to the modified object */
5386
5387 static
5388 PyObject *fixup(PyUnicodeObject *self,
5389 int (*fixfct)(PyUnicodeObject *s))
5390 {
5391
5392 PyUnicodeObject *u;
5393
5394 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5395 if (u == NULL)
5396 return NULL;
5397
5398 Py_UNICODE_COPY(u->str, self->str, self->length);
5399
5400 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5401 /* fixfct should return TRUE if it modified the buffer. If
5402 FALSE, return a reference to the original buffer instead
5403 (to save space, not time) */
5404 Py_INCREF(self);
5405 Py_DECREF(u);
5406 return (PyObject*) self;
5407 }
5408 return (PyObject*) u;
5409 }
5410
5411 static
5412 int fixupper(PyUnicodeObject *self)
5413 {
5414 Py_ssize_t len = self->length;
5415 Py_UNICODE *s = self->str;
5416 int status = 0;
5417
5418 while (len-- > 0) {
5419 register Py_UNICODE ch;
5420
5421 ch = Py_UNICODE_TOUPPER(*s);
5422 if (ch != *s) {
5423 status = 1;
5424 *s = ch;
5425 }
5426 s++;
5427 }
5428
5429 return status;
5430 }
5431
5432 static
5433 int fixlower(PyUnicodeObject *self)
5434 {
5435 Py_ssize_t len = self->length;
5436 Py_UNICODE *s = self->str;
5437 int status = 0;
5438
5439 while (len-- > 0) {
5440 register Py_UNICODE ch;
5441
5442 ch = Py_UNICODE_TOLOWER(*s);
5443 if (ch != *s) {
5444 status = 1;
5445 *s = ch;
5446 }
5447 s++;
5448 }
5449
5450 return status;
5451 }
5452
5453 static
5454 int fixswapcase(PyUnicodeObject *self)
5455 {
5456 Py_ssize_t len = self->length;
5457 Py_UNICODE *s = self->str;
5458 int status = 0;
5459
5460 while (len-- > 0) {
5461 if (Py_UNICODE_ISUPPER(*s)) {
5462 *s = Py_UNICODE_TOLOWER(*s);
5463 status = 1;
5464 } else if (Py_UNICODE_ISLOWER(*s)) {
5465 *s = Py_UNICODE_TOUPPER(*s);
5466 status = 1;
5467 }
5468 s++;
5469 }
5470
5471 return status;
5472 }
5473
5474 static
5475 int fixcapitalize(PyUnicodeObject *self)
5476 {
5477 Py_ssize_t len = self->length;
5478 Py_UNICODE *s = self->str;
5479 int status = 0;
5480
5481 if (len == 0)
5482 return 0;
5483 if (!Py_UNICODE_ISUPPER(*s)) {
5484 *s = Py_UNICODE_TOUPPER(*s);
5485 status = 1;
5486 }
5487 s++;
5488 while (--len > 0) {
5489 if (!Py_UNICODE_ISLOWER(*s)) {
5490 *s = Py_UNICODE_TOLOWER(*s);
5491 status = 1;
5492 }
5493 s++;
5494 }
5495 return status;
5496 }
5497
5498 static
5499 int fixtitle(PyUnicodeObject *self)
5500 {
5501 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5502 register Py_UNICODE *e;
5503 int previous_is_cased;
5504
5505 /* Shortcut for single character strings */
5506 if (PyUnicode_GET_SIZE(self) == 1) {
5507 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5508 if (*p != ch) {
5509 *p = ch;
5510 return 1;
5511 }
5512 else
5513 return 0;
5514 }
5515
5516 e = p + PyUnicode_GET_SIZE(self);
5517 previous_is_cased = 0;
5518 for (; p < e; p++) {
5519 register const Py_UNICODE ch = *p;
5520
5521 if (previous_is_cased)
5522 *p = Py_UNICODE_TOLOWER(ch);
5523 else
5524 *p = Py_UNICODE_TOTITLE(ch);
5525
5526 if (Py_UNICODE_ISLOWER(ch) ||
5527 Py_UNICODE_ISUPPER(ch) ||
5528 Py_UNICODE_ISTITLE(ch))
5529 previous_is_cased = 1;
5530 else
5531 previous_is_cased = 0;
5532 }
5533 return 1;
5534 }
5535
5536 PyObject *
5537 PyUnicode_Join(PyObject *separator, PyObject *seq)
5538 {
5539 PyObject *internal_separator = NULL;
5540 const Py_UNICODE blank = ' ';
5541 const Py_UNICODE *sep = ␣
5542 Py_ssize_t seplen = 1;
5543 PyUnicodeObject *res = NULL; /* the result */
5544 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5545 Py_ssize_t res_used; /* # used bytes */
5546 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5547 PyObject *fseq; /* PySequence_Fast(seq) */
5548 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5549 PyObject *item;
5550 Py_ssize_t i;
5551
5552 fseq = PySequence_Fast(seq, "");
5553 if (fseq == NULL) {
5554 return NULL;
5555 }
5556
5557 /* Grrrr. A codec may be invoked to convert str objects to
5558 * Unicode, and so it's possible to call back into Python code
5559 * during PyUnicode_FromObject(), and so it's possible for a sick
5560 * codec to change the size of fseq (if seq is a list). Therefore
5561 * we have to keep refetching the size -- can't assume seqlen
5562 * is invariant.
5563 */
5564 seqlen = PySequence_Fast_GET_SIZE(fseq);
5565 /* If empty sequence, return u"". */
5566 if (seqlen == 0) {
5567 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5568 goto Done;
5569 }
5570 /* If singleton sequence with an exact Unicode, return that. */
5571 if (seqlen == 1) {
5572 item = PySequence_Fast_GET_ITEM(fseq, 0);
5573 if (PyUnicode_CheckExact(item)) {
5574 Py_INCREF(item);
5575 res = (PyUnicodeObject *)item;
5576 goto Done;
5577 }
5578 }
5579
5580 /* At least two items to join, or one that isn't exact Unicode. */
5581 if (seqlen > 1) {
5582 /* Set up sep and seplen -- they're needed. */
5583 if (separator == NULL) {
5584 sep = ␣
5585 seplen = 1;
5586 }
5587 else {
5588 internal_separator = PyUnicode_FromObject(separator);
5589 if (internal_separator == NULL)
5590 goto onError;
5591 sep = PyUnicode_AS_UNICODE(internal_separator);
5592 seplen = PyUnicode_GET_SIZE(internal_separator);
5593 /* In case PyUnicode_FromObject() mutated seq. */
5594 seqlen = PySequence_Fast_GET_SIZE(fseq);
5595 }
5596 }
5597
5598 /* Get space. */
5599 res = _PyUnicode_New(res_alloc);
5600 if (res == NULL)
5601 goto onError;
5602 res_p = PyUnicode_AS_UNICODE(res);
5603 res_used = 0;
5604
5605 for (i = 0; i < seqlen; ++i) {
5606 Py_ssize_t itemlen;
5607 Py_ssize_t new_res_used;
5608
5609 item = PySequence_Fast_GET_ITEM(fseq, i);
5610 /* Convert item to Unicode. */
5611 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5612 PyErr_Format(PyExc_TypeError,
5613 "sequence item %zd: expected string or Unicode,"
5614 " %.80s found",
5615 i, Py_TYPE(item)->tp_name);
5616 goto onError;
5617 }
5618 item = PyUnicode_FromObject(item);
5619 if (item == NULL)
5620 goto onError;
5621 /* We own a reference to item from here on. */
5622
5623 /* In case PyUnicode_FromObject() mutated seq. */
5624 seqlen = PySequence_Fast_GET_SIZE(fseq);
5625
5626 /* Make sure we have enough space for the separator and the item. */
5627 itemlen = PyUnicode_GET_SIZE(item);
5628 new_res_used = res_used + itemlen;
5629 if (new_res_used < 0)
5630 goto Overflow;
5631 if (i < seqlen - 1) {
5632 new_res_used += seplen;
5633 if (new_res_used < 0)
5634 goto Overflow;
5635 }
5636 if (new_res_used > res_alloc) {
5637 /* double allocated size until it's big enough */
5638 do {
5639 res_alloc += res_alloc;
5640 if (res_alloc <= 0)
5641 goto Overflow;
5642 } while (new_res_used > res_alloc);
5643 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5644 Py_DECREF(item);
5645 goto onError;
5646 }
5647 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5648 }
5649
5650 /* Copy item, and maybe the separator. */
5651 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5652 res_p += itemlen;
5653 if (i < seqlen - 1) {
5654 Py_UNICODE_COPY(res_p, sep, seplen);
5655 res_p += seplen;
5656 }
5657 Py_DECREF(item);
5658 res_used = new_res_used;
5659 }
5660
5661 /* Shrink res to match the used area; this probably can't fail,
5662 * but it's cheap to check.
5663 */
5664 if (_PyUnicode_Resize(&res, res_used) < 0)
5665 goto onError;
5666
5667 Done:
5668 Py_XDECREF(internal_separator);
5669 Py_DECREF(fseq);
5670 return (PyObject *)res;
5671
5672 Overflow:
5673 PyErr_SetString(PyExc_OverflowError,
5674 "join() result is too long for a Python string");
5675 Py_DECREF(item);
5676 /* fall through */
5677
5678 onError:
5679 Py_XDECREF(internal_separator);
5680 Py_DECREF(fseq);
5681 Py_XDECREF(res);
5682 return NULL;
5683 }
5684
5685 static
5686 PyUnicodeObject *pad(PyUnicodeObject *self,
5687 Py_ssize_t left,
5688 Py_ssize_t right,
5689 Py_UNICODE fill)
5690 {
5691 PyUnicodeObject *u;
5692
5693 if (left < 0)
5694 left = 0;
5695 if (right < 0)
5696 right = 0;
5697
5698 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5699 Py_INCREF(self);
5700 return self;
5701 }
5702
5703 if (left > PY_SSIZE_T_MAX - self->length ||
5704 right > PY_SSIZE_T_MAX - (left + self->length)) {
5705 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5706 return NULL;
5707 }
5708 u = _PyUnicode_New(left + self->length + right);
5709 if (u) {
5710 if (left)
5711 Py_UNICODE_FILL(u->str, fill, left);
5712 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5713 if (right)
5714 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5715 }
5716
5717 return u;
5718 }
5719
5720 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5721 {
5722 PyObject *list;
5723
5724 string = PyUnicode_FromObject(string);
5725 if (string == NULL)
5726 return NULL;
5727
5728 list = stringlib_splitlines(
5729 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5730 PyUnicode_GET_SIZE(string), keepends);
5731
5732 Py_DECREF(string);
5733 return list;
5734 }
5735
5736 static
5737 PyObject *split(PyUnicodeObject *self,
5738 PyUnicodeObject *substring,
5739 Py_ssize_t maxcount)
5740 {
5741 if (maxcount < 0)
5742 maxcount = PY_SSIZE_T_MAX;
5743
5744 if (substring == NULL)
5745 return stringlib_split_whitespace(
5746 (PyObject*) self, self->str, self->length, maxcount
5747 );
5748
5749 return stringlib_split(
5750 (PyObject*) self, self->str, self->length,
5751 substring->str, substring->length,
5752 maxcount
5753 );
5754 }
5755
5756 static
5757 PyObject *rsplit(PyUnicodeObject *self,
5758 PyUnicodeObject *substring,
5759 Py_ssize_t maxcount)
5760 {
5761 if (maxcount < 0)
5762 maxcount = PY_SSIZE_T_MAX;
5763
5764 if (substring == NULL)
5765 return stringlib_rsplit_whitespace(
5766 (PyObject*) self, self->str, self->length, maxcount
5767 );
5768
5769 return stringlib_rsplit(
5770 (PyObject*) self, self->str, self->length,
5771 substring->str, substring->length,
5772 maxcount
5773 );
5774 }
5775
5776 static
5777 PyObject *replace(PyUnicodeObject *self,
5778 PyUnicodeObject *str1,
5779 PyUnicodeObject *str2,
5780 Py_ssize_t maxcount)
5781 {
5782 PyUnicodeObject *u;
5783
5784 if (maxcount < 0)
5785 maxcount = PY_SSIZE_T_MAX;
5786 else if (maxcount == 0 || self->length == 0)
5787 goto nothing;
5788
5789 if (str1->length == str2->length) {
5790 Py_ssize_t i;
5791 /* same length */
5792 if (str1->length == 0)
5793 goto nothing;
5794 if (str1->length == 1) {
5795 /* replace characters */
5796 Py_UNICODE u1, u2;
5797 if (!findchar(self->str, self->length, str1->str[0]))
5798 goto nothing;
5799 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5800 if (!u)
5801 return NULL;
5802 Py_UNICODE_COPY(u->str, self->str, self->length);
5803 u1 = str1->str[0];
5804 u2 = str2->str[0];
5805 for (i = 0; i < u->length; i++)
5806 if (u->str[i] == u1) {
5807 if (--maxcount < 0)
5808 break;
5809 u->str[i] = u2;
5810 }
5811 } else {
5812 i = stringlib_find(
5813 self->str, self->length, str1->str, str1->length, 0
5814 );
5815 if (i < 0)
5816 goto nothing;
5817 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5818 if (!u)
5819 return NULL;
5820 Py_UNICODE_COPY(u->str, self->str, self->length);
5821
5822 /* change everything in-place, starting with this one */
5823 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5824 i += str1->length;
5825
5826 while ( --maxcount > 0) {
5827 i = stringlib_find(self->str+i, self->length-i,
5828 str1->str, str1->length,
5829 i);
5830 if (i == -1)
5831 break;
5832 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5833 i += str1->length;
5834 }
5835 }
5836 } else {
5837
5838 Py_ssize_t n, i, j;
5839 Py_ssize_t product, new_size, delta;
5840 Py_UNICODE *p;
5841
5842 /* replace strings */
5843 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5844 maxcount);
5845 if (n == 0)
5846 goto nothing;
5847 /* new_size = self->length + n * (str2->length - str1->length)); */
5848 delta = (str2->length - str1->length);
5849 if (delta == 0) {
5850 new_size = self->length;
5851 } else {
5852 product = n * (str2->length - str1->length);
5853 if ((product / (str2->length - str1->length)) != n) {
5854 PyErr_SetString(PyExc_OverflowError,
5855 "replace string is too long");
5856 return NULL;
5857 }
5858 new_size = self->length + product;
5859 if (new_size < 0) {
5860 PyErr_SetString(PyExc_OverflowError,
5861 "replace string is too long");
5862 return NULL;
5863 }
5864 }
5865 u = _PyUnicode_New(new_size);
5866 if (!u)
5867 return NULL;
5868 i = 0;
5869 p = u->str;
5870 if (str1->length > 0) {
5871 while (n-- > 0) {
5872 /* look for next match */
5873 j = stringlib_find(self->str+i, self->length-i,
5874 str1->str, str1->length,
5875 i);
5876 if (j == -1)
5877 break;
5878 else if (j > i) {
5879 /* copy unchanged part [i:j] */
5880 Py_UNICODE_COPY(p, self->str+i, j-i);
5881 p += j - i;
5882 }
5883 /* copy substitution string */
5884 if (str2->length > 0) {
5885 Py_UNICODE_COPY(p, str2->str, str2->length);
5886 p += str2->length;
5887 }
5888 i = j + str1->length;
5889 }
5890 if (i < self->length)
5891 /* copy tail [i:] */
5892 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5893 } else {
5894 /* interleave */
5895 while (n > 0) {
5896 Py_UNICODE_COPY(p, str2->str, str2->length);
5897 p += str2->length;
5898 if (--n <= 0)
5899 break;
5900 *p++ = self->str[i++];
5901 }
5902 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5903 }
5904 }
5905 return (PyObject *) u;
5906
5907 nothing:
5908 /* nothing to replace; return original string (when possible) */
5909 if (PyUnicode_CheckExact(self)) {
5910 Py_INCREF(self);
5911 return (PyObject *) self;
5912 }
5913 return PyUnicode_FromUnicode(self->str, self->length);
5914 }
5915
5916 /* --- Unicode Object Methods --------------------------------------------- */
5917
5918 PyDoc_STRVAR(title__doc__,
5919 "S.title() -> unicode\n\
5920 \n\
5921 Return a titlecased version of S, i.e. words start with title case\n\
5922 characters, all remaining cased characters have lower case.");
5923
5924 static PyObject*
5925 unicode_title(PyUnicodeObject *self)
5926 {
5927 return fixup(self, fixtitle);
5928 }
5929
5930 PyDoc_STRVAR(capitalize__doc__,
5931 "S.capitalize() -> unicode\n\
5932 \n\
5933 Return a capitalized version of S, i.e. make the first character\n\
5934 have upper case and the rest lower case.");
5935
5936 static PyObject*
5937 unicode_capitalize(PyUnicodeObject *self)
5938 {
5939 return fixup(self, fixcapitalize);
5940 }
5941
#if 0
/* NOTE: this whole section is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in the resulting list
   in place, then join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5979
5980 /* Argument converter. Coerces to a single unicode character */
5981
5982 static int
5983 convert_uc(PyObject *obj, void *addr)
5984 {
5985 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5986 PyObject *uniobj;
5987 Py_UNICODE *unistr;
5988
5989 uniobj = PyUnicode_FromObject(obj);
5990 if (uniobj == NULL) {
5991 PyErr_SetString(PyExc_TypeError,
5992 "The fill character cannot be converted to Unicode");
5993 return 0;
5994 }
5995 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5996 PyErr_SetString(PyExc_TypeError,
5997 "The fill character must be exactly one character long");
5998 Py_DECREF(uniobj);
5999 return 0;
6000 }
6001 unistr = PyUnicode_AS_UNICODE(uniobj);
6002 *fillcharloc = unistr[0];
6003 Py_DECREF(uniobj);
6004 return 1;
6005 }
6006
6007 PyDoc_STRVAR(center__doc__,
6008 "S.center(width[, fillchar]) -> unicode\n\
6009 \n\
6010 Return S centered in a Unicode string of length width. Padding is\n\
6011 done using the specified fill character (default is a space)");
6012
6013 static PyObject *
6014 unicode_center(PyUnicodeObject *self, PyObject *args)
6015 {
6016 Py_ssize_t marg, left;
6017 Py_ssize_t width;
6018 Py_UNICODE fillchar = ' ';
6019
6020 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6021 return NULL;
6022
6023 if (self->length >= width && PyUnicode_CheckExact(self)) {
6024 Py_INCREF(self);
6025 return (PyObject*) self;
6026 }
6027
6028 marg = width - self->length;
6029 left = marg / 2 + (marg & width & 1);
6030
6031 return (PyObject*) pad(self, left, marg - left, fillchar);
6032 }
6033
6034 #if 0
6035
6036 /* This code should go into some future Unicode collation support
6037 module. The basic comparison should compare ordinals on a naive
6038 basis (this is what Java does and thus Jython too). */
6039
6040 /* speedy UTF-16 code point order comparison */
6041 /* gleaned from: */
6042 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6043
6044 static short utf16Fixup[32] =
6045 {
6046 0, 0, 0, 0, 0, 0, 0, 0,
6047 0, 0, 0, 0, 0, 0, 0, 0,
6048 0, 0, 0, 0, 0, 0, 0, 0,
6049 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6050 };
6051
6052 static int
6053 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6054 {
6055 Py_ssize_t len1, len2;
6056
6057 Py_UNICODE *s1 = str1->str;
6058 Py_UNICODE *s2 = str2->str;
6059
6060 len1 = str1->length;
6061 len2 = str2->length;
6062
6063 while (len1 > 0 && len2 > 0) {
6064 Py_UNICODE c1, c2;
6065
6066 c1 = *s1++;
6067 c2 = *s2++;
6068
6069 if (c1 > (1<<11) * 26)
6070 c1 += utf16Fixup[c1>>11];
6071 if (c2 > (1<<11) * 26)
6072 c2 += utf16Fixup[c2>>11];
6073 /* now c1 and c2 are in UTF-32-compatible order */
6074
6075 if (c1 != c2)
6076 return (c1 < c2) ? -1 : 1;
6077
6078 len1--; len2--;
6079 }
6080
6081 return (len1 < len2) ? -1 : (len1 != len2);
6082 }
6083
6084 #else
6085
6086 static int
6087 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6088 {
6089 register Py_ssize_t len1, len2;
6090
6091 Py_UNICODE *s1 = str1->str;
6092 Py_UNICODE *s2 = str2->str;
6093
6094 len1 = str1->length;
6095 len2 = str2->length;
6096
6097 while (len1 > 0 && len2 > 0) {
6098 Py_UNICODE c1, c2;
6099
6100 c1 = *s1++;
6101 c2 = *s2++;
6102
6103 if (c1 != c2)
6104 return (c1 < c2) ? -1 : 1;
6105
6106 len1--; len2--;
6107 }
6108
6109 return (len1 < len2) ? -1 : (len1 != len2);
6110 }
6111
6112 #endif
6113
6114 int PyUnicode_Compare(PyObject *left,
6115 PyObject *right)
6116 {
6117 PyUnicodeObject *u = NULL, *v = NULL;
6118 int result;
6119
6120 /* Coerce the two arguments */
6121 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6122 if (u == NULL)
6123 goto onError;
6124 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6125 if (v == NULL)
6126 goto onError;
6127
6128 /* Shortcut for empty or interned objects */
6129 if (v == u) {
6130 Py_DECREF(u);
6131 Py_DECREF(v);
6132 return 0;
6133 }
6134
6135 result = unicode_compare(u, v);
6136
6137 Py_DECREF(u);
6138 Py_DECREF(v);
6139 return result;
6140
6141 onError:
6142 Py_XDECREF(u);
6143 Py_XDECREF(v);
6144 return -1;
6145 }
6146
6147 PyObject *PyUnicode_RichCompare(PyObject *left,
6148 PyObject *right,
6149 int op)
6150 {
6151 int result;
6152
6153 result = PyUnicode_Compare(left, right);
6154 if (result == -1 && PyErr_Occurred())
6155 goto onError;
6156
6157 /* Convert the return value to a Boolean */
6158 switch (op) {
6159 case Py_EQ:
6160 result = (result == 0);
6161 break;
6162 case Py_NE:
6163 result = (result != 0);
6164 break;
6165 case Py_LE:
6166 result = (result <= 0);
6167 break;
6168 case Py_GE:
6169 result = (result >= 0);
6170 break;
6171 case Py_LT:
6172 result = (result == -1);
6173 break;
6174 case Py_GT:
6175 result = (result == 1);
6176 break;
6177 }
6178 return PyBool_FromLong(result);
6179
6180 onError:
6181
6182 /* Standard case
6183
6184 Type errors mean that PyUnicode_FromObject() could not convert
6185 one of the arguments (usually the right hand side) to Unicode,
6186 ie. we can't handle the comparison request. However, it is
6187 possible that the other object knows a comparison method, which
6188 is why we return Py_NotImplemented to give the other object a
6189 chance.
6190
6191 */
6192 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6193 PyErr_Clear();
6194 Py_INCREF(Py_NotImplemented);
6195 return Py_NotImplemented;
6196 }
6197 if (op != Py_EQ && op != Py_NE)
6198 return NULL;
6199
6200 /* Equality comparison.
6201
6202 This is a special case: we silence any PyExc_UnicodeDecodeError
6203 and instead turn it into a PyErr_UnicodeWarning.
6204
6205 */
6206 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6207 return NULL;
6208 PyErr_Clear();
6209 if (PyErr_Warn(PyExc_UnicodeWarning,
6210 (op == Py_EQ) ?
6211 "Unicode equal comparison "
6212 "failed to convert both arguments to Unicode - "
6213 "interpreting them as being unequal" :
6214 "Unicode unequal comparison "
6215 "failed to convert both arguments to Unicode - "
6216 "interpreting them as being unequal"
6217 ) < 0)
6218 return NULL;
6219 result = (op == Py_NE);
6220 return PyBool_FromLong(result);
6221 }
6222
6223 int PyUnicode_Contains(PyObject *container,
6224 PyObject *element)
6225 {
6226 PyObject *str, *sub;
6227 int result;
6228
6229 /* Coerce the two arguments */
6230 sub = PyUnicode_FromObject(element);
6231 if (!sub) {
6232 return -1;
6233 }
6234
6235 str = PyUnicode_FromObject(container);
6236 if (!str) {
6237 Py_DECREF(sub);
6238 return -1;
6239 }
6240
6241 result = stringlib_contains_obj(str, sub);
6242
6243 Py_DECREF(str);
6244 Py_DECREF(sub);
6245
6246 return result;
6247 }
6248
6249 /* Concat to string or Unicode object giving a new Unicode object. */
6250
6251 PyObject *PyUnicode_Concat(PyObject *left,
6252 PyObject *right)
6253 {
6254 PyUnicodeObject *u = NULL, *v = NULL, *w;
6255
6256 /* Coerce the two arguments */
6257 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6258 if (u == NULL)
6259 goto onError;
6260 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6261 if (v == NULL)
6262 goto onError;
6263
6264 /* Shortcuts */
6265 if (v == unicode_empty) {
6266 Py_DECREF(v);
6267 return (PyObject *)u;
6268 }
6269 if (u == unicode_empty) {
6270 Py_DECREF(u);
6271 return (PyObject *)v;
6272 }
6273
6274 /* Concat the two Unicode strings */
6275 w = _PyUnicode_New(u->length + v->length);
6276 if (w == NULL)
6277 goto onError;
6278 Py_UNICODE_COPY(w->str, u->str, u->length);
6279 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6280
6281 Py_DECREF(u);
6282 Py_DECREF(v);
6283 return (PyObject *)w;
6284
6285 onError:
6286 Py_XDECREF(u);
6287 Py_XDECREF(v);
6288 return NULL;
6289 }
6290
6291 PyDoc_STRVAR(count__doc__,
6292 "S.count(sub[, start[, end]]) -> int\n\
6293 \n\
6294 Return the number of non-overlapping occurrences of substring sub in\n\
6295 Unicode string S[start:end]. Optional arguments start and end are\n\
6296 interpreted as in slice notation.");
6297
6298 static PyObject *
6299 unicode_count(PyUnicodeObject *self, PyObject *args)
6300 {
6301 PyUnicodeObject *substring;
6302 Py_ssize_t start = 0;
6303 Py_ssize_t end = PY_SSIZE_T_MAX;
6304 PyObject *result;
6305
6306 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6307 &start, &end))
6308 return NULL;
6309
6310 ADJUST_INDICES(start, end, self->length);
6311 result = PyInt_FromSsize_t(
6312 stringlib_count(self->str + start, end - start,
6313 substring->str, substring->length,
6314 PY_SSIZE_T_MAX)
6315 );
6316
6317 Py_DECREF(substring);
6318
6319 return result;
6320 }
6321
6322 PyDoc_STRVAR(encode__doc__,
6323 "S.encode([encoding[,errors]]) -> string or unicode\n\
6324 \n\
6325 Encodes S using the codec registered for encoding. encoding defaults\n\
6326 to the default encoding. errors may be given to set a different error\n\
6327 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6328 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6329 'xmlcharrefreplace' as well as any other name registered with\n\
6330 codecs.register_error that can handle UnicodeEncodeErrors.");
6331
6332 static PyObject *
6333 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6334 {
6335 static char *kwlist[] = {"encoding", "errors", 0};
6336 char *encoding = NULL;
6337 char *errors = NULL;
6338 PyObject *v;
6339
6340 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6341 kwlist, &encoding, &errors))
6342 return NULL;
6343 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6344 if (v == NULL)
6345 goto onError;
6346 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6347 PyErr_Format(PyExc_TypeError,
6348 "encoder did not return a string/unicode object "
6349 "(type=%.400s)",
6350 Py_TYPE(v)->tp_name);
6351 Py_DECREF(v);
6352 return NULL;
6353 }
6354 return v;
6355
6356 onError:
6357 return NULL;
6358 }
6359
6360 PyDoc_STRVAR(decode__doc__,
6361 "S.decode([encoding[,errors]]) -> string or unicode\n\
6362 \n\
6363 Decodes S using the codec registered for encoding. encoding defaults\n\
6364 to the default encoding. errors may be given to set a different error\n\
6365 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6366 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6367 as well as any other name registered with codecs.register_error that is\n\
6368 able to handle UnicodeDecodeErrors.");
6369
6370 static PyObject *
6371 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6372 {
6373 static char *kwlist[] = {"encoding", "errors", 0};
6374 char *encoding = NULL;
6375 char *errors = NULL;
6376 PyObject *v;
6377
6378 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6379 kwlist, &encoding, &errors))
6380 return NULL;
6381 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6382 if (v == NULL)
6383 goto onError;
6384 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6385 PyErr_Format(PyExc_TypeError,
6386 "decoder did not return a string/unicode object "
6387 "(type=%.400s)",
6388 Py_TYPE(v)->tp_name);
6389 Py_DECREF(v);
6390 return NULL;
6391 }
6392 return v;
6393
6394 onError:
6395 return NULL;
6396 }
6397
6398 PyDoc_STRVAR(expandtabs__doc__,
6399 "S.expandtabs([tabsize]) -> unicode\n\
6400 \n\
6401 Return a copy of S where all tab characters are expanded using spaces.\n\
6402 If tabsize is not given, a tab size of 8 characters is assumed.");
6403
6404 static PyObject*
6405 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6406 {
6407 Py_UNICODE *e;
6408 Py_UNICODE *p;
6409 Py_UNICODE *q;
6410 Py_UNICODE *qe;
6411 Py_ssize_t i, j, incr;
6412 PyUnicodeObject *u;
6413 int tabsize = 8;
6414
6415 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6416 return NULL;
6417
6418 /* First pass: determine size of output string */
6419 i = 0; /* chars up to and including most recent \n or \r */
6420 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6421 e = self->str + self->length; /* end of input */
6422 for (p = self->str; p < e; p++)
6423 if (*p == '\t') {
6424 if (tabsize > 0) {
6425 incr = tabsize - (j % tabsize); /* cannot overflow */
6426 if (j > PY_SSIZE_T_MAX - incr)
6427 goto overflow1;
6428 j += incr;
6429 }
6430 }
6431 else {
6432 if (j > PY_SSIZE_T_MAX - 1)
6433 goto overflow1;
6434 j++;
6435 if (*p == '\n' || *p == '\r') {
6436 if (i > PY_SSIZE_T_MAX - j)
6437 goto overflow1;
6438 i += j;
6439 j = 0;
6440 }
6441 }
6442
6443 if (i > PY_SSIZE_T_MAX - j)
6444 goto overflow1;
6445
6446 /* Second pass: create output string and fill it */
6447 u = _PyUnicode_New(i + j);
6448 if (!u)
6449 return NULL;
6450
6451 j = 0; /* same as in first pass */
6452 q = u->str; /* next output char */
6453 qe = u->str + u->length; /* end of output */
6454
6455 for (p = self->str; p < e; p++)
6456 if (*p == '\t') {
6457 if (tabsize > 0) {
6458 i = tabsize - (j % tabsize);
6459 j += i;
6460 while (i--) {
6461 if (q >= qe)
6462 goto overflow2;
6463 *q++ = ' ';
6464 }
6465 }
6466 }
6467 else {
6468 if (q >= qe)
6469 goto overflow2;
6470 *q++ = *p;
6471 j++;
6472 if (*p == '\n' || *p == '\r')
6473 j = 0;
6474 }
6475
6476 return (PyObject*) u;
6477
6478 overflow2:
6479 Py_DECREF(u);
6480 overflow1:
6481 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6482 return NULL;
6483 }
6484
6485 PyDoc_STRVAR(find__doc__,
6486 "S.find(sub [,start [,end]]) -> int\n\
6487 \n\
6488 Return the lowest index in S where substring sub is found,\n\
6489 such that sub is contained within S[start:end]. Optional\n\
6490 arguments start and end are interpreted as in slice notation.\n\
6491 \n\
6492 Return -1 on failure.");
6493
6494 static PyObject *
6495 unicode_find(PyUnicodeObject *self, PyObject *args)
6496 {
6497 PyUnicodeObject *substring;
6498 Py_ssize_t start;
6499 Py_ssize_t end;
6500 Py_ssize_t result;
6501
6502 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6503 &start, &end))
6504 return NULL;
6505
6506 result = stringlib_find_slice(
6507 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6508 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6509 start, end
6510 );
6511
6512 Py_DECREF(substring);
6513
6514 return PyInt_FromSsize_t(result);
6515 }
6516
6517 static PyObject *
6518 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6519 {
6520 if (index < 0 || index >= self->length) {
6521 PyErr_SetString(PyExc_IndexError, "string index out of range");
6522 return NULL;
6523 }
6524
6525 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6526 }
6527
6528 static long
6529 unicode_hash(PyUnicodeObject *self)
6530 {
6531 /* Since Unicode objects compare equal to their ASCII string
6532 counterparts, they should use the individual character values
6533 as basis for their hash value. This is needed to assure that
6534 strings and Unicode objects behave in the same way as
6535 dictionary keys. */
6536
6537 register Py_ssize_t len;
6538 register Py_UNICODE *p;
6539 register long x;
6540
6541 #ifdef Py_DEBUG
6542 assert(_Py_HashSecret_Initialized);
6543 #endif
6544 if (self->hash != -1)
6545 return self->hash;
6546 len = PyUnicode_GET_SIZE(self);
6547 /*
6548 We make the hash of the empty string be 0, rather than using
6549 (prefix ^ suffix), since this slightly obfuscates the hash secret
6550 */
6551 if (len == 0) {
6552 self->hash = 0;
6553 return 0;
6554 }
6555 p = PyUnicode_AS_UNICODE(self);
6556 x = _Py_HashSecret.prefix;
6557 x ^= *p << 7;
6558 while (--len >= 0)
6559 x = (1000003*x) ^ *p++;
6560 x ^= PyUnicode_GET_SIZE(self);
6561 x ^= _Py_HashSecret.suffix;
6562 if (x == -1)
6563 x = -2;
6564 self->hash = x;
6565 return x;
6566 }
6567
6568 PyDoc_STRVAR(index__doc__,
6569 "S.index(sub [,start [,end]]) -> int\n\
6570 \n\
6571 Like S.find() but raise ValueError when the substring is not found.");
6572
6573 static PyObject *
6574 unicode_index(PyUnicodeObject *self, PyObject *args)
6575 {
6576 Py_ssize_t result;
6577 PyUnicodeObject *substring;
6578 Py_ssize_t start;
6579 Py_ssize_t end;
6580
6581 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6582 &start, &end))
6583 return NULL;
6584
6585 result = stringlib_find_slice(
6586 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6587 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6588 start, end
6589 );
6590
6591 Py_DECREF(substring);
6592
6593 if (result < 0) {
6594 PyErr_SetString(PyExc_ValueError, "substring not found");
6595 return NULL;
6596 }
6597
6598 return PyInt_FromSsize_t(result);
6599 }
6600
6601 PyDoc_STRVAR(islower__doc__,
6602 "S.islower() -> bool\n\
6603 \n\
6604 Return True if all cased characters in S are lowercase and there is\n\
6605 at least one cased character in S, False otherwise.");
6606
6607 static PyObject*
6608 unicode_islower(PyUnicodeObject *self)
6609 {
6610 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6611 register const Py_UNICODE *e;
6612 int cased;
6613
6614 /* Shortcut for single character strings */
6615 if (PyUnicode_GET_SIZE(self) == 1)
6616 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6617
6618 /* Special case for empty strings */
6619 if (PyUnicode_GET_SIZE(self) == 0)
6620 return PyBool_FromLong(0);
6621
6622 e = p + PyUnicode_GET_SIZE(self);
6623 cased = 0;
6624 for (; p < e; p++) {
6625 register const Py_UNICODE ch = *p;
6626
6627 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6628 return PyBool_FromLong(0);
6629 else if (!cased && Py_UNICODE_ISLOWER(ch))
6630 cased = 1;
6631 }
6632 return PyBool_FromLong(cased);
6633 }
6634
6635 PyDoc_STRVAR(isupper__doc__,
6636 "S.isupper() -> bool\n\
6637 \n\
6638 Return True if all cased characters in S are uppercase and there is\n\
6639 at least one cased character in S, False otherwise.");
6640
6641 static PyObject*
6642 unicode_isupper(PyUnicodeObject *self)
6643 {
6644 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6645 register const Py_UNICODE *e;
6646 int cased;
6647
6648 /* Shortcut for single character strings */
6649 if (PyUnicode_GET_SIZE(self) == 1)
6650 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6651
6652 /* Special case for empty strings */
6653 if (PyUnicode_GET_SIZE(self) == 0)
6654 return PyBool_FromLong(0);
6655
6656 e = p + PyUnicode_GET_SIZE(self);
6657 cased = 0;
6658 for (; p < e; p++) {
6659 register const Py_UNICODE ch = *p;
6660
6661 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6662 return PyBool_FromLong(0);
6663 else if (!cased && Py_UNICODE_ISUPPER(ch))
6664 cased = 1;
6665 }
6666 return PyBool_FromLong(cased);
6667 }
6668
6669 PyDoc_STRVAR(istitle__doc__,
6670 "S.istitle() -> bool\n\
6671 \n\
6672 Return True if S is a titlecased string and there is at least one\n\
6673 character in S, i.e. upper- and titlecase characters may only\n\
6674 follow uncased characters and lowercase characters only cased ones.\n\
6675 Return False otherwise.");
6676
6677 static PyObject*
6678 unicode_istitle(PyUnicodeObject *self)
6679 {
6680 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6681 register const Py_UNICODE *e;
6682 int cased, previous_is_cased;
6683
6684 /* Shortcut for single character strings */
6685 if (PyUnicode_GET_SIZE(self) == 1)
6686 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6687 (Py_UNICODE_ISUPPER(*p) != 0));
6688
6689 /* Special case for empty strings */
6690 if (PyUnicode_GET_SIZE(self) == 0)
6691 return PyBool_FromLong(0);
6692
6693 e = p + PyUnicode_GET_SIZE(self);
6694 cased = 0;
6695 previous_is_cased = 0;
6696 for (; p < e; p++) {
6697 register const Py_UNICODE ch = *p;
6698
6699 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6700 if (previous_is_cased)
6701 return PyBool_FromLong(0);
6702 previous_is_cased = 1;
6703 cased = 1;
6704 }
6705 else if (Py_UNICODE_ISLOWER(ch)) {
6706 if (!previous_is_cased)
6707 return PyBool_FromLong(0);
6708 previous_is_cased = 1;
6709 cased = 1;
6710 }
6711 else
6712 previous_is_cased = 0;
6713 }
6714 return PyBool_FromLong(cased);
6715 }
6716
6717 PyDoc_STRVAR(isspace__doc__,
6718 "S.isspace() -> bool\n\
6719 \n\
6720 Return True if all characters in S are whitespace\n\
6721 and there is at least one character in S, False otherwise.");
6722
6723 static PyObject*
6724 unicode_isspace(PyUnicodeObject *self)
6725 {
6726 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6727 register const Py_UNICODE *e;
6728
6729 /* Shortcut for single character strings */
6730 if (PyUnicode_GET_SIZE(self) == 1 &&
6731 Py_UNICODE_ISSPACE(*p))
6732 return PyBool_FromLong(1);
6733
6734 /* Special case for empty strings */
6735 if (PyUnicode_GET_SIZE(self) == 0)
6736 return PyBool_FromLong(0);
6737
6738 e = p + PyUnicode_GET_SIZE(self);
6739 for (; p < e; p++) {
6740 if (!Py_UNICODE_ISSPACE(*p))
6741 return PyBool_FromLong(0);
6742 }
6743 return PyBool_FromLong(1);
6744 }
6745
6746 PyDoc_STRVAR(isalpha__doc__,
6747 "S.isalpha() -> bool\n\
6748 \n\
6749 Return True if all characters in S are alphabetic\n\
6750 and there is at least one character in S, False otherwise.");
6751
6752 static PyObject*
6753 unicode_isalpha(PyUnicodeObject *self)
6754 {
6755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6756 register const Py_UNICODE *e;
6757
6758 /* Shortcut for single character strings */
6759 if (PyUnicode_GET_SIZE(self) == 1 &&
6760 Py_UNICODE_ISALPHA(*p))
6761 return PyBool_FromLong(1);
6762
6763 /* Special case for empty strings */
6764 if (PyUnicode_GET_SIZE(self) == 0)
6765 return PyBool_FromLong(0);
6766
6767 e = p + PyUnicode_GET_SIZE(self);
6768 for (; p < e; p++) {
6769 if (!Py_UNICODE_ISALPHA(*p))
6770 return PyBool_FromLong(0);
6771 }
6772 return PyBool_FromLong(1);
6773 }
6774
6775 PyDoc_STRVAR(isalnum__doc__,
6776 "S.isalnum() -> bool\n\
6777 \n\
6778 Return True if all characters in S are alphanumeric\n\
6779 and there is at least one character in S, False otherwise.");
6780
6781 static PyObject*
6782 unicode_isalnum(PyUnicodeObject *self)
6783 {
6784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785 register const Py_UNICODE *e;
6786
6787 /* Shortcut for single character strings */
6788 if (PyUnicode_GET_SIZE(self) == 1 &&
6789 Py_UNICODE_ISALNUM(*p))
6790 return PyBool_FromLong(1);
6791
6792 /* Special case for empty strings */
6793 if (PyUnicode_GET_SIZE(self) == 0)
6794 return PyBool_FromLong(0);
6795
6796 e = p + PyUnicode_GET_SIZE(self);
6797 for (; p < e; p++) {
6798 if (!Py_UNICODE_ISALNUM(*p))
6799 return PyBool_FromLong(0);
6800 }
6801 return PyBool_FromLong(1);
6802 }
6803
6804 PyDoc_STRVAR(isdecimal__doc__,
6805 "S.isdecimal() -> bool\n\
6806 \n\
6807 Return True if there are only decimal characters in S,\n\
6808 False otherwise.");
6809
6810 static PyObject*
6811 unicode_isdecimal(PyUnicodeObject *self)
6812 {
6813 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6814 register const Py_UNICODE *e;
6815
6816 /* Shortcut for single character strings */
6817 if (PyUnicode_GET_SIZE(self) == 1 &&
6818 Py_UNICODE_ISDECIMAL(*p))
6819 return PyBool_FromLong(1);
6820
6821 /* Special case for empty strings */
6822 if (PyUnicode_GET_SIZE(self) == 0)
6823 return PyBool_FromLong(0);
6824
6825 e = p + PyUnicode_GET_SIZE(self);
6826 for (; p < e; p++) {
6827 if (!Py_UNICODE_ISDECIMAL(*p))
6828 return PyBool_FromLong(0);
6829 }
6830 return PyBool_FromLong(1);
6831 }
6832
6833 PyDoc_STRVAR(isdigit__doc__,
6834 "S.isdigit() -> bool\n\
6835 \n\
6836 Return True if all characters in S are digits\n\
6837 and there is at least one character in S, False otherwise.");
6838
6839 static PyObject*
6840 unicode_isdigit(PyUnicodeObject *self)
6841 {
6842 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6843 register const Py_UNICODE *e;
6844
6845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self) == 1 &&
6847 Py_UNICODE_ISDIGIT(*p))
6848 return PyBool_FromLong(1);
6849
6850 /* Special case for empty strings */
6851 if (PyUnicode_GET_SIZE(self) == 0)
6852 return PyBool_FromLong(0);
6853
6854 e = p + PyUnicode_GET_SIZE(self);
6855 for (; p < e; p++) {
6856 if (!Py_UNICODE_ISDIGIT(*p))
6857 return PyBool_FromLong(0);
6858 }
6859 return PyBool_FromLong(1);
6860 }
6861
6862 PyDoc_STRVAR(isnumeric__doc__,
6863 "S.isnumeric() -> bool\n\
6864 \n\
6865 Return True if there are only numeric characters in S,\n\
6866 False otherwise.");
6867
6868 static PyObject*
6869 unicode_isnumeric(PyUnicodeObject *self)
6870 {
6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872 register const Py_UNICODE *e;
6873
6874 /* Shortcut for single character strings */
6875 if (PyUnicode_GET_SIZE(self) == 1 &&
6876 Py_UNICODE_ISNUMERIC(*p))
6877 return PyBool_FromLong(1);
6878
6879 /* Special case for empty strings */
6880 if (PyUnicode_GET_SIZE(self) == 0)
6881 return PyBool_FromLong(0);
6882
6883 e = p + PyUnicode_GET_SIZE(self);
6884 for (; p < e; p++) {
6885 if (!Py_UNICODE_ISNUMERIC(*p))
6886 return PyBool_FromLong(0);
6887 }
6888 return PyBool_FromLong(1);
6889 }
6890
6891 PyDoc_STRVAR(join__doc__,
6892 "S.join(iterable) -> unicode\n\
6893 \n\
6894 Return a string which is the concatenation of the strings in the\n\
6895 iterable. The separator between elements is S.");
6896
6897 static PyObject*
6898 unicode_join(PyObject *self, PyObject *data)
6899 {
6900 return PyUnicode_Join(self, data);
6901 }
6902
6903 static Py_ssize_t
6904 unicode_length(PyUnicodeObject *self)
6905 {
6906 return self->length;
6907 }
6908
6909 PyDoc_STRVAR(ljust__doc__,
6910 "S.ljust(width[, fillchar]) -> int\n\
6911 \n\
6912 Return S left-justified in a Unicode string of length width. Padding is\n\
6913 done using the specified fill character (default is a space).");
6914
6915 static PyObject *
6916 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6917 {
6918 Py_ssize_t width;
6919 Py_UNICODE fillchar = ' ';
6920
6921 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6922 return NULL;
6923
6924 if (self->length >= width && PyUnicode_CheckExact(self)) {
6925 Py_INCREF(self);
6926 return (PyObject*) self;
6927 }
6928
6929 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6930 }
6931
6932 PyDoc_STRVAR(lower__doc__,
6933 "S.lower() -> unicode\n\
6934 \n\
6935 Return a copy of the string S converted to lowercase.");
6936
6937 static PyObject*
6938 unicode_lower(PyUnicodeObject *self)
6939 {
6940 return fixup(self, fixlower);
6941 }
6942
/* Strip modes understood by _PyUnicode_XStrip(), do_strip() and
   do_argstrip().  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format string per strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i, obtained by skipping the three-character
   "|O:" prefix of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6951
6952 /* externally visible for str.strip(unicode) */
6953 PyObject *
6954 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6955 {
6956 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6957 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6958 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6959 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6960 Py_ssize_t i, j;
6961
6962 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6963
6964 i = 0;
6965 if (striptype != RIGHTSTRIP) {
6966 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6967 i++;
6968 }
6969 }
6970
6971 j = len;
6972 if (striptype != LEFTSTRIP) {
6973 do {
6974 j--;
6975 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6976 j++;
6977 }
6978
6979 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6980 Py_INCREF(self);
6981 return (PyObject*)self;
6982 }
6983 else
6984 return PyUnicode_FromUnicode(s+i, j-i);
6985 }
6986
6987
6988 static PyObject *
6989 do_strip(PyUnicodeObject *self, int striptype)
6990 {
6991 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6992 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6993
6994 i = 0;
6995 if (striptype != RIGHTSTRIP) {
6996 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6997 i++;
6998 }
6999 }
7000
7001 j = len;
7002 if (striptype != LEFTSTRIP) {
7003 do {
7004 j--;
7005 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7006 j++;
7007 }
7008
7009 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7010 Py_INCREF(self);
7011 return (PyObject*)self;
7012 }
7013 else
7014 return PyUnicode_FromUnicode(s+i, j-i);
7015 }
7016
7017
7018 static PyObject *
7019 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7020 {
7021 PyObject *sep = NULL;
7022
7023 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7024 return NULL;
7025
7026 if (sep != NULL && sep != Py_None) {
7027 if (PyUnicode_Check(sep))
7028 return _PyUnicode_XStrip(self, striptype, sep);
7029 else if (PyString_Check(sep)) {
7030 PyObject *res;
7031 sep = PyUnicode_FromObject(sep);
7032 if (sep==NULL)
7033 return NULL;
7034 res = _PyUnicode_XStrip(self, striptype, sep);
7035 Py_DECREF(sep);
7036 return res;
7037 }
7038 else {
7039 PyErr_Format(PyExc_TypeError,
7040 "%s arg must be None, unicode or str",
7041 STRIPNAME(striptype));
7042 return NULL;
7043 }
7044 }
7045
7046 return do_strip(self, striptype);
7047 }
7048
7049
7050 PyDoc_STRVAR(strip__doc__,
7051 "S.strip([chars]) -> unicode\n\
7052 \n\
7053 Return a copy of the string S with leading and trailing\n\
7054 whitespace removed.\n\
7055 If chars is given and not None, remove characters in chars instead.\n\
7056 If chars is a str, it will be converted to unicode before stripping");
7057
7058 static PyObject *
7059 unicode_strip(PyUnicodeObject *self, PyObject *args)
7060 {
7061 if (PyTuple_GET_SIZE(args) == 0)
7062 return do_strip(self, BOTHSTRIP); /* Common case */
7063 else
7064 return do_argstrip(self, BOTHSTRIP, args);
7065 }
7066
7067
7068 PyDoc_STRVAR(lstrip__doc__,
7069 "S.lstrip([chars]) -> unicode\n\
7070 \n\
7071 Return a copy of the string S with leading whitespace removed.\n\
7072 If chars is given and not None, remove characters in chars instead.\n\
7073 If chars is a str, it will be converted to unicode before stripping");
7074
7075 static PyObject *
7076 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7077 {
7078 if (PyTuple_GET_SIZE(args) == 0)
7079 return do_strip(self, LEFTSTRIP); /* Common case */
7080 else
7081 return do_argstrip(self, LEFTSTRIP, args);
7082 }
7083
7084
7085 PyDoc_STRVAR(rstrip__doc__,
7086 "S.rstrip([chars]) -> unicode\n\
7087 \n\
7088 Return a copy of the string S with trailing whitespace removed.\n\
7089 If chars is given and not None, remove characters in chars instead.\n\
7090 If chars is a str, it will be converted to unicode before stripping");
7091
7092 static PyObject *
7093 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7094 {
7095 if (PyTuple_GET_SIZE(args) == 0)
7096 return do_strip(self, RIGHTSTRIP); /* Common case */
7097 else
7098 return do_argstrip(self, RIGHTSTRIP, args);
7099 }
7100
7101
7102 static PyObject*
7103 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7104 {
7105 PyUnicodeObject *u;
7106 Py_UNICODE *p;
7107 Py_ssize_t nchars;
7108 size_t nbytes;
7109
7110 if (len < 0)
7111 len = 0;
7112
7113 if (len == 1 && PyUnicode_CheckExact(str)) {
7114 /* no repeat, return original string */
7115 Py_INCREF(str);
7116 return (PyObject*) str;
7117 }
7118
7119 /* ensure # of chars needed doesn't overflow int and # of bytes
7120 * needed doesn't overflow size_t
7121 */
7122 nchars = len * str->length;
7123 if (len && nchars / len != str->length) {
7124 PyErr_SetString(PyExc_OverflowError,
7125 "repeated string is too long");
7126 return NULL;
7127 }
7128 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7129 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7130 PyErr_SetString(PyExc_OverflowError,
7131 "repeated string is too long");
7132 return NULL;
7133 }
7134 u = _PyUnicode_New(nchars);
7135 if (!u)
7136 return NULL;
7137
7138 p = u->str;
7139
7140 if (str->length == 1 && len > 0) {
7141 Py_UNICODE_FILL(p, str->str[0], len);
7142 } else {
7143 Py_ssize_t done = 0; /* number of characters copied this far */
7144 if (done < nchars) {
7145 Py_UNICODE_COPY(p, str->str, str->length);
7146 done = str->length;
7147 }
7148 while (done < nchars) {
7149 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7150 Py_UNICODE_COPY(p+done, p, n);
7151 done += n;
7152 }
7153 }
7154
7155 return (PyObject*) u;
7156 }
7157
7158 PyObject *PyUnicode_Replace(PyObject *obj,
7159 PyObject *subobj,
7160 PyObject *replobj,
7161 Py_ssize_t maxcount)
7162 {
7163 PyObject *self;
7164 PyObject *str1;
7165 PyObject *str2;
7166 PyObject *result;
7167
7168 self = PyUnicode_FromObject(obj);
7169 if (self == NULL)
7170 return NULL;
7171 str1 = PyUnicode_FromObject(subobj);
7172 if (str1 == NULL) {
7173 Py_DECREF(self);
7174 return NULL;
7175 }
7176 str2 = PyUnicode_FromObject(replobj);
7177 if (str2 == NULL) {
7178 Py_DECREF(self);
7179 Py_DECREF(str1);
7180 return NULL;
7181 }
7182 result = replace((PyUnicodeObject *)self,
7183 (PyUnicodeObject *)str1,
7184 (PyUnicodeObject *)str2,
7185 maxcount);
7186 Py_DECREF(self);
7187 Py_DECREF(str1);
7188 Py_DECREF(str2);
7189 return result;
7190 }
7191
7192 PyDoc_STRVAR(replace__doc__,
7193 "S.replace(old, new[, count]) -> unicode\n\
7194 \n\
7195 Return a copy of S with all occurrences of substring\n\
7196 old replaced by new. If the optional argument count is\n\
7197 given, only the first count occurrences are replaced.");
7198
7199 static PyObject*
7200 unicode_replace(PyUnicodeObject *self, PyObject *args)
7201 {
7202 PyUnicodeObject *str1;
7203 PyUnicodeObject *str2;
7204 Py_ssize_t maxcount = -1;
7205 PyObject *result;
7206
7207 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7208 return NULL;
7209 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7210 if (str1 == NULL)
7211 return NULL;
7212 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7213 if (str2 == NULL) {
7214 Py_DECREF(str1);
7215 return NULL;
7216 }
7217
7218 result = replace(self, str1, str2, maxcount);
7219
7220 Py_DECREF(str1);
7221 Py_DECREF(str2);
7222 return result;
7223 }
7224
7225 static
7226 PyObject *unicode_repr(PyObject *unicode)
7227 {
7228 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7229 PyUnicode_GET_SIZE(unicode),
7230 1);
7231 }
7232
7233 PyDoc_STRVAR(rfind__doc__,
7234 "S.rfind(sub [,start [,end]]) -> int\n\
7235 \n\
7236 Return the highest index in S where substring sub is found,\n\
7237 such that sub is contained within S[start:end]. Optional\n\
7238 arguments start and end are interpreted as in slice notation.\n\
7239 \n\
7240 Return -1 on failure.");
7241
7242 static PyObject *
7243 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7244 {
7245 PyUnicodeObject *substring;
7246 Py_ssize_t start;
7247 Py_ssize_t end;
7248 Py_ssize_t result;
7249
7250 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7251 &start, &end))
7252 return NULL;
7253
7254 result = stringlib_rfind_slice(
7255 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7256 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7257 start, end
7258 );
7259
7260 Py_DECREF(substring);
7261
7262 return PyInt_FromSsize_t(result);
7263 }
7264
7265 PyDoc_STRVAR(rindex__doc__,
7266 "S.rindex(sub [,start [,end]]) -> int\n\
7267 \n\
7268 Like S.rfind() but raise ValueError when the substring is not found.");
7269
7270 static PyObject *
7271 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7272 {
7273 PyUnicodeObject *substring;
7274 Py_ssize_t start;
7275 Py_ssize_t end;
7276 Py_ssize_t result;
7277
7278 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7279 &start, &end))
7280 return NULL;
7281
7282 result = stringlib_rfind_slice(
7283 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7284 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7285 start, end
7286 );
7287
7288 Py_DECREF(substring);
7289
7290 if (result < 0) {
7291 PyErr_SetString(PyExc_ValueError, "substring not found");
7292 return NULL;
7293 }
7294 return PyInt_FromSsize_t(result);
7295 }
7296
7297 PyDoc_STRVAR(rjust__doc__,
7298 "S.rjust(width[, fillchar]) -> unicode\n\
7299 \n\
7300 Return S right-justified in a Unicode string of length width. Padding is\n\
7301 done using the specified fill character (default is a space).");
7302
7303 static PyObject *
7304 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7305 {
7306 Py_ssize_t width;
7307 Py_UNICODE fillchar = ' ';
7308
7309 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7310 return NULL;
7311
7312 if (self->length >= width && PyUnicode_CheckExact(self)) {
7313 Py_INCREF(self);
7314 return (PyObject*) self;
7315 }
7316
7317 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7318 }
7319
7320 static PyObject*
7321 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7322 {
7323 /* standard clamping */
7324 if (start < 0)
7325 start = 0;
7326 if (end < 0)
7327 end = 0;
7328 if (end > self->length)
7329 end = self->length;
7330 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7331 /* full slice, return original string */
7332 Py_INCREF(self);
7333 return (PyObject*) self;
7334 }
7335 if (start > end)
7336 start = end;
7337 /* copy slice */
7338 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7339 end - start);
7340 }
7341
7342 PyObject *PyUnicode_Split(PyObject *s,
7343 PyObject *sep,
7344 Py_ssize_t maxsplit)
7345 {
7346 PyObject *result;
7347
7348 s = PyUnicode_FromObject(s);
7349 if (s == NULL)
7350 return NULL;
7351 if (sep != NULL) {
7352 sep = PyUnicode_FromObject(sep);
7353 if (sep == NULL) {
7354 Py_DECREF(s);
7355 return NULL;
7356 }
7357 }
7358
7359 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7360
7361 Py_DECREF(s);
7362 Py_XDECREF(sep);
7363 return result;
7364 }
7365
7366 PyDoc_STRVAR(split__doc__,
7367 "S.split([sep [,maxsplit]]) -> list of strings\n\
7368 \n\
7369 Return a list of the words in S, using sep as the\n\
7370 delimiter string. If maxsplit is given, at most maxsplit\n\
7371 splits are done. If sep is not specified or is None, any\n\
7372 whitespace string is a separator and empty strings are\n\
7373 removed from the result.");
7374
7375 static PyObject*
7376 unicode_split(PyUnicodeObject *self, PyObject *args)
7377 {
7378 PyObject *substring = Py_None;
7379 Py_ssize_t maxcount = -1;
7380
7381 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7382 return NULL;
7383
7384 if (substring == Py_None)
7385 return split(self, NULL, maxcount);
7386 else if (PyUnicode_Check(substring))
7387 return split(self, (PyUnicodeObject *)substring, maxcount);
7388 else
7389 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7390 }
7391
7392 PyObject *
7393 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7394 {
7395 PyObject* str_obj;
7396 PyObject* sep_obj;
7397 PyObject* out;
7398
7399 str_obj = PyUnicode_FromObject(str_in);
7400 if (!str_obj)
7401 return NULL;
7402 sep_obj = PyUnicode_FromObject(sep_in);
7403 if (!sep_obj) {
7404 Py_DECREF(str_obj);
7405 return NULL;
7406 }
7407
7408 out = stringlib_partition(
7409 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7410 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7411 );
7412
7413 Py_DECREF(sep_obj);
7414 Py_DECREF(str_obj);
7415
7416 return out;
7417 }
7418
7419
7420 PyObject *
7421 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7422 {
7423 PyObject* str_obj;
7424 PyObject* sep_obj;
7425 PyObject* out;
7426
7427 str_obj = PyUnicode_FromObject(str_in);
7428 if (!str_obj)
7429 return NULL;
7430 sep_obj = PyUnicode_FromObject(sep_in);
7431 if (!sep_obj) {
7432 Py_DECREF(str_obj);
7433 return NULL;
7434 }
7435
7436 out = stringlib_rpartition(
7437 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7438 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7439 );
7440
7441 Py_DECREF(sep_obj);
7442 Py_DECREF(str_obj);
7443
7444 return out;
7445 }
7446
7447 PyDoc_STRVAR(partition__doc__,
7448 "S.partition(sep) -> (head, sep, tail)\n\
7449 \n\
7450 Search for the separator sep in S, and return the part before it,\n\
7451 the separator itself, and the part after it. If the separator is not\n\
7452 found, return S and two empty strings.");
7453
7454 static PyObject*
7455 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7456 {
7457 return PyUnicode_Partition((PyObject *)self, separator);
7458 }
7459
7460 PyDoc_STRVAR(rpartition__doc__,
7461 "S.rpartition(sep) -> (head, sep, tail)\n\
7462 \n\
7463 Search for the separator sep in S, starting at the end of S, and return\n\
7464 the part before it, the separator itself, and the part after it. If the\n\
7465 separator is not found, return two empty strings and S.");
7466
7467 static PyObject*
7468 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7469 {
7470 return PyUnicode_RPartition((PyObject *)self, separator);
7471 }
7472
7473 PyObject *PyUnicode_RSplit(PyObject *s,
7474 PyObject *sep,
7475 Py_ssize_t maxsplit)
7476 {
7477 PyObject *result;
7478
7479 s = PyUnicode_FromObject(s);
7480 if (s == NULL)
7481 return NULL;
7482 if (sep != NULL) {
7483 sep = PyUnicode_FromObject(sep);
7484 if (sep == NULL) {
7485 Py_DECREF(s);
7486 return NULL;
7487 }
7488 }
7489
7490 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7491
7492 Py_DECREF(s);
7493 Py_XDECREF(sep);
7494 return result;
7495 }
7496
7497 PyDoc_STRVAR(rsplit__doc__,
7498 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7499 \n\
7500 Return a list of the words in S, using sep as the\n\
7501 delimiter string, starting at the end of the string and\n\
7502 working to the front. If maxsplit is given, at most maxsplit\n\
7503 splits are done. If sep is not specified, any whitespace string\n\
7504 is a separator.");
7505
7506 static PyObject*
7507 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7508 {
7509 PyObject *substring = Py_None;
7510 Py_ssize_t maxcount = -1;
7511
7512 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7513 return NULL;
7514
7515 if (substring == Py_None)
7516 return rsplit(self, NULL, maxcount);
7517 else if (PyUnicode_Check(substring))
7518 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7519 else
7520 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7521 }
7522
7523 PyDoc_STRVAR(splitlines__doc__,
7524 "S.splitlines([keepends]) -> list of strings\n\
7525 \n\
7526 Return a list of the lines in S, breaking at line boundaries.\n\
7527 Line breaks are not included in the resulting list unless keepends\n\
7528 is given and true.");
7529
7530 static PyObject*
7531 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7532 {
7533 int keepends = 0;
7534
7535 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7536 return NULL;
7537
7538 return PyUnicode_Splitlines((PyObject *)self, keepends);
7539 }
7540
7541 static
7542 PyObject *unicode_str(PyUnicodeObject *self)
7543 {
7544 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7545 }
7546
7547 PyDoc_STRVAR(swapcase__doc__,
7548 "S.swapcase() -> unicode\n\
7549 \n\
7550 Return a copy of S with uppercase characters converted to lowercase\n\
7551 and vice versa.");
7552
7553 static PyObject*
7554 unicode_swapcase(PyUnicodeObject *self)
7555 {
7556 return fixup(self, fixswapcase);
7557 }
7558
7559 PyDoc_STRVAR(translate__doc__,
7560 "S.translate(table) -> unicode\n\
7561 \n\
7562 Return a copy of the string S, where all characters have been mapped\n\
7563 through the given translation table, which must be a mapping of\n\
7564 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7565 Unmapped characters are left untouched. Characters mapped to None\n\
7566 are deleted.");
7567
7568 static PyObject*
7569 unicode_translate(PyUnicodeObject *self, PyObject *table)
7570 {
7571 return PyUnicode_TranslateCharmap(self->str,
7572 self->length,
7573 table,
7574 "ignore");
7575 }
7576
7577 PyDoc_STRVAR(upper__doc__,
7578 "S.upper() -> unicode\n\
7579 \n\
7580 Return a copy of S converted to uppercase.");
7581
7582 static PyObject*
7583 unicode_upper(PyUnicodeObject *self)
7584 {
7585 return fixup(self, fixupper);
7586 }
7587
7588 PyDoc_STRVAR(zfill__doc__,
7589 "S.zfill(width) -> unicode\n\
7590 \n\
7591 Pad a numeric string S with zeros on the left, to fill a field\n\
7592 of the specified width. The string S is never truncated.");
7593
7594 static PyObject *
7595 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7596 {
7597 Py_ssize_t fill;
7598 PyUnicodeObject *u;
7599
7600 Py_ssize_t width;
7601 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7602 return NULL;
7603
7604 if (self->length >= width) {
7605 if (PyUnicode_CheckExact(self)) {
7606 Py_INCREF(self);
7607 return (PyObject*) self;
7608 }
7609 else
7610 return PyUnicode_FromUnicode(
7611 PyUnicode_AS_UNICODE(self),
7612 PyUnicode_GET_SIZE(self)
7613 );
7614 }
7615
7616 fill = width - self->length;
7617
7618 u = pad(self, fill, 0, '0');
7619
7620 if (u == NULL)
7621 return NULL;
7622
7623 if (u->str[fill] == '+' || u->str[fill] == '-') {
7624 /* move sign to beginning of string */
7625 u->str[0] = u->str[fill];
7626 u->str[fill] = '0';
7627 }
7628
7629 return (PyObject*) u;
7630 }
7631
#if 0
/* Compiled out.  Returns the value of `numfree` as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7639
7640 PyDoc_STRVAR(startswith__doc__,
7641 "S.startswith(prefix[, start[, end]]) -> bool\n\
7642 \n\
7643 Return True if S starts with the specified prefix, False otherwise.\n\
7644 With optional start, test S beginning at that position.\n\
7645 With optional end, stop comparing S at that position.\n\
7646 prefix can also be a tuple of strings to try.");
7647
7648 static PyObject *
7649 unicode_startswith(PyUnicodeObject *self,
7650 PyObject *args)
7651 {
7652 PyObject *subobj;
7653 PyUnicodeObject *substring;
7654 Py_ssize_t start = 0;
7655 Py_ssize_t end = PY_SSIZE_T_MAX;
7656 int result;
7657
7658 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7659 return NULL;
7660 if (PyTuple_Check(subobj)) {
7661 Py_ssize_t i;
7662 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7663 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7664 PyTuple_GET_ITEM(subobj, i));
7665 if (substring == NULL)
7666 return NULL;
7667 result = tailmatch(self, substring, start, end, -1);
7668 Py_DECREF(substring);
7669 if (result) {
7670 Py_RETURN_TRUE;
7671 }
7672 }
7673 /* nothing matched */
7674 Py_RETURN_FALSE;
7675 }
7676 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7677 if (substring == NULL) {
7678 if (PyErr_ExceptionMatches(PyExc_TypeError))
7679 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7680 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7681 return NULL;
7682 }
7683 result = tailmatch(self, substring, start, end, -1);
7684 Py_DECREF(substring);
7685 return PyBool_FromLong(result);
7686 }
7687
7688
7689 PyDoc_STRVAR(endswith__doc__,
7690 "S.endswith(suffix[, start[, end]]) -> bool\n\
7691 \n\
7692 Return True if S ends with the specified suffix, False otherwise.\n\
7693 With optional start, test S beginning at that position.\n\
7694 With optional end, stop comparing S at that position.\n\
7695 suffix can also be a tuple of strings to try.");
7696
7697 static PyObject *
7698 unicode_endswith(PyUnicodeObject *self,
7699 PyObject *args)
7700 {
7701 PyObject *subobj;
7702 PyUnicodeObject *substring;
7703 Py_ssize_t start = 0;
7704 Py_ssize_t end = PY_SSIZE_T_MAX;
7705 int result;
7706
7707 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7708 return NULL;
7709 if (PyTuple_Check(subobj)) {
7710 Py_ssize_t i;
7711 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7712 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7713 PyTuple_GET_ITEM(subobj, i));
7714 if (substring == NULL)
7715 return NULL;
7716 result = tailmatch(self, substring, start, end, +1);
7717 Py_DECREF(substring);
7718 if (result) {
7719 Py_RETURN_TRUE;
7720 }
7721 }
7722 Py_RETURN_FALSE;
7723 }
7724 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7725 if (substring == NULL) {
7726 if (PyErr_ExceptionMatches(PyExc_TypeError))
7727 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7728 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7729 return NULL;
7730 }
7731 result = tailmatch(self, substring, start, end, +1);
7732 Py_DECREF(substring);
7733 return PyBool_FromLong(result);
7734 }
7735
7736
7737 /* Implements do_string_format, which is unicode because of stringlib */
7738 #include "stringlib/string_format.h"
7739
7740 PyDoc_STRVAR(format__doc__,
7741 "S.format(*args, **kwargs) -> unicode\n\
7742 \n\
7743 Return a formatted version of S, using substitutions from args and kwargs.\n\
7744 The substitutions are identified by braces ('{' and '}').");
7745
7746 static PyObject *
7747 unicode__format__(PyObject *self, PyObject *args)
7748 {
7749 PyObject *format_spec;
7750 PyObject *result = NULL;
7751 PyObject *tmp = NULL;
7752
7753 /* If 2.x, convert format_spec to the same type as value */
7754 /* This is to allow things like u''.format('') */
7755 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7756 goto done;
7757 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7758 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7759 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7760 goto done;
7761 }
7762 tmp = PyObject_Unicode(format_spec);
7763 if (tmp == NULL)
7764 goto done;
7765 format_spec = tmp;
7766
7767 result = _PyUnicode_FormatAdvanced(self,
7768 PyUnicode_AS_UNICODE(format_spec),
7769 PyUnicode_GET_SIZE(format_spec));
7770 done:
7771 Py_XDECREF(tmp);
7772 return result;
7773 }
7774
7775 PyDoc_STRVAR(p_format__doc__,
7776 "S.__format__(format_spec) -> unicode\n\
7777 \n\
7778 Return a formatted version of S as described by format_spec.");
7779
7780 static PyObject *
7781 unicode__sizeof__(PyUnicodeObject *v)
7782 {
7783 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7784 sizeof(Py_UNICODE) * (v->length + 1));
7785 }
7786
7787 PyDoc_STRVAR(sizeof__doc__,
7788 "S.__sizeof__() -> size of S in memory, in bytes\n\
7789 \n\
7790 ");
7791
7792 static PyObject *
7793 unicode_getnewargs(PyUnicodeObject *v)
7794 {
7795 return Py_BuildValue("(u#)", v->str, v->length);
7796 }
7797
7798
7799 static PyMethodDef unicode_methods[] = {
7800
7801 /* Order is according to common usage: often used methods should
7802 appear first, since lookup is done sequentially. */
7803
7804 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7805 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7806 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7807 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7808 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7809 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7810 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7811 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7812 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7813 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7814 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7815 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7816 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7817 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7818 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7819 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7820 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7821 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7822 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7823 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7824 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7825 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7826 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7827 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7828 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7829 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7830 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7831 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7832 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7833 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7834 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7835 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7836 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7837 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7838 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7839 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7840 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7841 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7842 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7843 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7844 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7845 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7846 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7847 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7848 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7849 #if 0
7850 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7851 #endif
7852
7853 #if 0
7854 /* This one is just used for debugging the implementation. */
7855 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7856 #endif
7857
7858 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7859 {NULL, NULL}
7860 };
7861
7862 static PyObject *
7863 unicode_mod(PyObject *v, PyObject *w)
7864 {
7865 if (!PyUnicode_Check(v)) {
7866 Py_INCREF(Py_NotImplemented);
7867 return Py_NotImplemented;
7868 }
7869 return PyUnicode_Format(v, w);
7870 }
7871
7872 static PyNumberMethods unicode_as_number = {
7873 0, /*nb_add*/
7874 0, /*nb_subtract*/
7875 0, /*nb_multiply*/
7876 0, /*nb_divide*/
7877 unicode_mod, /*nb_remainder*/
7878 };
7879
7880 static PySequenceMethods unicode_as_sequence = {
7881 (lenfunc) unicode_length, /* sq_length */
7882 PyUnicode_Concat, /* sq_concat */
7883 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7884 (ssizeargfunc) unicode_getitem, /* sq_item */
7885 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7886 0, /* sq_ass_item */
7887 0, /* sq_ass_slice */
7888 PyUnicode_Contains, /* sq_contains */
7889 };
7890
7891 static PyObject*
7892 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7893 {
7894 if (PyIndex_Check(item)) {
7895 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7896 if (i == -1 && PyErr_Occurred())
7897 return NULL;
7898 if (i < 0)
7899 i += PyUnicode_GET_SIZE(self);
7900 return unicode_getitem(self, i);
7901 } else if (PySlice_Check(item)) {
7902 Py_ssize_t start, stop, step, slicelength, cur, i;
7903 Py_UNICODE* source_buf;
7904 Py_UNICODE* result_buf;
7905 PyObject* result;
7906
7907 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7908 &start, &stop, &step, &slicelength) < 0) {
7909 return NULL;
7910 }
7911
7912 if (slicelength <= 0) {
7913 return PyUnicode_FromUnicode(NULL, 0);
7914 } else if (start == 0 && step == 1 && slicelength == self->length &&
7915 PyUnicode_CheckExact(self)) {
7916 Py_INCREF(self);
7917 return (PyObject *)self;
7918 } else if (step == 1) {
7919 return PyUnicode_FromUnicode(self->str + start, slicelength);
7920 } else {
7921 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7922 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7923 sizeof(Py_UNICODE));
7924
7925 if (result_buf == NULL)
7926 return PyErr_NoMemory();
7927
7928 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7929 result_buf[i] = source_buf[cur];
7930 }
7931
7932 result = PyUnicode_FromUnicode(result_buf, slicelength);
7933 PyObject_FREE(result_buf);
7934 return result;
7935 }
7936 } else {
7937 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7938 return NULL;
7939 }
7940 }
7941
7942 static PyMappingMethods unicode_as_mapping = {
7943 (lenfunc)unicode_length, /* mp_length */
7944 (binaryfunc)unicode_subscript, /* mp_subscript */
7945 (objobjargproc)0, /* mp_ass_subscript */
7946 };
7947
7948 static Py_ssize_t
7949 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7950 Py_ssize_t index,
7951 const void **ptr)
7952 {
7953 if (index != 0) {
7954 PyErr_SetString(PyExc_SystemError,
7955 "accessing non-existent unicode segment");
7956 return -1;
7957 }
7958 *ptr = (void *) self->str;
7959 return PyUnicode_GET_DATA_SIZE(self);
7960 }
7961
7962 static Py_ssize_t
7963 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7964 const void **ptr)
7965 {
7966 PyErr_SetString(PyExc_TypeError,
7967 "cannot use unicode as modifiable buffer");
7968 return -1;
7969 }
7970
7971 static int
7972 unicode_buffer_getsegcount(PyUnicodeObject *self,
7973 Py_ssize_t *lenp)
7974 {
7975 if (lenp)
7976 *lenp = PyUnicode_GET_DATA_SIZE(self);
7977 return 1;
7978 }
7979
7980 static Py_ssize_t
7981 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7982 Py_ssize_t index,
7983 const void **ptr)
7984 {
7985 PyObject *str;
7986
7987 if (index != 0) {
7988 PyErr_SetString(PyExc_SystemError,
7989 "accessing non-existent unicode segment");
7990 return -1;
7991 }
7992 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7993 if (str == NULL)
7994 return -1;
7995 *ptr = (void *) PyString_AS_STRING(str);
7996 return PyString_GET_SIZE(str);
7997 }
7998
7999 /* Helpers for PyUnicode_Format() */
8000
8001 static PyObject *
8002 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8003 {
8004 Py_ssize_t argidx = *p_argidx;
8005 if (argidx < arglen) {
8006 (*p_argidx)++;
8007 if (arglen < 0)
8008 return args;
8009 else
8010 return PyTuple_GetItem(args, argidx);
8011 }
8012 PyErr_SetString(PyExc_TypeError,
8013 "not enough arguments for format string");
8014 return NULL;
8015 }
8016
/* Single-bit flag values, combined in the `flags` argument of the
   formatting helpers (e.g. formatfloat() tests F_ALT). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8022
8023 static Py_ssize_t
8024 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8025 {
8026 register Py_ssize_t i;
8027 Py_ssize_t len = strlen(charbuffer);
8028 for (i = len - 1; i >= 0; i--)
8029 buffer[i] = (Py_UNICODE) charbuffer[i];
8030
8031 return len;
8032 }
8033
8034 static int
8035 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8036 {
8037 Py_ssize_t result;
8038
8039 PyOS_snprintf((char *)buffer, len, format, x);
8040 result = strtounicode(buffer, (char *)buffer);
8041 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8042 }
8043
8044 /* XXX To save some code duplication, formatfloat/long/int could have been
8045 shared with stringobject.c, converting from 8-bit to Unicode after the
8046 formatting is done. */
8047
8048 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8049
8050 static PyObject *
8051 formatfloat(PyObject *v, int flags, int prec, int type)
8052 {
8053 char *p;
8054 PyObject *result;
8055 double x;
8056
8057 x = PyFloat_AsDouble(v);
8058 if (x == -1.0 && PyErr_Occurred())
8059 return NULL;
8060
8061 if (prec < 0)
8062 prec = 6;
8063
8064 p = PyOS_double_to_string(x, type, prec,
8065 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8066 if (p == NULL)
8067 return NULL;
8068 result = PyUnicode_FromStringAndSize(p, strlen(p));
8069 PyMem_Free(p);
8070 return result;
8071 }
8072
8073 static PyObject*
8074 formatlong(PyObject *val, int flags, int prec, int type)
8075 {
8076 char *buf;
8077 int i, len;
8078 PyObject *str; /* temporary string object. */
8079 PyUnicodeObject *result;
8080
8081 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8082 if (!str)
8083 return NULL;
8084 result = _PyUnicode_New(len);
8085 if (!result) {
8086 Py_DECREF(str);
8087 return NULL;
8088 }
8089 for (i = 0; i < len; i++)
8090 result->str[i] = buf[i];
8091 result->str[len] = 0;
8092 Py_DECREF(str);
8093 return (PyObject*)result;
8094 }
8095
8096 static int
8097 formatint(Py_UNICODE *buf,
8098 size_t buflen,
8099 int flags,
8100 int prec,
8101 int type,
8102 PyObject *v)
8103 {
8104 /* fmt = '%#.' + `prec` + 'l' + `type`
8105 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8106 * + 1 + 1
8107 * = 24
8108 */
8109 char fmt[64]; /* plenty big enough! */
8110 char *sign;
8111 long x;
8112
8113 x = PyInt_AsLong(v);
8114 if (x == -1 && PyErr_Occurred())
8115 return -1;
8116 if (x < 0 && type == 'u') {
8117 type = 'd';
8118 }
8119 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8120 sign = "-";
8121 else
8122 sign = "";
8123 if (prec < 0)
8124 prec = 1;
8125
8126 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8127 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8128 */
8129 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8130 PyErr_SetString(PyExc_OverflowError,
8131 "formatted integer is too long (precision too large?)");
8132 return -1;
8133 }
8134
8135 if ((flags & F_ALT) &&
8136 (type == 'x' || type == 'X')) {
8137 /* When converting under %#x or %#X, there are a number
8138 * of issues that cause pain:
8139 * - when 0 is being converted, the C standard leaves off
8140 * the '0x' or '0X', which is inconsistent with other
8141 * %#x/%#X conversions and inconsistent with Python's
8142 * hex() function
8143 * - there are platforms that violate the standard and
8144 * convert 0 with the '0x' or '0X'
8145 * (Metrowerks, Compaq Tru64)
8146 * - there are platforms that give '0x' when converting
8147 * under %#X, but convert 0 in accordance with the
8148 * standard (OS/2 EMX)
8149 *
8150 * We can achieve the desired consistency by inserting our
8151 * own '0x' or '0X' prefix, and substituting %x/%X in place
8152 * of %#x/%#X.
8153 *
8154 * Note that this is the same approach as used in
8155 * formatint() in stringobject.c
8156 */
8157 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8158 sign, type, prec, type);
8159 }
8160 else {
8161 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8162 sign, (flags&F_ALT) ? "#" : "",
8163 prec, type);
8164 }
8165 if (sign[0])
8166 return longtounicode(buf, buflen, fmt, -x);
8167 else
8168 return longtounicode(buf, buflen, fmt, x);
8169 }
8170
8171 static int
8172 formatchar(Py_UNICODE *buf,
8173 size_t buflen,
8174 PyObject *v)
8175 {
8176 PyObject *unistr;
8177 char *str;
8178 /* presume that the buffer is at least 2 characters long */
8179 if (PyUnicode_Check(v)) {
8180 if (PyUnicode_GET_SIZE(v) != 1)
8181 goto onError;
8182 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8183 }
8184
8185 else if (PyString_Check(v)) {
8186 if (PyString_GET_SIZE(v) != 1)
8187 goto onError;
8188 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8189 with a UnicodeDecodeError if 'char' is not decodable with the
8190 default encoding (usually ASCII, but it might be something else) */
8191 str = PyString_AS_STRING(v);
8192 if ((unsigned char)str[0] > 0x7F) {
8193 /* the char is not ASCII; try to decode the string using the
8194 default encoding and return -1 to let the UnicodeDecodeError
8195 be raised if the string can't be decoded */
8196 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8197 if (unistr == NULL)
8198 return -1;
8199 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8200 Py_DECREF(unistr);
8201 }
8202 else
8203 buf[0] = (Py_UNICODE)str[0];
8204 }
8205
8206 else {
8207 /* Integer input truncated to a character */
8208 long x;
8209 x = PyInt_AsLong(v);
8210 if (x == -1 && PyErr_Occurred())
8211 goto onError;
8212 #ifdef Py_UNICODE_WIDE
8213 if (x < 0 || x > 0x10ffff) {
8214 PyErr_SetString(PyExc_OverflowError,
8215 "%c arg not in range(0x110000) "
8216 "(wide Python build)");
8217 return -1;
8218 }
8219 #else
8220 if (x < 0 || x > 0xffff) {
8221 PyErr_SetString(PyExc_OverflowError,
8222 "%c arg not in range(0x10000) "
8223 "(narrow Python build)");
8224 return -1;
8225 }
8226 #endif
8227 buf[0] = (Py_UNICODE) x;
8228 }
8229 buf[1] = '\0';
8230 return 1;
8231
8232 onError:
8233 PyErr_SetString(PyExc_TypeError,
8234 "%c requires int or char");
8235 return -1;
8236 }
8237
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in which
   ints and chars are formatted.  XXX This is a magic number.  Every
   formatting routine bounds-checks against it so the buffer cannot
   overflow, but allocating a buffer of the appropriate size for each
   format might be a better solution.  For now this is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
8247
8248 PyObject *PyUnicode_Format(PyObject *format,
8249 PyObject *args)
8250 {
8251 Py_UNICODE *fmt, *res;
8252 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8253 int args_owned = 0;
8254 PyUnicodeObject *result = NULL;
8255 PyObject *dict = NULL;
8256 PyObject *uformat;
8257
8258 if (format == NULL || args == NULL) {
8259 PyErr_BadInternalCall();
8260 return NULL;
8261 }
8262 uformat = PyUnicode_FromObject(format);
8263 if (uformat == NULL)
8264 return NULL;
8265 fmt = PyUnicode_AS_UNICODE(uformat);
8266 fmtcnt = PyUnicode_GET_SIZE(uformat);
8267
8268 reslen = rescnt = fmtcnt + 100;
8269 result = _PyUnicode_New(reslen);
8270 if (result == NULL)
8271 goto onError;
8272 res = PyUnicode_AS_UNICODE(result);
8273
8274 if (PyTuple_Check(args)) {
8275 arglen = PyTuple_Size(args);
8276 argidx = 0;
8277 }
8278 else {
8279 arglen = -1;
8280 argidx = -2;
8281 }
8282 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8283 !PyObject_TypeCheck(args, &PyBaseString_Type))
8284 dict = args;
8285
8286 while (--fmtcnt >= 0) {
8287 if (*fmt != '%') {
8288 if (--rescnt < 0) {
8289 rescnt = fmtcnt + 100;
8290 reslen += rescnt;
8291 if (_PyUnicode_Resize(&result, reslen) < 0)
8292 goto onError;
8293 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8294 --rescnt;
8295 }
8296 *res++ = *fmt++;
8297 }
8298 else {
8299 /* Got a format specifier */
8300 int flags = 0;
8301 Py_ssize_t width = -1;
8302 int prec = -1;
8303 Py_UNICODE c = '\0';
8304 Py_UNICODE fill;
8305 int isnumok;
8306 PyObject *v = NULL;
8307 PyObject *temp = NULL;
8308 Py_UNICODE *pbuf;
8309 Py_UNICODE sign;
8310 Py_ssize_t len;
8311 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8312
8313 fmt++;
8314 if (*fmt == '(') {
8315 Py_UNICODE *keystart;
8316 Py_ssize_t keylen;
8317 PyObject *key;
8318 int pcount = 1;
8319
8320 if (dict == NULL) {
8321 PyErr_SetString(PyExc_TypeError,
8322 "format requires a mapping");
8323 goto onError;
8324 }
8325 ++fmt;
8326 --fmtcnt;
8327 keystart = fmt;
8328 /* Skip over balanced parentheses */
8329 while (pcount > 0 && --fmtcnt >= 0) {
8330 if (*fmt == ')')
8331 --pcount;
8332 else if (*fmt == '(')
8333 ++pcount;
8334 fmt++;
8335 }
8336 keylen = fmt - keystart - 1;
8337 if (fmtcnt < 0 || pcount > 0) {
8338 PyErr_SetString(PyExc_ValueError,
8339 "incomplete format key");
8340 goto onError;
8341 }
8342 #if 0
8343 /* keys are converted to strings using UTF-8 and
8344 then looked up since Python uses strings to hold
8345 variables names etc. in its namespaces and we
8346 wouldn't want to break common idioms. */
8347 key = PyUnicode_EncodeUTF8(keystart,
8348 keylen,
8349 NULL);
8350 #else
8351 key = PyUnicode_FromUnicode(keystart, keylen);
8352 #endif
8353 if (key == NULL)
8354 goto onError;
8355 if (args_owned) {
8356 Py_DECREF(args);
8357 args_owned = 0;
8358 }
8359 args = PyObject_GetItem(dict, key);
8360 Py_DECREF(key);
8361 if (args == NULL) {
8362 goto onError;
8363 }
8364 args_owned = 1;
8365 arglen = -1;
8366 argidx = -2;
8367 }
8368 while (--fmtcnt >= 0) {
8369 switch (c = *fmt++) {
8370 case '-': flags |= F_LJUST; continue;
8371 case '+': flags |= F_SIGN; continue;
8372 case ' ': flags |= F_BLANK; continue;
8373 case '#': flags |= F_ALT; continue;
8374 case '0': flags |= F_ZERO; continue;
8375 }
8376 break;
8377 }
8378 if (c == '*') {
8379 v = getnextarg(args, arglen, &argidx);
8380 if (v == NULL)
8381 goto onError;
8382 if (!PyInt_Check(v)) {
8383 PyErr_SetString(PyExc_TypeError,
8384 "* wants int");
8385 goto onError;
8386 }
8387 width = PyInt_AsLong(v);
8388 if (width < 0) {
8389 flags |= F_LJUST;
8390 width = -width;
8391 }
8392 if (--fmtcnt >= 0)
8393 c = *fmt++;
8394 }
8395 else if (c >= '0' && c <= '9') {
8396 width = c - '0';
8397 while (--fmtcnt >= 0) {
8398 c = *fmt++;
8399 if (c < '0' || c > '9')
8400 break;
8401 if ((width*10) / 10 != width) {
8402 PyErr_SetString(PyExc_ValueError,
8403 "width too big");
8404 goto onError;
8405 }
8406 width = width*10 + (c - '0');
8407 }
8408 }
8409 if (c == '.') {
8410 prec = 0;
8411 if (--fmtcnt >= 0)
8412 c = *fmt++;
8413 if (c == '*') {
8414 v = getnextarg(args, arglen, &argidx);
8415 if (v == NULL)
8416 goto onError;
8417 if (!PyInt_Check(v)) {
8418 PyErr_SetString(PyExc_TypeError,
8419 "* wants int");
8420 goto onError;
8421 }
8422 prec = PyInt_AsLong(v);
8423 if (prec < 0)
8424 prec = 0;
8425 if (--fmtcnt >= 0)
8426 c = *fmt++;
8427 }
8428 else if (c >= '0' && c <= '9') {
8429 prec = c - '0';
8430 while (--fmtcnt >= 0) {
8431 c = *fmt++;
8432 if (c < '0' || c > '9')
8433 break;
8434 if ((prec*10) / 10 != prec) {
8435 PyErr_SetString(PyExc_ValueError,
8436 "prec too big");
8437 goto onError;
8438 }
8439 prec = prec*10 + (c - '0');
8440 }
8441 }
8442 } /* prec */
8443 if (fmtcnt >= 0) {
8444 if (c == 'h' || c == 'l' || c == 'L') {
8445 if (--fmtcnt >= 0)
8446 c = *fmt++;
8447 }
8448 }
8449 if (fmtcnt < 0) {
8450 PyErr_SetString(PyExc_ValueError,
8451 "incomplete format");
8452 goto onError;
8453 }
8454 if (c != '%') {
8455 v = getnextarg(args, arglen, &argidx);
8456 if (v == NULL)
8457 goto onError;
8458 }
8459 sign = 0;
8460 fill = ' ';
8461 switch (c) {
8462
8463 case '%':
8464 pbuf = formatbuf;
8465 /* presume that buffer length is at least 1 */
8466 pbuf[0] = '%';
8467 len = 1;
8468 break;
8469
8470 case 's':
8471 case 'r':
8472 if (PyUnicode_CheckExact(v) && c == 's') {
8473 temp = v;
8474 Py_INCREF(temp);
8475 }
8476 else {
8477 PyObject *unicode;
8478 if (c == 's')
8479 temp = PyObject_Unicode(v);
8480 else
8481 temp = PyObject_Repr(v);
8482 if (temp == NULL)
8483 goto onError;
8484 if (PyUnicode_Check(temp))
8485 /* nothing to do */;
8486 else if (PyString_Check(temp)) {
8487 /* convert to string to Unicode */
8488 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8489 PyString_GET_SIZE(temp),
8490 NULL,
8491 "strict");
8492 Py_DECREF(temp);
8493 temp = unicode;
8494 if (temp == NULL)
8495 goto onError;
8496 }
8497 else {
8498 Py_DECREF(temp);
8499 PyErr_SetString(PyExc_TypeError,
8500 "%s argument has non-string str()");
8501 goto onError;
8502 }
8503 }
8504 pbuf = PyUnicode_AS_UNICODE(temp);
8505 len = PyUnicode_GET_SIZE(temp);
8506 if (prec >= 0 && len > prec)
8507 len = prec;
8508 break;
8509
8510 case 'i':
8511 case 'd':
8512 case 'u':
8513 case 'o':
8514 case 'x':
8515 case 'X':
8516 if (c == 'i')
8517 c = 'd';
8518 isnumok = 0;
8519 if (PyNumber_Check(v)) {
8520 PyObject *iobj=NULL;
8521
8522 if (PyInt_Check(v) || (PyLong_Check(v))) {
8523 iobj = v;
8524 Py_INCREF(iobj);
8525 }
8526 else {
8527 iobj = PyNumber_Int(v);
8528 if (iobj==NULL) iobj = PyNumber_Long(v);
8529 }
8530 if (iobj!=NULL) {
8531 if (PyInt_Check(iobj)) {
8532 isnumok = 1;
8533 pbuf = formatbuf;
8534 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8535 flags, prec, c, iobj);
8536 Py_DECREF(iobj);
8537 if (len < 0)
8538 goto onError;
8539 sign = 1;
8540 }
8541 else if (PyLong_Check(iobj)) {
8542 isnumok = 1;
8543 temp = formatlong(iobj, flags, prec, c);
8544 Py_DECREF(iobj);
8545 if (!temp)
8546 goto onError;
8547 pbuf = PyUnicode_AS_UNICODE(temp);
8548 len = PyUnicode_GET_SIZE(temp);
8549 sign = 1;
8550 }
8551 else {
8552 Py_DECREF(iobj);
8553 }
8554 }
8555 }
8556 if (!isnumok) {
8557 PyErr_Format(PyExc_TypeError,
8558 "%%%c format: a number is required, "
8559 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8560 goto onError;
8561 }
8562 if (flags & F_ZERO)
8563 fill = '0';
8564 break;
8565
8566 case 'e':
8567 case 'E':
8568 case 'f':
8569 case 'F':
8570 case 'g':
8571 case 'G':
8572 temp = formatfloat(v, flags, prec, c);
8573 if (temp == NULL)
8574 goto onError;
8575 pbuf = PyUnicode_AS_UNICODE(temp);
8576 len = PyUnicode_GET_SIZE(temp);
8577 sign = 1;
8578 if (flags & F_ZERO)
8579 fill = '0';
8580 break;
8581
8582 case 'c':
8583 pbuf = formatbuf;
8584 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8585 if (len < 0)
8586 goto onError;
8587 break;
8588
8589 default:
8590 PyErr_Format(PyExc_ValueError,
8591 "unsupported format character '%c' (0x%x) "
8592 "at index %zd",
8593 (31<=c && c<=126) ? (char)c : '?',
8594 (int)c,
8595 (Py_ssize_t)(fmt - 1 -
8596 PyUnicode_AS_UNICODE(uformat)));
8597 goto onError;
8598 }
8599 if (sign) {
8600 if (*pbuf == '-' || *pbuf == '+') {
8601 sign = *pbuf++;
8602 len--;
8603 }
8604 else if (flags & F_SIGN)
8605 sign = '+';
8606 else if (flags & F_BLANK)
8607 sign = ' ';
8608 else
8609 sign = 0;
8610 }
8611 if (width < len)
8612 width = len;
8613 if (rescnt - (sign != 0) < width) {
8614 reslen -= rescnt;
8615 rescnt = width + fmtcnt + 100;
8616 reslen += rescnt;
8617 if (reslen < 0) {
8618 Py_XDECREF(temp);
8619 PyErr_NoMemory();
8620 goto onError;
8621 }
8622 if (_PyUnicode_Resize(&result, reslen) < 0) {
8623 Py_XDECREF(temp);
8624 goto onError;
8625 }
8626 res = PyUnicode_AS_UNICODE(result)
8627 + reslen - rescnt;
8628 }
8629 if (sign) {
8630 if (fill != ' ')
8631 *res++ = sign;
8632 rescnt--;
8633 if (width > len)
8634 width--;
8635 }
8636 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8637 assert(pbuf[0] == '0');
8638 assert(pbuf[1] == c);
8639 if (fill != ' ') {
8640 *res++ = *pbuf++;
8641 *res++ = *pbuf++;
8642 }
8643 rescnt -= 2;
8644 width -= 2;
8645 if (width < 0)
8646 width = 0;
8647 len -= 2;
8648 }
8649 if (width > len && !(flags & F_LJUST)) {
8650 do {
8651 --rescnt;
8652 *res++ = fill;
8653 } while (--width > len);
8654 }
8655 if (fill == ' ') {
8656 if (sign)
8657 *res++ = sign;
8658 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8659 assert(pbuf[0] == '0');
8660 assert(pbuf[1] == c);
8661 *res++ = *pbuf++;
8662 *res++ = *pbuf++;
8663 }
8664 }
8665 Py_UNICODE_COPY(res, pbuf, len);
8666 res += len;
8667 rescnt -= len;
8668 while (--width >= len) {
8669 --rescnt;
8670 *res++ = ' ';
8671 }
8672 if (dict && (argidx < arglen) && c != '%') {
8673 PyErr_SetString(PyExc_TypeError,
8674 "not all arguments converted during string formatting");
8675 Py_XDECREF(temp);
8676 goto onError;
8677 }
8678 Py_XDECREF(temp);
8679 } /* '%' */
8680 } /* until end */
8681 if (argidx < arglen && !dict) {
8682 PyErr_SetString(PyExc_TypeError,
8683 "not all arguments converted during string formatting");
8684 goto onError;
8685 }
8686
8687 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8688 goto onError;
8689 if (args_owned) {
8690 Py_DECREF(args);
8691 }
8692 Py_DECREF(uformat);
8693 return (PyObject *)result;
8694
8695 onError:
8696 Py_XDECREF(result);
8697 Py_DECREF(uformat);
8698 if (args_owned) {
8699 Py_DECREF(args);
8700 }
8701 return NULL;
8702 }
8703
8704 static PyBufferProcs unicode_as_buffer = {
8705 (readbufferproc) unicode_buffer_getreadbuf,
8706 (writebufferproc) unicode_buffer_getwritebuf,
8707 (segcountproc) unicode_buffer_getsegcount,
8708 (charbufferproc) unicode_buffer_getcharbuf,
8709 };
8710
8711 static PyObject *
8712 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8713
8714 static PyObject *
8715 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8716 {
8717 PyObject *x = NULL;
8718 static char *kwlist[] = {"string", "encoding", "errors", 0};
8719 char *encoding = NULL;
8720 char *errors = NULL;
8721
8722 if (type != &PyUnicode_Type)
8723 return unicode_subtype_new(type, args, kwds);
8724 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8725 kwlist, &x, &encoding, &errors))
8726 return NULL;
8727 if (x == NULL)
8728 return (PyObject *)_PyUnicode_New(0);
8729 if (encoding == NULL && errors == NULL)
8730 return PyObject_Unicode(x);
8731 else
8732 return PyUnicode_FromEncodedObject(x, encoding, errors);
8733 }
8734
8735 static PyObject *
8736 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8737 {
8738 PyUnicodeObject *tmp, *pnew;
8739 Py_ssize_t n;
8740
8741 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8742 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8743 if (tmp == NULL)
8744 return NULL;
8745 assert(PyUnicode_Check(tmp));
8746 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8747 if (pnew == NULL) {
8748 Py_DECREF(tmp);
8749 return NULL;
8750 }
8751 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8752 if (pnew->str == NULL) {
8753 _Py_ForgetReference((PyObject *)pnew);
8754 PyObject_Del(pnew);
8755 Py_DECREF(tmp);
8756 return PyErr_NoMemory();
8757 }
8758 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8759 pnew->length = n;
8760 pnew->hash = tmp->hash;
8761 Py_DECREF(tmp);
8762 return (PyObject *)pnew;
8763 }
8764
8765 PyDoc_STRVAR(unicode_doc,
8766 "unicode(string [, encoding[, errors]]) -> object\n\
8767 \n\
8768 Create a new Unicode object from the given encoded string.\n\
8769 encoding defaults to the current default string encoding.\n\
8770 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8771
8772 PyTypeObject PyUnicode_Type = {
8773 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8774 "unicode", /* tp_name */
8775 sizeof(PyUnicodeObject), /* tp_size */
8776 0, /* tp_itemsize */
8777 /* Slots */
8778 (destructor)unicode_dealloc, /* tp_dealloc */
8779 0, /* tp_print */
8780 0, /* tp_getattr */
8781 0, /* tp_setattr */
8782 0, /* tp_compare */
8783 unicode_repr, /* tp_repr */
8784 &unicode_as_number, /* tp_as_number */
8785 &unicode_as_sequence, /* tp_as_sequence */
8786 &unicode_as_mapping, /* tp_as_mapping */
8787 (hashfunc) unicode_hash, /* tp_hash*/
8788 0, /* tp_call*/
8789 (reprfunc) unicode_str, /* tp_str */
8790 PyObject_GenericGetAttr, /* tp_getattro */
8791 0, /* tp_setattro */
8792 &unicode_as_buffer, /* tp_as_buffer */
8793 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8794 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8795 unicode_doc, /* tp_doc */
8796 0, /* tp_traverse */
8797 0, /* tp_clear */
8798 PyUnicode_RichCompare, /* tp_richcompare */
8799 0, /* tp_weaklistoffset */
8800 0, /* tp_iter */
8801 0, /* tp_iternext */
8802 unicode_methods, /* tp_methods */
8803 0, /* tp_members */
8804 0, /* tp_getset */
8805 &PyBaseString_Type, /* tp_base */
8806 0, /* tp_dict */
8807 0, /* tp_descr_get */
8808 0, /* tp_descr_set */
8809 0, /* tp_dictoffset */
8810 0, /* tp_init */
8811 0, /* tp_alloc */
8812 unicode_new, /* tp_new */
8813 PyObject_Del, /* tp_free */
8814 };
8815
8816 /* Initialize the Unicode implementation */
8817
8818 void _PyUnicode_Init(void)
8819 {
8820 int i;
8821
8822 /* XXX - move this array to unicodectype.c ? */
8823 Py_UNICODE linebreak[] = {
8824 0x000A, /* LINE FEED */
8825 0x000D, /* CARRIAGE RETURN */
8826 0x001C, /* FILE SEPARATOR */
8827 0x001D, /* GROUP SEPARATOR */
8828 0x001E, /* RECORD SEPARATOR */
8829 0x0085, /* NEXT LINE */
8830 0x2028, /* LINE SEPARATOR */
8831 0x2029, /* PARAGRAPH SEPARATOR */
8832 };
8833
8834 /* Init the implementation */
8835 free_list = NULL;
8836 numfree = 0;
8837 unicode_empty = _PyUnicode_New(0);
8838 if (!unicode_empty)
8839 return;
8840
8841 strcpy(unicode_default_encoding, "ascii");
8842 for (i = 0; i < 256; i++)
8843 unicode_latin1[i] = NULL;
8844 if (PyType_Ready(&PyUnicode_Type) < 0)
8845 Py_FatalError("Can't initialize 'unicode'");
8846
8847 /* initialize the linebreak bloom filter */
8848 bloom_linebreak = make_bloom_mask(
8849 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8850 );
8851
8852 PyType_Ready(&EncodingMapType);
8853 }
8854
8855 /* Finalize the Unicode implementation */
8856
8857 int
8858 PyUnicode_ClearFreeList(void)
8859 {
8860 int freelist_size = numfree;
8861 PyUnicodeObject *u;
8862
8863 for (u = free_list; u != NULL;) {
8864 PyUnicodeObject *v = u;
8865 u = *(PyUnicodeObject **)u;
8866 if (v->str)
8867 PyObject_DEL(v->str);
8868 Py_XDECREF(v->defenc);
8869 PyObject_Del(v);
8870 numfree--;
8871 }
8872 free_list = NULL;
8873 assert(numfree == 0);
8874 return freelist_size;
8875 }
8876
8877 void
8878 _PyUnicode_Fini(void)
8879 {
8880 int i;
8881
8882 Py_XDECREF(unicode_empty);
8883 unicode_empty = NULL;
8884
8885 for (i = 0; i < 256; i++) {
8886 if (unicode_latin1[i]) {
8887 Py_DECREF(unicode_latin1[i]);
8888 unicode_latin1[i] = NULL;
8889 }
8890 }
8891 (void)PyUnicode_ClearFreeList();
8892 }
8893
8894 void _PyUnicode_DebugMallocStats(FILE *out)
8895 {
8896 _PyDebugAllocatorStats(out, "free PyUnicodeObject", numfree,
8897 sizeof(PyUnicodeObject));
8898 }
8899
8900 #ifdef __cplusplus
8901 }
8902 #endif