Location	Tool	Issue
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17	clang-analyzer	Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17	clang-analyzer	Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25	clang-analyzer	Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25	clang-analyzer	Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:2094:9	clang-analyzer	Value stored to 'nallocated' is never read
   1 /*
   2 
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6 
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9 
  10 Copyright (c) Corporation for National Research Initiatives.
  11 
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14 
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17 
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21 
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30 
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39 
  40 */
  41 
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44 
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47 
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51 
  52 /* Limit for the Unicode object free list */
  53 
  54 #define PyUnicode_MAXFREELIST       1024
  55 
  56 /* Limit for the Unicode object free list stay alive optimization.
  57 
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61 
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65 
  66    Setting the limit to 0 effectively turns the feature off.
  67 
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70 
  71 */
  72 
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74 
  75 /* Endianness switches; defaults to little endian */
  76 
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82 
  83 /* --- Globals ------------------------------------------------------------
  84 
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87 
  88 */
  89 
  90 
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94 
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98 
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101 
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105 
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108 
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111 
 112 */
 113 static char unicode_default_encoding[100];
 114 
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
         /* 128 entries (16 rows of 8), indexed by character code 0x00-0x7F.
            A 1 marks a code listed in the "case" notes that precede its row;
            every other entry is 0. */
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * CHARACTER TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * LINE TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135 
         /* Codes 0x40-0x7F: no whitespace characters in this half. */
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145 
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
         /* 128 entries (16 rows of 8), indexed by character code 0x00-0x7F.
            A 1 marks a code listed in the notes that precede its row.
            BLOOM_LINEBREAK() indexes this table directly for any ch < 128. */
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000B, * LINE TABULATION */
 151 /*         0x000C, * FORM FEED */
 152 /*         0x000D, * CARRIAGE RETURN */
 153     0, 0, 1, 1, 1, 1, 0, 0,
 154     0, 0, 0, 0, 0, 0, 0, 0,
 155 /*         0x001C, * FILE SEPARATOR */
 156 /*         0x001D, * GROUP SEPARATOR */
 157 /*         0x001E, * RECORD SEPARATOR */
 158     0, 0, 0, 0, 1, 1, 1, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161     0, 0, 0, 0, 0, 0, 0, 0,
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163 
         /* Codes 0x40-0x7F: no line-break characters in this half. */
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0,
 170     0, 0, 0, 0, 0, 0, 0, 0,
 171     0, 0, 0, 0, 0, 0, 0, 0
 172 };
 173 
 174 
 175 Py_UNICODE
 176 PyUnicode_GetMax(void)
 177 {
         /* Return a compile-time constant that depends on the build:
            0x10FFFF when Py_UNICODE_WIDE is defined, 0xFFFF otherwise. */
 178 #ifdef Py_UNICODE_WIDE
 179     return 0x10FFFF;
 180 #else
 181     /* This is actually an illegal character, so it should
 182        not be passed to unichr. */
 183     return 0xFFFF;
 184 #endif
 185 }
 186 
 187 /* --- Bloom Filters ----------------------------------------------------- */
 188 
 189 /* Helpers implementing simple "bloom filters" for Unicode characters.
 190    To keep things simple, we use a single bitmask, using the low bits of
 191    each Unicode character (its value modulo BLOOM_WIDTH) as the bit index. */
 192 
 193 /* the linebreak mask is set up by Unicode_Init below */
 194 
 195 #if LONG_BIT >= 128
     /* BLOOM_WIDTH is the number of mask bits used: the largest of 128, 64
        and 32 that does not exceed LONG_BIT.  Builds where LONG_BIT is below
        32 are rejected at compile time. */
 196 #define BLOOM_WIDTH 128
 197 #elif LONG_BIT >= 64
 198 #define BLOOM_WIDTH 64
 199 #elif LONG_BIT >= 32
 200 #define BLOOM_WIDTH 32
 201 #else
 202 #error "LONG_BIT is smaller than 32"
 203 #endif
 204 
     /* Storage type of a bloom mask. */
 205 #define BLOOM_MASK unsigned long
 206 
     /* Mask consulted by BLOOM_LINEBREAK() for characters >= 128. */
 207 static BLOOM_MASK bloom_linebreak;
 208 
     /* BLOOM_ADD sets, and BLOOM tests, the bit numbered
        (ch & (BLOOM_WIDTH - 1)) in `mask`.  Different characters can share a
        bit, so a non-zero BLOOM() result is only a "maybe"; BLOOM_LINEBREAK
        below follows it with an exact Py_UNICODE_ISLINEBREAK() check.
        NOTE(review): `mask` is not parenthesized in either expansion, and
        BLOOM_ADD assigns to it, so pass a plain lvalue. */
 209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 211 
     /* Line-break test: characters below 128 are looked up in
        ascii_linebreak[]; all others go through the bloom mask first and
        then the exact check.  `ch` is evaluated more than once. */
 212 #define BLOOM_LINEBREAK(ch)                                             \
 213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 215 
 216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 217 {
 218     /* calculate simple bloom-style bitmask for a given unicode string */
         /* ptr: first of `len` characters to fold into the mask.
            Returns the OR of one bit per character (see BLOOM_ADD);
            returns 0 when len <= 0 because the loop body never runs. */
 219 
 220     BLOOM_MASK mask;
 221     Py_ssize_t i;
 222 
 223     mask = 0;
 224     for (i = 0; i < len; i++)
 225         BLOOM_ADD(mask, ptr[i]);
 226 
 227     return mask;
 228 }
 229 
 230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 231 {
         /* Linear search: return 1 if `chr` equals any of the first `setlen`
            elements of `set`, 0 otherwise (including when setlen <= 0). */
 232     Py_ssize_t i;
 233 
 234     for (i = 0; i < setlen; i++)
 235         if (set[i] == chr)
 236             return 1;
 237 
 238     return 0;
 239 }
 240 
 241 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 242     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
     /* Membership test: the cheap BLOOM() bit test runs first, and the exact
        unicode_member() scan only runs when that bit is set.
        NOTE(review): the expansion is a bare `a && b` with no enclosing
        parentheses, so using it next to other operators (for example
        `!BLOOM_MEMBER(...)` or `x || BLOOM_MEMBER(...)`) will not group as a
        single test -- confirm every use site wraps it appropriately. */
 243 
 244 /* --- Unicode Object ----------------------------------------------------- */
 245 
 246 static
 247 int unicode_resize(register PyUnicodeObject *unicode,
 248                    Py_ssize_t length)
 249 {
         /* Resize the character buffer of `unicode` in place so it holds
            `length` characters plus a terminating 0, then drop the cached
            default-encoded string and reset the cached hash.

            Returns 0 on success.  Returns -1 with an exception set when the
            object is one of the shared singletons (SystemError) or when the
            reallocation fails (PyErr_NoMemory()); on reallocation failure the
            object keeps its old buffer and length. */
 250     void *oldstr;
 251 
 252     /* Shortcut if there's nothing much to do. */
         /* The caches are still reset on this path. */
 253     if (unicode->length == length)
 254         goto reset;
 255 
 256     /* Resizing shared object (unicode_empty or single character
 257        objects) in-place is not allowed. Use PyUnicode_Resize()
 258        instead ! */
 259 
         /* A one-character object counts as shared only if it is the very
            object stored in the unicode_latin1[] cache slot for its character. */
 260     if (unicode == unicode_empty ||
 261         (unicode->length == 1 &&
 262          unicode->str[0] < 256U &&
 263          unicode_latin1[unicode->str[0]] == unicode)) {
 264         PyErr_SetString(PyExc_SystemError,
 265                         "can't resize shared unicode objects");
 266         return -1;
 267     }
 268 
 269     /* We allocate one more character to make sure the string is Ux0000 terminated.
 270        The overallocation is also used by fastsearch, which assumes that it's
 271        safe to look at str[length] (without making any assumptions about what
 272        it contains). */
         /* NOTE(review): sizeof(Py_UNICODE) * (length + 1) is not checked for
            overflow in this function -- confirm that every caller bounds
            `length` before getting here. */
 273 
         /* Keep the old pointer so it can be restored if the realloc fails. */
 274     oldstr = unicode->str;
 275     unicode->str = PyObject_REALLOC(unicode->str,
 276                                     sizeof(Py_UNICODE) * (length + 1));
 277     if (!unicode->str) {
 278         unicode->str = (Py_UNICODE *)oldstr;
 279         PyErr_NoMemory();
 280         return -1;
 281     }
 282     unicode->str[length] = 0;
 283     unicode->length = length;
 284 
 285   reset:
 286     /* Reset the object caches */
 287     if (unicode->defenc) {
 288         Py_CLEAR(unicode->defenc);
 289     }
         /* -1 is the value _PyUnicode_New() gives a fresh object's hash field. */
 290     unicode->hash = -1;
 291 
 292     return 0;
 293 }
 294 
 295 /* We allocate one more element to make sure the string is
 296    Ux0000 terminated; some code relies on that.
 297 
 298    XXX This allocator could further be enhanced by assuring that the
 299    free list never reduces its size below 1.
 300 
 301 */
 302 
 303 static
 304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 305 {
         /* Create a unicode object with room for `length` characters plus a
            terminating 0.  Only str[0] and str[length] are initialized; the
            caller fills in the rest.

            Returns the object, or NULL after PyErr_NoMemory() when the size
            would overflow or the buffer cannot be allocated.  For length 0
            the shared unicode_empty object is returned (with its reference
            count incremented) once it exists. */
 306     register PyUnicodeObject *unicode;
 307 
 308     /* Optimization for empty strings */
 309     if (length == 0 && unicode_empty != NULL) {
 310         Py_INCREF(unicode_empty);
 311         return unicode_empty;
 312     }
 313 
 314     /* Ensure we won't overflow the size. */
         /* Guards the sizeof(Py_UNICODE) * (length + 1) computations below. */
 315     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 316         return (PyUnicodeObject *)PyErr_NoMemory();
 317     }
 318 
 319     /* Unicode freelist & memory allocation */
 320     if (free_list) {
             /* Pop the head of the free list: the first pointer-sized slot of
                a parked object holds the link to the next parked object. */
 321         unicode = free_list;
 322         free_list = *(PyUnicodeObject **)unicode;
 323         numfree--;
 324         if (unicode->str) {
 325             /* Keep-Alive optimization: we only upsize the buffer,
 326                never downsize it. */
                 /* If growing fails, free the buffer and NULL the pointer so
                    the common !unicode->str check below takes the error path. */
 327             if ((unicode->length < length) &&
 328                 unicode_resize(unicode, length) < 0) {
 329                 PyObject_DEL(unicode->str);
 330                 unicode->str = NULL;
 331             }
 332         }
 333         else {
                 /* The parked object had no buffer: allocate a fresh one. */
 334             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 335             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 336         }
 337         PyObject_INIT(unicode, &PyUnicode_Type);
 338     }
 339     else {
             /* Free list is empty: allocate both the object and its buffer. */
 340         size_t new_size;
 341         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 342         if (unicode == NULL)
 343             return NULL;
 344         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 345         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 346     }
 347 
         /* Shared failure check for all three allocation paths above. */
 348     if (!unicode->str) {
 349         PyErr_NoMemory();
 350         goto onError;
 351     }
 352     /* Initialize the first element to guard against cases where
 353      * the caller fails before initializing str -- unicode_resize()
 354      * reads str[0], and the Keep-Alive optimization can keep memory
 355      * allocated for str alive across a call to unicode_dealloc(unicode).
 356      * We don't want unicode_resize to read uninitialized memory in
 357      * that case.
 358      */
 359     unicode->str[0] = 0;
 360     unicode->str[length] = 0;
 361     unicode->length = length;
 362     unicode->hash = -1;
 363     unicode->defenc = NULL;
 364     return unicode;
 365 
 366   onError:
 367     /* XXX UNREF/NEWREF interface should be more symmetrical */
         /* NOTE(review): presumably these two calls undo the reference
            bookkeeping done by PyObject_New/PyObject_INIT above -- confirm
            against their definitions.  The object itself is then released. */
 368     _Py_DEC_REFTOTAL;
 369     _Py_ForgetReference((PyObject *)unicode);
 370     PyObject_Del(unicode);
 371     return NULL;
 372 }
 373 
 374 static
 375 void unicode_dealloc(register PyUnicodeObject *unicode)
 376 {
         /* Dispose of `unicode`.  Objects that pass PyUnicode_CheckExact()
            are parked on the free list while it holds fewer than
            PyUnicode_MAXFREELIST entries; everything else is released
            completely. */
 377     if (PyUnicode_CheckExact(unicode) &&
 378         numfree < PyUnicode_MAXFREELIST) {
 379         /* Keep-Alive optimization */
             /* Buffers of KEEPALIVE_SIZE_LIMIT characters or more are freed;
                shorter buffers stay attached to the parked object so
                _PyUnicode_New() can reuse them. */
 380         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 381             PyObject_DEL(unicode->str);
 382             unicode->str = NULL;
 383             unicode->length = 0;
 384         }
 385         if (unicode->defenc) {
 386             Py_CLEAR(unicode->defenc);
 387         }
 388         /* Add to free list */
             /* The link to the previous head is written over the first
                pointer-sized slot of the object. */
 389         *(PyUnicodeObject **)unicode = free_list;
 390         free_list = unicode;
 391         numfree++;
 392     }
 393     else {
             /* Not parked: free the buffer, drop the cached encoded string,
                and hand the object memory to the type's tp_free. */
 394         PyObject_DEL(unicode->str);
 395         Py_XDECREF(unicode->defenc);
 396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 397     }
 398 }
 399 
 400 static
 401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 402 {
         /* Resize *unicode to `length` characters.  The object is either
            resized in place or replaced: on the replacement path *unicode is
            overwritten with a new object and the old one is DECREF'ed.

            Returns 0 on success and -1 with an exception set on failure.
            PyErr_BadInternalCall() is raised when `unicode` or *unicode is
            NULL, *unicode is not a unicode object, its reference count is not
            exactly 1, or `length` is negative. */
 403     register PyUnicodeObject *v;
 404 
 405     /* Argument checks */
 406     if (unicode == NULL) {
 407         PyErr_BadInternalCall();
 408         return -1;
 409     }
 410     v = *unicode;
 411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 412         PyErr_BadInternalCall();
 413         return -1;
 414     }
 415 
 416     /* Resizing unicode_empty and single character objects is not
 417        possible since these are being shared. We simply return a fresh
 418        copy with the same Unicode content. */
         /* Every length-1 object takes this path, whether or not it is
            actually one of the cached single-character objects. */
 419     if (v->length != length &&
 420         (v == unicode_empty || v->length == 1)) {
 421         PyUnicodeObject *w = _PyUnicode_New(length);
 422         if (w == NULL)
 423             return -1;
             /* Copy min(length, v->length) characters into the new object. */
 424         Py_UNICODE_COPY(w->str, v->str,
 425                         length < v->length ? length : v->length);
 426         Py_DECREF(*unicode);
 427         *unicode = w;
 428         return 0;
 429     }
 430 
 431     /* Note that we don't have to modify *unicode for unshared Unicode
 432        objects, since we can modify them in-place. */
 433     return unicode_resize(v, length);
 434 }
 435 
 436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 437 {
         /* Public entry point: casts the pointer type and forwards to
            _PyUnicode_Resize(), returning its result unchanged. */
 438     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 439 }
 440 
 441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 442                                 Py_ssize_t size)
 443 {
         /* Build a unicode object of `size` characters.

            u != NULL: the characters are copied from `u`.  An empty string
            or a single character below 256 is served from the shared
            unicode_empty / unicode_latin1[] objects instead of a new one.
            u == NULL: an object of `size` characters is returned without any
            data being copied into it.

            Returns NULL when _PyUnicode_New() fails. */
 444     PyUnicodeObject *unicode;
 445 
 446     /* If the Unicode data is known at construction time, we can apply
 447        some optimizations which share commonly used objects. */
 448     if (u != NULL) {
 449 
 450         /* Optimization for empty strings */
 451         if (size == 0 && unicode_empty != NULL) {
 452             Py_INCREF(unicode_empty);
 453             return (PyObject *)unicode_empty;
 454         }
 455 
 456         /* Single character Unicode objects in the Latin-1 range are
 457            shared when using this constructor */
 458         if (size == 1 && *u < 256) {
 459             unicode = unicode_latin1[*u];
 460             if (!unicode) {
                     /* First request for this character: create the object
                        and store it in the cache slot. */
 461                 unicode = _PyUnicode_New(1);
 462                 if (!unicode)
 463                     return NULL;
 464                 unicode->str[0] = *u;
 465                 unicode_latin1[*u] = unicode;
 466             }
                 /* The cache slot keeps its own pointer to the object; this
                    INCREF accounts for the pointer returned to the caller. */
 467             Py_INCREF(unicode);
 468             return (PyObject *)unicode;
 469         }
 470     }
 471 
 472     unicode = _PyUnicode_New(size);
 473     if (!unicode)
 474         return NULL;
 475 
 476     /* Copy the Unicode data into the new object */
 477     if (u != NULL)
 478         Py_UNICODE_COPY(unicode->str, u, size);
 479 
 480     return (PyObject *)unicode;
 481 }
 482 
 483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 484 {
         /* Build a unicode object from `size` bytes at `u`.

            u != NULL: an empty input or a single byte below 128 is served
            from the shared unicode_empty / unicode_latin1[] objects; any
            other input is passed to PyUnicode_DecodeUTF8(u, size, NULL).
            u == NULL: an object of `size` characters is returned without any
            data being copied into it.

            A negative `size` raises SystemError and returns NULL. */
 485     PyUnicodeObject *unicode;
 486 
 487     if (size < 0) {
 488         PyErr_SetString(PyExc_SystemError,
 489                         "Negative size passed to PyUnicode_FromStringAndSize");
 490         return NULL;
 491     }
 492 
 493     /* If the Unicode data is known at construction time, we can apply
 494        some optimizations which share commonly used objects.
 495        Also, this means the input must be UTF-8, so fall back to the
 496        UTF-8 decoder at the end. */
 497     if (u != NULL) {
 498 
 499         /* Optimization for empty strings */
 500         if (size == 0 && unicode_empty != NULL) {
 501             Py_INCREF(unicode_empty);
 502             return (PyObject *)unicode_empty;
 503         }
 504 
 505         /* Single characters are shared when using this constructor.
 506            Restrict to ASCII, since the input must be UTF-8. */
 507         if (size == 1 && Py_CHARMASK(*u) < 128) {
 508             unicode = unicode_latin1[Py_CHARMASK(*u)];
 509             if (!unicode) {
                     /* First request for this character: create the object
                        and store it in the cache slot. */
 510                 unicode = _PyUnicode_New(1);
 511                 if (!unicode)
 512                     return NULL;
 513                 unicode->str[0] = Py_CHARMASK(*u);
 514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 515             }
                 /* The cache slot keeps its own pointer to the object; this
                    INCREF accounts for the pointer returned to the caller. */
 516             Py_INCREF(unicode);
 517             return (PyObject *)unicode;
 518         }
 519 
 520         return PyUnicode_DecodeUTF8(u, size, NULL);
 521     }
 522 
         /* u == NULL: hand back a buffer of the requested size. */
 523     unicode = _PyUnicode_New(size);
 524     if (!unicode)
 525         return NULL;
 526 
 527     return (PyObject *)unicode;
 528 }
 529 
 530 PyObject *PyUnicode_FromString(const char *u)
 531 {
         /* Build a unicode object from the NUL-terminated string `u` by
            measuring it with strlen() and forwarding to
            PyUnicode_FromStringAndSize().  Raises OverflowError and returns
            NULL if the length exceeds PY_SSIZE_T_MAX.
            NOTE(review): strlen(u) runs unconditionally, so `u` must not be
            NULL; there is no check for it here. */
 532     size_t size = strlen(u);
 533     if (size > PY_SSIZE_T_MAX) {
 534         PyErr_SetString(PyExc_OverflowError, "input too long");
 535         return NULL;
 536     }
 537 
 538     return PyUnicode_FromStringAndSize(u, size);
 539 }
 540 
 541 #ifdef HAVE_WCHAR_H
 542 
 543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 544 # define CONVERT_WCHAR_TO_SURROGATES
 545 #endif
 546 
 547 #ifdef CONVERT_WCHAR_TO_SURROGATES
 548 
 549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 550    to convert from UTF32 to UTF16. */
 551 
 552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 553                                  Py_ssize_t size)
 554 {
         /* Variant compiled when CONVERT_WCHAR_TO_SURROGATES is defined.
            Builds a unicode object from `size` wchar_t values at `w`; each
            value above 0xFFFF is written as two Py_UNICODE units (a
            surrogate pair), so the result can be longer than `size`.
            Returns NULL with PyErr_BadInternalCall() if `w` is NULL, or NULL
            if the allocation fails. */
 555     PyUnicodeObject *unicode;
 556     register Py_ssize_t i;
 557     Py_ssize_t alloc;
 558     const wchar_t *orig_w;
 559 
 560     if (w == NULL) {
 561         PyErr_BadInternalCall();
 562         return NULL;
 563     }
 564 
         /* Pass 1: count one extra output unit for every input value that
            needs a surrogate pair, then rewind `w`. */
 565     alloc = size;
 566     orig_w = w;
 567     for (i = size; i > 0; i--) {
 568         if (*w > 0xFFFF)
 569             alloc++;
 570         w++;
 571     }
 572     w = orig_w;
 573     unicode = _PyUnicode_New(alloc);
 574     if (!unicode)
 575         return NULL;
 576 
 577     /* Copy the wchar_t data into the new object */
 578     {
 579         register Py_UNICODE *u;
 580         u = PyUnicode_AS_UNICODE(unicode);
 581         for (i = size; i > 0; i--) {
 582             if (*w > 0xFFFF) {
                     /* Subtract 0x10000, then emit the upper bits (above the
                        low 10) in a 0xD800-based unit followed by the low
                        10 bits in a 0xDC00-based unit. */
 583                 wchar_t ordinal = *w++;
 584                 ordinal -= 0x10000;
 585                 *u++ = 0xD800 | (ordinal >> 10);
 586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 587             }
 588             else
 589                 *u++ = *w++;
 590         }
 591     }
 592     return (PyObject *)unicode;
 593 }
 594 
 595 #else
 596 
 597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 598                                  Py_ssize_t size)
 599 {
         /* Variant compiled when CONVERT_WCHAR_TO_SURROGATES is not defined.
            Builds a unicode object of exactly `size` characters from the
            wchar_t values at `w`, one output character per input value.
            Returns NULL with PyErr_BadInternalCall() if `w` is NULL, or NULL
            if the allocation fails. */
 600     PyUnicodeObject *unicode;
 601 
 602     if (w == NULL) {
 603         PyErr_BadInternalCall();
 604         return NULL;
 605     }
 606 
 607     unicode = _PyUnicode_New(size);
 608     if (!unicode)
 609         return NULL;
 610 
 611     /* Copy the wchar_t data into the new object */
 612 #ifdef HAVE_USABLE_WCHAR_T
         /* Bulk copy of size * sizeof(wchar_t) bytes.
            NOTE(review): presumably HAVE_USABLE_WCHAR_T implies wchar_t and
            Py_UNICODE have the same size and layout -- confirm. */
 613     memcpy(unicode->str, w, size * sizeof(wchar_t));
 614 #else
 615     {
             /* Element-by-element copy; each assignment converts a wchar_t
                value to Py_UNICODE. */
 616         register Py_UNICODE *u;
 617         register Py_ssize_t i;
 618         u = PyUnicode_AS_UNICODE(unicode);
 619         for (i = size; i > 0; i--)
 620             *u++ = *w++;
 621     }
 622 #endif
 623 
 624     return (PyObject *)unicode;
 625 }
 626 
 627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 628 
 629 #undef CONVERT_WCHAR_TO_SURROGATES
 630 
 631 static void
 632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 633 {
         /* Write a NUL-terminated printf-style conversion into `fmt`:
            '%', then an optional '0' and the width, an optional ".precision",
            an optional length modifier, and finally the conversion
            character `c`.

            A width of 0 emits neither the width nor the '0' flag, and a
            precision of 0 emits no ".precision".  `longflag` takes priority
            over `size_tflag`.
            NOTE(review): nothing here bounds the writes, so `fmt` must be
            large enough for the longest result -- confirm at each caller. */
 634     *fmt++ = '%';
 635     if (width) {
 636         if (zeropad)
 637             *fmt++ = '0';
 638         fmt += sprintf(fmt, "%d", width);
 639     }
 640     if (precision)
 641         fmt += sprintf(fmt, ".%d", precision);
 642     if (longflag)
 643         *fmt++ = 'l';
 644     else if (size_tflag) {
             /* Append the PY_FORMAT_SIZE_T string character by character. */
 645         char *f = PY_FORMAT_SIZE_T;
 646         while (*f)
 647             *fmt++ = *f++;
 648     }
 649     *fmt++ = c;
 650     *fmt = '\0';
 651 }
 652 
 653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
     /* Copies the NUL-terminated `string` to the output one element at a
        time, advancing `s` past what was written; the terminator itself is
        not copied.  The macro uses the names `copy` and `s` without
        declaring them, so both must be in scope where it is expanded. */
 654 
 655 PyObject *
 656 PyUnicode_FromFormatV(const char *format, va_list vargs)
 657 {
 658     va_list count;
 659     Py_ssize_t callcount = 0;
 660     PyObject **callresults = NULL;
 661     PyObject **callresult = NULL;
 662     Py_ssize_t n = 0;
 663     int width = 0;
 664     int precision = 0;
 665     int zeropad;
 666     const char* f;
 667     Py_UNICODE *s;
 668     PyObject *string;
 669     /* used by sprintf */
 670     char buffer[21];
 671     /* use abuffer instead of buffer, if we need more space
 672      * (which can happen if there's a format specifier with width). */
 673     char *abuffer = NULL;
 674     char *realbuffer;
 675     Py_ssize_t abuffersize = 0;
 676     char fmt[60]; /* should be enough for %0width.precisionld */
 677     const char *copy;
 678 
 679 #ifdef VA_LIST_IS_ARRAY
 680     Py_MEMCPY(count, vargs, sizeof(va_list));
 681 #else
 682 #ifdef  __va_copy
 683     __va_copy(count, vargs);
 684 #else
 685     count = vargs;
 686 #endif
 687 #endif
 688      /* step 1: count the number of %S/%R/%s format specifications
 689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 690       * objects once during step 3 and put the result in an array) */
 691     for (f = format; *f; f++) {
 692          if (*f == '%') {
 693              if (*(f+1)=='%')
 694                  continue;
 695              if (*(f+1)=='S' || *(f+1)=='R')
 696                  ++callcount;
 697              while (isdigit((unsigned)*f))
 698                  width = (width*10) + *f++ - '0';
 699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 700                  ;
 701              if (*f == 's')
 702                  ++callcount;
 703          }
 704     }
 705     /* step 2: allocate memory for the results of
 706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 707     if (callcount) {
 708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 709         if (!callresults) {
 710             PyErr_NoMemory();
 711             return NULL;
 712         }
 713         callresult = callresults;
 714     }
 715     /* step 3: figure out how large a buffer we need */
 716     for (f = format; *f; f++) {
 717         if (*f == '%') {
 718             const char* p = f;
 719             width = 0;
 720             while (isdigit((unsigned)*f))
 721                 width = (width*10) + *f++ - '0';
 722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 723                 ;
 724 
 725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 726              * they don't affect the amount of space we reserve.
 727              */
 728             if ((*f == 'l' || *f == 'z') &&
 729                 (f[1] == 'd' || f[1] == 'u'))
 730                 ++f;
 731 
 732             switch (*f) {
 733             case 'c':
 734                 (void)va_arg(count, int);
 735                 /* fall through... */
 736             case '%':
 737                 n++;
 738                 break;
 739             case 'd': case 'u': case 'i': case 'x':
 740                 (void) va_arg(count, int);
 741                 /* 20 bytes is enough to hold a 64-bit
 742                    integer.  Decimal takes the most space.
 743                    This isn't enough for octal.
 744                    If a width is specified we need more
 745                    (which we allocate later). */
 746                 if (width < 20)
 747                     width = 20;
 748                 n += width;
 749                 if (abuffersize < width)
 750                     abuffersize = width;
 751                 break;
 752             case 's':
 753             {
 754                 /* UTF-8 */
 755                 const char *s = va_arg(count, const char*);
 756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 757                 if (!str)
 758                     goto fail;
 759                 n += PyUnicode_GET_SIZE(str);
 760                 /* Remember the str and switch to the next slot */
 761                 *callresult++ = str;
 762                 break;
 763             }
 764             case 'U':
 765             {
 766                 PyObject *obj = va_arg(count, PyObject *);
 767                 assert(obj && PyUnicode_Check(obj));
 768                 n += PyUnicode_GET_SIZE(obj);
 769                 break;
 770             }
 771             case 'V':
 772             {
 773                 PyObject *obj = va_arg(count, PyObject *);
 774                 const char *str = va_arg(count, const char *);
 775                 assert(obj || str);
 776                 assert(!obj || PyUnicode_Check(obj));
 777                 if (obj)
 778                     n += PyUnicode_GET_SIZE(obj);
 779                 else
 780                     n += strlen(str);
 781                 break;
 782             }
 783             case 'S':
 784             {
 785                 PyObject *obj = va_arg(count, PyObject *);
 786                 PyObject *str;
 787                 assert(obj);
 788                 str = PyObject_Str(obj);
 789                 if (!str)
 790                     goto fail;
 791                 n += PyUnicode_GET_SIZE(str);
 792                 /* Remember the str and switch to the next slot */
 793                 *callresult++ = str;
 794                 break;
 795             }
 796             case 'R':
 797             {
 798                 PyObject *obj = va_arg(count, PyObject *);
 799                 PyObject *repr;
 800                 assert(obj);
 801                 repr = PyObject_Repr(obj);
 802                 if (!repr)
 803                     goto fail;
 804                 n += PyUnicode_GET_SIZE(repr);
 805                 /* Remember the repr and switch to the next slot */
 806                 *callresult++ = repr;
 807                 break;
 808             }
 809             case 'p':
 810                 (void) va_arg(count, int);
 811                 /* maximum 64-bit pointer representation:
 812                  * 0xffffffffffffffff
 813                  * so 19 characters is enough.
 814                  * XXX I count 18 -- what's the extra for?
 815                  */
 816                 n += 19;
 817                 break;
 818             default:
 819                 /* if we stumble upon an unknown
 820                    formatting code, copy the rest of
 821                    the format string to the output
 822                    string. (we cannot just skip the
 823                    code, since there's no way to know
 824                    what's in the argument list) */
 825                 n += strlen(p);
 826                 goto expand;
 827             }
 828         } else
 829             n++;
 830     }
 831   expand:
 832     if (abuffersize > 20) {
 833         abuffer = PyObject_Malloc(abuffersize);
 834         if (!abuffer) {
 835             PyErr_NoMemory();
 836             goto fail;
 837         }
 838         realbuffer = abuffer;
 839     }
 840     else
 841         realbuffer = buffer;
 842     /* step 4: fill the buffer */
 843     /* Since we've analyzed how much space we need for the worst case,
 844        we don't have to resize the string.
 845        There can be no errors beyond this point. */
 846     string = PyUnicode_FromUnicode(NULL, n);
 847     if (!string)
 848         goto fail;
 849 
 850     s = PyUnicode_AS_UNICODE(string);
 851     callresult = callresults;
 852 
 853     for (f = format; *f; f++) {
 854         if (*f == '%') {
 855             const char* p = f++;
 856             int longflag = 0;
 857             int size_tflag = 0;
 858             zeropad = (*f == '0');
 859             /* parse the width.precision part */
 860             width = 0;
 861             while (isdigit((unsigned)*f))
 862                 width = (width*10) + *f++ - '0';
 863             precision = 0;
 864             if (*f == '.') {
 865                 f++;
 866                 while (isdigit((unsigned)*f))
 867                     precision = (precision*10) + *f++ - '0';
 868             }
 869             /* handle the long flag, but only for %ld and %lu.
 870                others can be added when necessary. */
 871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 872                 longflag = 1;
 873                 ++f;
 874             }
 875             /* handle the size_t flag. */
 876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 877                 size_tflag = 1;
 878                 ++f;
 879             }
 880 
 881             switch (*f) {
 882             case 'c':
 883                 *s++ = va_arg(vargs, int);
 884                 break;
 885             case 'd':
 886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 887                 if (longflag)
 888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 889                 else if (size_tflag)
 890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 891                 else
 892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 893                 appendstring(realbuffer);
 894                 break;
 895             case 'u':
 896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 897                 if (longflag)
 898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 899                 else if (size_tflag)
 900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 901                 else
 902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 903                 appendstring(realbuffer);
 904                 break;
 905             case 'i':
 906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 908                 appendstring(realbuffer);
 909                 break;
 910             case 'x':
 911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 913                 appendstring(realbuffer);
 914                 break;
 915             case 's':
 916             {
 917                 /* unused, since we already have the result */
 918                 (void) va_arg(vargs, char *);
 919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
   Dereference of null pointer (loaded from variable 'callresult')
   (emitted by clang-analyzer)
TODO: a detailed trace is available in the data model (not yet rendered in this report)
   Dereference of null pointer (loaded from variable 'callresult')
   (emitted by clang-analyzer)
TODO: a detailed trace is available in the data model (not yet rendered in this report)
 920                                 PyUnicode_GET_SIZE(*callresult));
 921                 s += PyUnicode_GET_SIZE(*callresult);
 922                 /* We're done with the unicode()/repr() => forget it */
 923                 Py_DECREF(*callresult);
 924                 /* switch to next unicode()/repr() result */
 925                 ++callresult;
 926                 break;
 927             }
 928             case 'U':
 929             {
 930                 PyObject *obj = va_arg(vargs, PyObject *);
 931                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 932                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 933                 s += size;
 934                 break;
 935             }
 936             case 'V':
 937             {
 938                 PyObject *obj = va_arg(vargs, PyObject *);
 939                 const char *str = va_arg(vargs, const char *);
 940                 if (obj) {
 941                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 942                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 943                     s += size;
 944                 } else {
 945                     appendstring(str);
 946                 }
 947                 break;
 948             }
 949             case 'S':
 950             case 'R':
 951             {
 952                 Py_UNICODE *ucopy;
 953                 Py_ssize_t usize;
 954                 Py_ssize_t upos;
 955                 /* unused, since we already have the result */
 956                 (void) va_arg(vargs, PyObject *);
 957                 ucopy = PyUnicode_AS_UNICODE(*callresult);
   Dereference of null pointer (loaded from variable 'callresult')
   (emitted by clang-analyzer)
TODO: a detailed trace is available in the data model (not yet rendered in this report)
   Dereference of null pointer (loaded from variable 'callresult')
   (emitted by clang-analyzer)
TODO: a detailed trace is available in the data model (not yet rendered in this report)
 958                 usize = PyUnicode_GET_SIZE(*callresult);
 959                 for (upos = 0; upos<usize;)
 960                     *s++ = ucopy[upos++];
 961                 /* We're done with the unicode()/repr() => forget it */
 962                 Py_DECREF(*callresult);
 963                 /* switch to next unicode()/repr() result */
 964                 ++callresult;
 965                 break;
 966             }
 967             case 'p':
 968                 sprintf(buffer, "%p", va_arg(vargs, void*));
 969                 /* %p is ill-defined:  ensure leading 0x. */
 970                 if (buffer[1] == 'X')
 971                     buffer[1] = 'x';
 972                 else if (buffer[1] != 'x') {
 973                     memmove(buffer+2, buffer, strlen(buffer)+1);
 974                     buffer[0] = '0';
 975                     buffer[1] = 'x';
 976                 }
 977                 appendstring(buffer);
 978                 break;
 979             case '%':
 980                 *s++ = '%';
 981                 break;
 982             default:
 983                 appendstring(p);
 984                 goto end;
 985             }
 986         } else
 987             *s++ = *f;
 988     }
 989 
 990   end:
 991     if (callresults)
 992         PyObject_Free(callresults);
 993     if (abuffer)
 994         PyObject_Free(abuffer);
 995     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 996     return string;
 997   fail:
 998     if (callresults) {
 999         PyObject **callresult2 = callresults;
1000         while (callresult2 < callresult) {
1001             Py_DECREF(*callresult2);
1002             ++callresult2;
1003         }
1004         PyObject_Free(callresults);
1005     }
1006     if (abuffer)
1007         PyObject_Free(abuffer);
1008     return NULL;
1009 }
1010 
1011 #undef appendstring
1012 
     /* Build a unicode object from a printf-style format plus variadic
        arguments.  This is only the variadic front end: it packs the
        arguments that follow `format` into a va_list, forwards them to
        PyUnicode_FromFormatV(), and returns that function's result
        unchanged. */
1013 PyObject *
1014 PyUnicode_FromFormat(const char *format, ...)
1015 {
1016     PyObject* ret;
1017     va_list vargs;
1018 
         /* va_start names the last fixed parameter only when
            HAVE_STDARG_PROTOTYPES is defined; otherwise the one-argument
            form is used. */
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020     va_start(vargs, format);
1021 #else
1022     va_start(vargs);
1023 #endif
1024     ret = PyUnicode_FromFormatV(format, vargs);
         /* va_end always runs: there is no early return between va_start
            and this point. */
1025     va_end(vargs);
1026     return ret;
1027 }
1028 
     /* Copy the contents of `unicode` into the caller-supplied wchar_t
        buffer `w`, writing at most `size` elements.

        Returns the number of elements copied, except that when `size`
        exceeds the string length one extra trailing element is copied and
        only the string length is returned.  Returns -1 after
        PyErr_BadInternalCall() when `unicode` is NULL.

        NOTE(review): neither `w` nor a negative `size` is checked here; in
        the memcpy branch a negative `size` would be converted to a very
        large byte count -- confirm that callers guarantee size >= 0. */
1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030                                 wchar_t *w,
1031                                 Py_ssize_t size)
1032 {
1033     if (unicode == NULL) {
1034         PyErr_BadInternalCall();
1035         return -1;
1036     }
1037 
1038     /* If possible, try to copy the 0-termination as well */
         /* `size` is clamped to length + 1, so at most one element past the
            last character is read from the source buffer. */
1039     if (size > PyUnicode_GET_SIZE(unicode))
1040         size = PyUnicode_GET_SIZE(unicode) + 1;
1041 
1042 #ifdef HAVE_USABLE_WCHAR_T
         /* Bulk copy of `size` elements, sized with sizeof(wchar_t). */
1043     memcpy(w, unicode->str, size * sizeof(wchar_t));
1044 #else
         /* Element-wise copy: each Py_UNICODE value is converted to wchar_t
            by plain assignment. */
1045     {
1046         register Py_UNICODE *u;
1047         register Py_ssize_t i;
1048         u = PyUnicode_AS_UNICODE(unicode);
1049         for (i = size; i > 0; i--)
1050             *w++ = *u++;
1051     }
1052 #endif
1053 
         /* `size` can only exceed the length here if it was set to
            length + 1 above; in that case report just the length. */
1054     if (size > PyUnicode_GET_SIZE(unicode))
1055         return PyUnicode_GET_SIZE(unicode);
1056     else
1057         return size;
1058 }
1059 
1060 #endif
1061 
     /* Create a one-element unicode object holding `ordinal`.

        The accepted range depends on the build: 0..0x10ffff when
        Py_UNICODE_WIDE is defined, 0..0xffff otherwise.  An out-of-range
        value sets ValueError (with a message naming the build flavour) and
        returns NULL.  Otherwise the result of PyUnicode_FromUnicode() on a
        single-element array is returned. */
1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 {
1064     Py_UNICODE s[1];
1065 
1066 #ifdef Py_UNICODE_WIDE
1067     if (ordinal < 0 || ordinal > 0x10ffff) {
1068         PyErr_SetString(PyExc_ValueError,
1069                         "unichr() arg not in range(0x110000) "
1070                         "(wide Python build)");
1071         return NULL;
1072     }
1073 #else
1074     if (ordinal < 0 || ordinal > 0xffff) {
1075         PyErr_SetString(PyExc_ValueError,
1076                         "unichr() arg not in range(0x10000) "
1077                         "(narrow Python build)");
1078         return NULL;
1079     }
1080 #endif
1081 
         /* The range check above ran first, so the cast is applied only to
            values inside the accepted range. */
1082     s[0] = (Py_UNICODE)ordinal;
1083     return PyUnicode_FromUnicode(s, 1);
1084 }
1085 
     /* Return a unicode object for `obj`, choosing one of three paths:

          - exact unicode instance: the same object is returned after
            Py_INCREF;
          - instance of a unicode subtype: a fresh object is built with
            PyUnicode_FromUnicode() from the subtype's buffer and size;
          - anything else: delegated to PyUnicode_FromEncodedObject() with
            a NULL encoding and "strict" error handling.

        `obj` is passed to the type-check macros without a NULL test in
        this function. */
1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 {
1088     /* XXX Perhaps we should make this API an alias of
1089        PyObject_Unicode() instead ?! */
1090     if (PyUnicode_CheckExact(obj)) {
1091         Py_INCREF(obj);
1092         return obj;
1093     }
1094     if (PyUnicode_Check(obj)) {
1095         /* For a Unicode subtype that's not a Unicode object,
1096            return a true Unicode object with the same data. */
1097         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098                                      PyUnicode_GET_SIZE(obj));
1099     }
1100     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101 }
1102 
     /* Decode the bytes exposed by `obj` into a unicode object using
        `encoding` and `errors`.

        Input handling, in order:
          - NULL `obj`: PyErr_BadInternalCall(), returns NULL;
          - unicode objects: TypeError "decoding Unicode is not supported"
            (the more permissive variant is compiled out with `#if 0`);
          - str objects: their internal buffer and size are used directly;
          - bytearray objects: rejected with TypeError;
          - anything else: PyObject_AsCharBuffer() supplies pointer and
            length; if that fails with TypeError, the message is replaced
            by one that names the object's type.

        A zero-length input returns the shared `unicode_empty` object (with
        its reference count incremented) without calling the decoder; all
        other inputs go through PyUnicode_Decode().  Returns NULL on
        error. */
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104                                       const char *encoding,
1105                                       const char *errors)
1106 {
1107     const char *s = NULL;
1108     Py_ssize_t len;
1109     PyObject *v;
1110 
1111     if (obj == NULL) {
1112         PyErr_BadInternalCall();
1113         return NULL;
1114     }
1115 
1116 #if 0
1117     /* For b/w compatibility we also accept Unicode objects provided
1118        that no encodings is given and then redirect to
1119        PyObject_Unicode() which then applies the additional logic for
1120        Unicode subclasses.
1121 
1122        NOTE: This API should really only be used for object which
1123        represent *encoded* Unicode !
1124 
1125     */
1126     if (PyUnicode_Check(obj)) {
1127         if (encoding) {
1128             PyErr_SetString(PyExc_TypeError,
1129                             "decoding Unicode is not supported");
1130             return NULL;
1131         }
1132         return PyObject_Unicode(obj);
1133     }
1134 #else
         /* Active branch: unicode input is always an error, whether or not
            an encoding was supplied. */
1135     if (PyUnicode_Check(obj)) {
1136         PyErr_SetString(PyExc_TypeError,
1137                         "decoding Unicode is not supported");
1138         return NULL;
1139     }
1140 #endif
1141 
1142     /* Coerce object */
1143     if (PyString_Check(obj)) {
1144         s = PyString_AS_STRING(obj);
1145         len = PyString_GET_SIZE(obj);
1146     }
1147     else if (PyByteArray_Check(obj)) {
1148         /* Python 2.x specific */
1149         PyErr_Format(PyExc_TypeError,
1150                      "decoding bytearray is not supported");
1151         return NULL;
1152     }
1153     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154         /* Overwrite the error message with something more useful in
1155            case of a TypeError. */
             /* Other exception types are left as raised. */
1156         if (PyErr_ExceptionMatches(PyExc_TypeError))
1157             PyErr_Format(PyExc_TypeError,
1158                          "coercing to Unicode: need string or buffer, "
1159                          "%.80s found",
1160                          Py_TYPE(obj)->tp_name);
1161         goto onError;
1162     }
1163 
1164     /* Convert to Unicode */
1165     if (len == 0) {
1166         Py_INCREF(unicode_empty);
1167         v = (PyObject *)unicode_empty;
1168     }
1169     else
1170         v = PyUnicode_Decode(s, len, encoding, errors);
1171 
1172     return v;
1173 
1174   onError:
1175     return NULL;
1176 }
1177 
     /* Decode `size` bytes starting at `s` into a unicode object.

        A NULL `encoding` is replaced by PyUnicode_GetDefaultEncoding().
        The encoding name is then compared with strcmp() -- an exact,
        case-sensitive match -- against a few built-in names that are
        dispatched straight to their dedicated decoders: "utf-8",
        "latin-1", "ascii", and "mbcs" (the latter only when both
        MS_WINDOWS and HAVE_USABLE_WCHAR_T are defined).

        Every other name goes through the codec registry: the bytes are
        wrapped with PyBuffer_FromMemory() and handed to PyCodec_Decode().
        The decoder's result must be a unicode object, otherwise TypeError
        is set.  Returns NULL on any failure. */
1178 PyObject *PyUnicode_Decode(const char *s,
1179                            Py_ssize_t size,
1180                            const char *encoding,
1181                            const char *errors)
1182 {
1183     PyObject *buffer = NULL, *unicode;
1184 
1185     if (encoding == NULL)
1186         encoding = PyUnicode_GetDefaultEncoding();
1187 
1188     /* Shortcuts for common default encodings */
1189     if (strcmp(encoding, "utf-8") == 0)
1190         return PyUnicode_DecodeUTF8(s, size, errors);
1191     else if (strcmp(encoding, "latin-1") == 0)
1192         return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194     else if (strcmp(encoding, "mbcs") == 0)
1195         return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197     else if (strcmp(encoding, "ascii") == 0)
1198         return PyUnicode_DecodeASCII(s, size, errors);
1199 
1200     /* Decode via the codec registry */
1201     buffer = PyBuffer_FromMemory((void *)s, size);
1202     if (buffer == NULL)
1203         goto onError;
1204     unicode = PyCodec_Decode(buffer, encoding, errors);
1205     if (unicode == NULL)
1206         goto onError;
1207     if (!PyUnicode_Check(unicode)) {
1208         PyErr_Format(PyExc_TypeError,
1209                      "decoder did not return an unicode object (type=%.400s)",
1210                      Py_TYPE(unicode)->tp_name);
             /* Release the unexpected result before falling into the
                shared cleanup, which only knows about `buffer`. */
1211         Py_DECREF(unicode);
1212         goto onError;
1213     }
1214     Py_DECREF(buffer);
1215     return unicode;
1216 
1217   onError:
         /* `buffer` is still NULL if PyBuffer_FromMemory() failed, hence
            the X variant. */
1218     Py_XDECREF(buffer);
1219     return NULL;
1220 }
1221 
     /* Run `unicode` through the decoder registered for `encoding` and
        return whatever object PyCodec_Decode() produces; the type of the
        result is not checked here.

        A non-unicode argument triggers PyErr_BadArgument().  A NULL
        `encoding` is replaced by the default encoding.  Returns NULL on
        error. */
1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223                                     const char *encoding,
1224                                     const char *errors)
1225 {
1226     PyObject *v;
1227 
1228     if (!PyUnicode_Check(unicode)) {
1229         PyErr_BadArgument();
1230         goto onError;
1231     }
1232 
1233     if (encoding == NULL)
1234         encoding = PyUnicode_GetDefaultEncoding();
1235 
1236     /* Decode via the codec registry */
1237     v = PyCodec_Decode(unicode, encoding, errors);
1238     if (v == NULL)
1239         goto onError;
1240     return v;
1241 
1242   onError:
1243     return NULL;
1244 }
1245 
     /* Encode a raw Py_UNICODE buffer of `size` elements.

        A temporary unicode object is built from the buffer with
        PyUnicode_FromUnicode(), encoded with PyUnicode_AsEncodedString(),
        and then released.  Returns the encoder's result, or NULL if either
        the temporary object could not be created or encoding failed. */
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247                            Py_ssize_t size,
1248                            const char *encoding,
1249                            const char *errors)
1250 {
1251     PyObject *v, *unicode;
1252 
1253     unicode = PyUnicode_FromUnicode(s, size);
1254     if (unicode == NULL)
1255         return NULL;
1256     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
         /* The temporary is dropped whether or not encoding succeeded. */
1257     Py_DECREF(unicode);
1258     return v;
1259 }
1260 
     /* Run `unicode` through the encoder registered for `encoding` and
        return whatever object PyCodec_Encode() produces; unlike
        PyUnicode_AsEncodedString() the result type is not checked here.

        A non-unicode argument triggers PyErr_BadArgument().  A NULL
        `encoding` is replaced by the default encoding.  Returns NULL on
        error. */
1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262                                     const char *encoding,
1263                                     const char *errors)
1264 {
1265     PyObject *v;
1266 
1267     if (!PyUnicode_Check(unicode)) {
1268         PyErr_BadArgument();
1269         goto onError;
1270     }
1271 
1272     if (encoding == NULL)
1273         encoding = PyUnicode_GetDefaultEncoding();
1274 
1275     /* Encode via the codec registry */
1276     v = PyCodec_Encode(unicode, encoding, errors);
1277     if (v == NULL)
1278         goto onError;
1279     return v;
1280 
1281   onError:
1282     return NULL;
1283 }
1284 
     /* Encode `unicode` and return the result, which must be a str object.

        A non-unicode argument triggers PyErr_BadArgument().  A NULL
        `encoding` is replaced by the default encoding.

        When `errors` is NULL, an exact strcmp() match on "utf-8",
        "latin-1", "ascii" (and "mbcs" when MS_WINDOWS and
        HAVE_USABLE_WCHAR_T are both defined) dispatches directly to the
        matching PyUnicode_As*String() helper.  With a non-NULL `errors`,
        or any other encoding name, the codec registry is used and a
        non-str result is rejected with TypeError.  Returns NULL on
        error. */
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286                                     const char *encoding,
1287                                     const char *errors)
1288 {
1289     PyObject *v;
1290 
1291     if (!PyUnicode_Check(unicode)) {
1292         PyErr_BadArgument();
1293         goto onError;
1294     }
1295 
1296     if (encoding == NULL)
1297         encoding = PyUnicode_GetDefaultEncoding();
1298 
1299     /* Shortcuts for common default encodings */
         /* The shortcut helpers take no `errors` argument, so they are only
            used when the caller did not supply one. */
1300     if (errors == NULL) {
1301         if (strcmp(encoding, "utf-8") == 0)
1302             return PyUnicode_AsUTF8String(unicode);
1303         else if (strcmp(encoding, "latin-1") == 0)
1304             return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306         else if (strcmp(encoding, "mbcs") == 0)
1307             return PyUnicode_AsMBCSString(unicode);
1308 #endif
1309         else if (strcmp(encoding, "ascii") == 0)
1310             return PyUnicode_AsASCIIString(unicode);
1311     }
1312 
1313     /* Encode via the codec registry */
1314     v = PyCodec_Encode(unicode, encoding, errors);
1315     if (v == NULL)
1316         goto onError;
1317     if (!PyString_Check(v)) {
1318         PyErr_Format(PyExc_TypeError,
1319                      "encoder did not return a string object (type=%.400s)",
1320                      Py_TYPE(v)->tp_name);
1321         Py_DECREF(v);
1322         goto onError;
1323     }
1324     return v;
1325 
1326   onError:
1327     return NULL;
1328 }
1329 
     /* Return the default-encoded str form of `unicode`, using the `defenc`
        slot of the object as a cache.

        If `defenc` is already set it is returned as is.  Otherwise the
        object is encoded with a NULL encoding (i.e. the default one) and
        the given `errors`; the result is stored in `defenc` only when
        `errors` is NULL.

        `unicode` is cast to PyUnicodeObject* without a type check.

        NOTE(review): no reference count is adjusted in this function, so
        the cached path hands back the pointer held in `defenc`, while the
        errors != NULL path hands back a result that is not stored
        anywhere -- confirm the ownership callers are expected to assume. */
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331                                             const char *errors)
1332 {
1333     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334 
1335     if (v)
1336         return v;
1337     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338     if (v && errors == NULL)
1339         ((PyUnicodeObject *)unicode)->defenc = v;
1340     return v;
1341 }
1342 
     /* Checked accessor for the Py_UNICODE buffer of `unicode`.

        Returns PyUnicode_AS_UNICODE(unicode) when the argument passes
        PyUnicode_Check(); otherwise calls PyErr_BadArgument() and returns
        NULL. */
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 {
1345     if (!PyUnicode_Check(unicode)) {
1346         PyErr_BadArgument();
1347         goto onError;
1348     }
1349     return PyUnicode_AS_UNICODE(unicode);
1350 
1351   onError:
1352     return NULL;
1353 }
1354 
     /* Checked accessor for the length of `unicode`.

        Returns PyUnicode_GET_SIZE(unicode) when the argument passes
        PyUnicode_Check(); otherwise calls PyErr_BadArgument() and returns
        -1. */
1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356 {
1357     if (!PyUnicode_Check(unicode)) {
1358         PyErr_BadArgument();
1359         goto onError;
1360     }
1361     return PyUnicode_GET_SIZE(unicode);
1362 
1363   onError:
1364     return -1;
1365 }
1366 
     /* Return a pointer to the file-level `unicode_default_encoding`
        buffer.  The pointer refers to the buffer itself, not a copy, so
        its contents change when PyUnicode_SetDefaultEncoding() overwrites
        the buffer. */
1367 const char *PyUnicode_GetDefaultEncoding(void)
1368 {
1369     return unicode_default_encoding;
1370 }
1371 
     /* Replace the default encoding name stored in
        `unicode_default_encoding`.

        `encoding` is first looked up with _PyCodec_Lookup(); if that fails
        -1 is returned and the stored name is left untouched.  On success
        the lookup result is released, the name is copied into the buffer,
        and 0 is returned.

        NOTE(review): strncpy() bounded by sizeof(buffer) writes no
        terminating NUL when strlen(encoding) >= sizeof(buffer), so a long
        codec name would leave the buffer unterminated for later strcmp()
        users.  Copying at most sizeof(buffer) - 1 bytes and storing '\0'
        in the final byte would close that gap. */
1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 {
1374     PyObject *v;
1375 
1376     /* Make sure the encoding is valid. As side effect, this also
1377        loads the encoding into the codec registry cache. */
1378     v = _PyCodec_Lookup(encoding);
1379     if (v == NULL)
1380         goto onError;
1381     Py_DECREF(v);
1382     strncpy(unicode_default_encoding,
1383             encoding,
1384             sizeof(unicode_default_encoding));
1385     return 0;
1386 
1387   onError:
1388     return -1;
1389 }
1390 
1391 /* error handling callback helper:
1392    build arguments, call the callback and check the arguments,
1393    if no exception occurred, copy the replacement to the output
1394    and adjust various state variables.
1395    return 0 on success, -1 on error
1396 */
     /* Parameters, as used by the body:
          errors          - handler name, looked up with PyCodec_LookupError()
                            the first time (*errorHandler == NULL).
          errorHandler    - in/out cache for the looked-up handler.
          encoding,reason - passed into the UnicodeDecodeError object.
          input, insize   - the complete input buffer and its length.
          startinpos,
          endinpos        - error range reported to the handler; *endinpos is
                            overwritten with the position the handler asks
                            to resume at.
          exceptionObject - in/out cache for the exception; created on first
                            use, afterwards only start/end/reason are
                            updated.
          inptr           - set to input + resume position.
          output          - output object; may be resized (and therefore
                            replaced) by _PyUnicode_Resize().
          outpos, outptr  - current output index and write pointer; both are
                            advanced past the copied replacement.

        Neither *errorHandler nor *exceptionObject is released here; they
        stay with the caller on both success and failure. */
1397 
1398 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400                                      const char *encoding, const char *reason,
1401                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404 {
         /* The text after the ';' doubles as the TypeError message below. */
1405     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406 
1407     PyObject *restuple = NULL;
1408     PyObject *repunicode = NULL;
1409     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410     Py_ssize_t requiredsize;
1411     Py_ssize_t newpos;
1412     Py_UNICODE *repptr;
1413     Py_ssize_t repsize;
1414     int res = -1;
1415 
1416     if (*errorHandler == NULL) {
1417         *errorHandler = PyCodec_LookupError(errors);
1418         if (*errorHandler == NULL)
1419             goto onError;
1420     }
1421 
1422     if (*exceptionObject == NULL) {
1423         *exceptionObject = PyUnicodeDecodeError_Create(
1424             encoding, input, insize, *startinpos, *endinpos, reason);
1425         if (*exceptionObject == NULL)
1426             goto onError;
1427     }
1428     else {
1429         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430             goto onError;
1431         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432             goto onError;
1433         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434             goto onError;
1435     }
1436 
1437     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438     if (restuple == NULL)
1439         goto onError;
1440     if (!PyTuple_Check(restuple)) {
             /* &argparse[4] skips the 4-character "O!n;" prefix, leaving
                just the human-readable message. */
1441         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442         goto onError;
1443     }
1444     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445         goto onError;
         /* A negative resume position is taken relative to the end of the
            input; after that adjustment it must lie within [0, insize]. */
1446     if (newpos<0)
1447         newpos = insize+newpos;
1448     if (newpos<0 || newpos>insize) {
1449         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450         goto onError;
1451     }
1452 
1453     /* need more space? (at least enough for what we
1454        have+the replacement+the rest of the string (starting
1455        at the new input position), so we won't have to check space
1456        when there are no errors in the rest of the string) */
1457     repptr = PyUnicode_AS_UNICODE(repunicode);
1458     repsize = PyUnicode_GET_SIZE(repunicode);
         /* NOTE(review): this sum is not checked for Py_ssize_t overflow. */
1459     requiredsize = *outpos + repsize + insize-newpos;
1460     if (requiredsize > outsize) {
             /* Grow to at least twice the current size. */
1461         if (requiredsize<2*outsize)
1462             requiredsize = 2*outsize;
1463         if (_PyUnicode_Resize(output, requiredsize) < 0)
1464             goto onError;
             /* Recompute the write pointer from the object's buffer after
                the resize call. */
1465         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466     }
1467     *endinpos = newpos;
1468     *inptr = input + newpos;
1469     Py_UNICODE_COPY(*outptr, repptr, repsize);
1470     *outptr += repsize;
1471     *outpos += repsize;
1472     /* we made it! */
1473     res = 0;
1474 
1475   onError:
         /* Reached on success as well as failure; only the handler's result
            tuple is released here. */
1476     Py_XDECREF(restuple);
1477     return res;
1478 }
1479 
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1481 
1482 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1483 
1484 /* Three simple macros defining base-64. */
     /* All of the macros below are plain textual macros: an argument with
        side effects would be evaluated once per occurrence of `c` / `n` in
        the expansion (up to seven times for FROM_BASE64), so pass simple
        variables only. */
1485 
1486 /* Is c a base-64 character? */
     /* NOTE(review): isalnum() is locale-dependent, so in some locales bytes
        above 127 may be classified as base-64 characters here; FROM_BASE64
        would then map them to 63 through its final fallback -- confirm
        whether explicit ASCII range tests are wanted instead. */
1487 
1488 #define IS_BASE64(c) \
1489     (isalnum(c) || (c) == '+' || (c) == '/')
1490 
1491 /* given that c is a base-64 character, what is its base-64 value? */
     /* Mapping: 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61,
        '+' -> 62, and anything else -> 63 (no validation is done here). */
1492 
1493 #define FROM_BASE64(c)                                                  \
1494     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
1495      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
1496      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
1497      (c) == '+' ? 62 : 63)
1498 
1499 /* What is the base-64 character of the bottom 6 bits of n? */
     /* The `& 0x3f` mask keeps the index inside the 64-character alphabet
        string, whatever the higher bits of n are. */
1500 
1501 #define TO_BASE64(n)  \
1502     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1503 
1504 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1505  * decoded as itself.  We are permissive on decoding; the only ASCII
1506  * byte not decoding to itself is the + which begins a base64
1507  * string. */
1508 
1509 #define DECODE_DIRECT(c)                                \
1510     ((c) <= 127 && (c) != '+')
1511 
1512 /* The UTF-7 encoder treats ASCII characters differently according to
1513  * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1514  * the above).  See RFC2152.  This array identifies these different
1515  * sets:
1516  * 0 : "Set D"
1517  *     alphanumeric and '(),-./:?
1518  * 1 : "Set O"
1519  *     !"#$%&*;<=>@[]^_`{|}
1520  * 2 : "whitespace"
1521  *     ht nl cr sp
1522  * 3 : special (must be base64 encoded)
1523  *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1524  */
     /* The table has exactly 128 entries, indexed by ASCII code; each row
        below covers 16 consecutive codes and is preceded by a comment that
        names them.  It is read through ENCODE_DIRECT, which range-checks
        the index first. */
1525 
1526 static
1527 char utf7_category[128] = {
1528 /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
1529     3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
1530 /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
1531     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
1532 /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
1533     2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
1534 /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
1535     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
1536 /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
1537     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1538 /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
1539     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
1540 /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
1541     1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1542 /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
1543     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
1544 };
1545 
1546 /* ENCODE_DIRECT: this character should be encoded as itself.  The
1547  * answer depends on whether we are encoding set O as itself, and also
1548  * on whether we are encoding whitespace as itself.  RFC2152 makes it
1549  * clear that the answers to these questions vary between
1550  * applications, so this code needs to be flexible.  */
     /* The leading `(c) < 128 && (c) > 0` test short-circuits before any
        table access, so utf7_category is only indexed with 1..127; code 0
        is never encoded directly.  Category 0 is always direct; categories
        2 and 1 are direct only when `directWS` / `directO` are non-zero.
        `c` is expanded up to five times, so it must be free of side
        effects. */
1551 
1552 #define ENCODE_DIRECT(c, directO, directWS)             \
1553     ((c) < 128 && (c) > 0 &&                            \
1554      ((utf7_category[(c)] == 0) ||                      \
1555       (directWS && (utf7_category[(c)] == 2)) ||        \
1556       (directO && (utf7_category[(c)] == 1))))
1557 
     /* Decode a complete UTF-7 byte string.  Thin wrapper that calls
        PyUnicode_DecodeUTF7Stateful() with a NULL `consumed` pointer and
        returns its result unchanged. */
1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559                                Py_ssize_t size,
1560                                const char *errors)
1561 {
1562     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563 }
1564 
1565 /* The decoder.  The only state we preserve is our read position,
1566  * i.e. how many characters we have consumed.  So if we end in the
1567  * middle of a shift sequence we have to back off the read position
1568  * and the output to the beginning of the sequence, otherwise we lose
1569  * all the shift state (seen bits, number of bits seen, high
1570  * surrogate). */
     /* Parameters:
          s, size   - input bytes and their count.
          errors    - error-handler name forwarded to
                      unicode_decode_call_errorhandler().
          consumed  - optional out-parameter.  When non-NULL, an input that
                      ends inside a shift sequence is not an error: the
                      output is rewound to where the sequence started and
                      *consumed receives the input offset of its '+'.
                      Otherwise *consumed receives the number of bytes read.

        Returns the decoded unicode object, or NULL on error.  An empty
        input returns the freshly created empty object immediately (and
        stores 0 in *consumed when it is given). */
1571 
1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573                                        Py_ssize_t size,
1574                                        const char *errors,
1575                                        Py_ssize_t *consumed)
1576 {
1577     const char *starts = s;
1578     Py_ssize_t startinpos;
1579     Py_ssize_t endinpos;
1580     Py_ssize_t outpos;
1581     const char *e;
1582     PyUnicodeObject *unicode;
1583     Py_UNICODE *p;
1584     const char *errmsg = "";
1585     int inShift = 0;
1586     Py_UNICODE *shiftOutStart;
1587     unsigned int base64bits = 0;
1588     unsigned long base64buffer = 0;
1589     Py_UNICODE surrogate = 0;
1590     PyObject *errorHandler = NULL;
1591     PyObject *exc = NULL;
1592 
         /* The output starts out with one element per input byte; it is
            trimmed to the real length at the end, and may be grown by the
            error-handler helper. */
1593     unicode = _PyUnicode_New(size);
1594     if (!unicode)
1595         return NULL;
1596     if (size == 0) {
1597         if (consumed)
1598             *consumed = 0;
1599         return (PyObject *)unicode;
1600     }
1601 
1602     p = unicode->str;
1603     shiftOutStart = p;
1604     e = s + size;
1605 
1606     while (s < e) {
             /* The cast through unsigned char keeps bytes >= 0x80 from
                becoming negative before they are widened. */
1607         Py_UNICODE ch = (unsigned char) *s;
1608 
1609         if (inShift) { /* in a base-64 section */
1610             if (IS_BASE64(ch)) { /* consume a base-64 character */
                     /* Each base-64 character contributes 6 bits to the
                        accumulator. */
1611                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612                 base64bits += 6;
1613                 s++;
1614                 if (base64bits >= 16) {
1615                     /* we have enough bits for a UTF-16 value */
1616                     Py_UNICODE outCh = (Py_UNICODE)
1617                                        (base64buffer >> (base64bits-16));
1618                     base64bits -= 16;
1619                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620                     if (surrogate) {
1621                         /* expecting a second surrogate */
1622                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
                                 /* Wide build: combine the pair into a
                                    single code point. */
1624                             *p++ = (((surrogate & 0x3FF)<<10)
1625                                     | (outCh & 0x3FF)) + 0x10000;
1626 #else
                                 /* Narrow build: store both halves. */
1627                             *p++ = surrogate;
1628                             *p++ = outCh;
1629 #endif
1630                             surrogate = 0;
1631                             continue;
1632                         }
1633                         else {
                                 /* No low surrogate followed: emit the
                                    pending high surrogate on its own and
                                    then process outCh normally below. */
1634                             *p++ = surrogate;
1635                             surrogate = 0;
1636                         }
1637                     }
1638                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639                         /* first surrogate */
1640                         surrogate = outCh;
1641                     }
1642                     else {
1643                         *p++ = outCh;
1644                     }
1645                 }
1646             }
1647             else { /* now leaving a base-64 section */
1648                 inShift = 0;
1649                 s++;
1650                 if (surrogate) {
1651                     *p++ = surrogate;
1652                     surrogate = 0;
1653                 }
1654                 if (base64bits > 0) { /* left-over bits */
1655                     if (base64bits >= 6) {
1656                         /* We've seen at least one base-64 character */
1657                         errmsg = "partial character in shift sequence";
1658                         goto utf7Error;
1659                     }
1660                     else {
1661                         /* Some bits remain; they should be zero */
1662                         if (base64buffer != 0) {
1663                             errmsg = "non-zero padding bits in shift sequence";
1664                             goto utf7Error;
1665                         }
1666                     }
1667                 }
1668                 if (ch != '-') {
1669                     /* '-' is absorbed; other terminating
1670                        characters are preserved */
1671                     *p++ = ch;
1672                 }
1673             }
1674         }
1675         else if ( ch == '+' ) {
                 /* Remember where the sequence began: used both for error
                    reporting and for *consumed on a truncated input. */
1676             startinpos = s-starts;
1677             s++; /* consume '+' */
1678             if (s < e && *s == '-') { /* '+-' encodes '+' */
1679                 s++;
1680                 *p++ = '+';
1681             }
1682             else { /* begin base64-encoded section */
1683                 inShift = 1;
1684                 shiftOutStart = p;
1685                 base64bits = 0;
1686             }
1687         }
1688         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1689             *p++ = ch;
1690             s++;
1691         }
1692         else {
1693             startinpos = s-starts;
1694             s++;
1695             errmsg = "unexpected special character";
1696             goto utf7Error;
1697         }
             /* Normal iterations skip the error block below; it is entered
                only through `goto utf7Error`. */
1698         continue;
1699 utf7Error:
1700         outpos = p-PyUnicode_AS_UNICODE(unicode);
1701         endinpos = s-starts;
             /* On success the helper updates s and p (and possibly unicode)
                through the pointers passed here, and the loop resumes. */
1702         if (unicode_decode_call_errorhandler(
1703                 errors, &errorHandler,
1704                 "utf7", errmsg,
1705                 starts, size, &startinpos, &endinpos, &exc, &s,
1706                 &unicode, &outpos, &p))
1707             goto onError;
1708     }
1709 
1710     /* end of string */
1711 
1712     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1713         /* if we're in an inconsistent state, that's an error */
1714         if (surrogate ||
1715                 (base64bits >= 6) ||
1716                 (base64bits > 0 && base64buffer != 0)) {
1717             outpos = p-PyUnicode_AS_UNICODE(unicode);
1718             endinpos = size;
1719             if (unicode_decode_call_errorhandler(
1720                     errors, &errorHandler,
1721                     "utf7", "unterminated shift sequence",
1722                     starts, size, &startinpos, &endinpos, &exc, &s,
1723                     &unicode, &outpos, &p))
1724                 goto onError;
1725         }
1726     }
1727 
1728     /* return state */
1729     if (consumed) {
1730         if (inShift) {
1731             p = shiftOutStart; /* back off output */
1732             *consumed = startinpos;
1733         }
1734         else {
1735             *consumed = s-starts;
1736         }
1737     }
1738 
         /* Trim the output to the number of elements actually written. */
1739     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1740         goto onError;
1741 
1742     Py_XDECREF(errorHandler);
1743     Py_XDECREF(exc);
1744     return (PyObject *)unicode;
1745 
1746   onError:
1747     Py_XDECREF(errorHandler);
1748     Py_XDECREF(exc);
1749     Py_DECREF(unicode);
1750     return NULL;
1751 }
1752 
1753 
/* Encode the `size` Py_UNICODE items starting at `s` as UTF-7.

   s, size           input buffer and its length in Py_UNICODE items.
   base64SetO,
   base64WhiteSpace  both are negated and handed to ENCODE_DIRECT, which
                     decides whether a character is written as-is or goes
                     through a base64 shift sequence (the macro is defined
                     outside this chunk).
   errors            not referenced in the body.

   Returns a new string object, or NULL on failure: PyErr_NoMemory() when
   the size computation overflows, or a failed allocation/resize.  An
   empty input yields an empty string object.  The output is allocated at
   8 bytes per input item and cut back with _PyString_Resize at the end. */
1754 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1755                                Py_ssize_t size,
1756                                int base64SetO,
1757                                int base64WhiteSpace,
1758                                const char *errors)
1759 {
1760     PyObject *v;
1761     /* It might be possible to tighten this worst case */
1762     Py_ssize_t allocated = 8 * size;
1763     int inShift = 0;
1764     Py_ssize_t i = 0;
1765     unsigned int base64bits = 0;
1766     unsigned long base64buffer = 0;
1767     char * out;
1768     char * start;
1769 
         /* Overflow guard for the 8 * size computed in the initializer above.
            NOTE(review): the product is formed before it is checked; if
            Py_ssize_t is a signed type that overflow is undefined behaviour
            -- confirm. */
1770     if (allocated / 8 != size)
1771         return PyErr_NoMemory();
1772 
1773     if (size == 0)
1774         return PyString_FromStringAndSize(NULL, 0);
1775 
1776     v = PyString_FromStringAndSize(NULL, allocated);
1777     if (v == NULL)
1778         return NULL;
1779 
1780     start = out = PyString_AS_STRING(v);
1781     for (;i < size; ++i) {
1782         Py_UNICODE ch = s[i];
1783 
1784         if (inShift) {
1785             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1786                 /* shifting out */
1787                 if (base64bits) { /* output remaining bits */
1788                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1789                     base64buffer = 0;
1790                     base64bits = 0;
1791                 }
1792                 inShift = 0;
1793                 /* Characters not in the BASE64 set implicitly unshift the sequence
1794                    so no '-' is required, except if the character is itself a '-' */
1795                 if (IS_BASE64(ch) || ch == '-') {
1796                     *out++ = '-';
1797                 }
1798                 *out++ = (char) ch;
1799             }
1800             else {
1801                 goto encode_char;
1802             }
1803         }
1804         else { /* not in a shift sequence */
                 /* A literal '+' is written as the two bytes "+-". */
1805             if (ch == '+') {
1806                 *out++ = '+';
1807                         *out++ = '-';
1808             }
1809             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1810                 *out++ = (char) ch;
1811             }
1812             else {
                     /* Open a shift sequence with '+' and encode this character. */
1813                 *out++ = '+';
1814                 inShift = 1;
1815                 goto encode_char;
1816             }
1817         }
1818         continue;
         /* Shared tail for every character that has to be base64-encoded:
            append its 16-bit unit(s) to base64buffer and emit each complete
            6-bit group. */
1819 encode_char:
1820 #ifdef Py_UNICODE_WIDE
1821         if (ch >= 0x10000) {
1822             /* code first surrogate */
1823             base64bits += 16;
1824             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1825             while (base64bits >= 6) {
1826                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1827                 base64bits -= 6;
1828             }
1829             /* prepare second surrogate */
1830             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1831         }
1832 #endif
1833         base64bits += 16;
1834         base64buffer = (base64buffer << 16) | ch;
1835         while (base64bits >= 6) {
1836             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1837             base64bits -= 6;
1838         }
1839     }
         /* End of input: flush the bits still pending, left-aligned to a full
            6-bit group, and close a shift sequence that is still open. */
1840     if (base64bits)
1841         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1842     if (inShift)
1843         *out++ = '-';
1844 
         /* Shrink the over-allocated string to the bytes actually written. */
1845     if (_PyString_Resize(&v, out - start))
1846         return NULL;
1847     return v;
1848 }
1849 
/* The UTF-7 helper macros are not needed past this point; undefine them so
   they do not leak into the codecs that follow. */
1850 #undef IS_BASE64
1851 #undef FROM_BASE64
1852 #undef TO_BASE64
1853 #undef DECODE_DIRECT
1854 #undef ENCODE_DIRECT
1855 
1856 /* --- UTF-8 Codec -------------------------------------------------------- */
1857 
/* Lookup table indexed by the first byte of a UTF-8 sequence.  Each entry is
   the total length of the sequence in bytes (1 to 4); 0 marks a byte that
   cannot start a sequence (0x80-0xC1 and 0xF5-0xFF in this table). */
1858 static
1859 char utf8_code_length[256] = {
1860     /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
1861        illegal prefix.  See RFC 3629 for details */
1862     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1863     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1864     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1865     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1866     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1867     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1868     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1869     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
1870     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
1871     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1872     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1873     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
1874     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
1875     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
1876     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
1877     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
1878 };
1879 
/* Decode `size` bytes at `s` as UTF-8.  Forwards to
   PyUnicode_DecodeUTF8Stateful with consumed == NULL and returns its
   result unchanged. */
1880 PyObject *PyUnicode_DecodeUTF8(const char *s,
1881                                Py_ssize_t size,
1882                                const char *errors)
1883 {
1884     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1885 }
1886 
/* Decode `size` bytes at `s` as UTF-8 into a new unicode object.

   errors    passed through to unicode_decode_call_errorhandler for every
             malformed sequence.
   consumed  may be NULL.  When non-NULL, a sequence that would run past
             the end of the input is not an error: decoding stops in front
             of it and *consumed receives the number of bytes processed.

   Returns the new object, or NULL if the allocation fails, the error
   handler call returns non-zero, or the final resize fails.  The result
   is first allocated with one Py_UNICODE per input byte and resized to
   the number of items actually written. */
1887 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1888                                        Py_ssize_t size,
1889                                        const char *errors,
1890                                        Py_ssize_t *consumed)
1891 {
1892     const char *starts = s;
1893     int n;
1894     int k;
1895     Py_ssize_t startinpos;
1896     Py_ssize_t endinpos;
1897     Py_ssize_t outpos;
1898     const char *e;
1899     PyUnicodeObject *unicode;
1900     Py_UNICODE *p;
1901     const char *errmsg = "";
1902     PyObject *errorHandler = NULL;
1903     PyObject *exc = NULL;
1904 
1905     /* Note: size will always be longer than the resulting Unicode
1906        character count */
1907     unicode = _PyUnicode_New(size);
1908     if (!unicode)
1909         return NULL;
1910     if (size == 0) {
1911         if (consumed)
1912             *consumed = 0;
1913         return (PyObject *)unicode;
1914     }
1915 
1916     /* Unpack UTF-8 encoded data */
1917     p = unicode->str;
1918     e = s + size;
1919 
1920     while (s < e) {
1921         Py_UCS4 ch = (unsigned char)*s;
1922 
             /* ASCII bytes are copied straight through. */
1923         if (ch < 0x80) {
1924             *p++ = (Py_UNICODE)ch;
1925             s++;
1926             continue;
1927         }
1928 
1929         n = utf8_code_length[ch];
1930 
             /* The sequence announced by the lead byte would extend past the
                input.  In stateful mode stop here and leave it unconsumed;
                otherwise report it, widening the error span over the
                continuation bytes that are actually present. */
1931         if (s + n > e) {
1932             if (consumed)
1933                 break;
1934             else {
1935                 errmsg = "unexpected end of data";
1936                 startinpos = s-starts;
1937                 endinpos = startinpos+1;
1938                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1939                     endinpos++;
1940                 goto utf8Error;
1941             }
1942         }
1943 
1944         switch (n) {
1945 
1946         case 0:
1947             errmsg = "invalid start byte";
1948             startinpos = s-starts;
1949             endinpos = startinpos+1;
1950             goto utf8Error;
1951 
             /* Bytes below 0x80 were handled before the table lookup and the
                table holds 1 only for those bytes, so this arm is defensive. */
1952         case 1:
1953             errmsg = "internal error";
1954             startinpos = s-starts;
1955             endinpos = startinpos+1;
1956             goto utf8Error;
1957 
1958         case 2:
1959             if ((s[1] & 0xc0) != 0x80) {
1960                 errmsg = "invalid continuation byte";
1961                 startinpos = s-starts;
1962                 endinpos = startinpos + 1;
1963                 goto utf8Error;
1964             }
1965             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1966             assert ((ch > 0x007F) && (ch <= 0x07FF));
1967             *p++ = (Py_UNICODE)ch;
1968             break;
1969 
1970         case 3:
1971             /* XXX: surrogates shouldn't be valid UTF-8!
1972                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1973                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1974                Uncomment the 2 lines below to make them invalid,
1975                codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1976             if ((s[1] & 0xc0) != 0x80 ||
1977                 (s[2] & 0xc0) != 0x80 ||
1978                 ((unsigned char)s[0] == 0xE0 &&
1979                  (unsigned char)s[1] < 0xA0)/* ||
1980                 ((unsigned char)s[0] == 0xED &&
1981                  (unsigned char)s[1] > 0x9F)*/) {
1982                 errmsg = "invalid continuation byte";
1983                 startinpos = s-starts;
1984                 endinpos = startinpos + 1;
1985 
1986                 /* if s[1] first two bits are 1 and 0, then the invalid
1987                    continuation byte is s[2], so increment endinpos by 1,
1988                    if not, s[1] is invalid and endinpos doesn't need to
1989                    be incremented. */
1990                 if ((s[1] & 0xC0) == 0x80)
1991                     endinpos++;
1992                 goto utf8Error;
1993             }
1994             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1995             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1996             *p++ = (Py_UNICODE)ch;
1997             break;
1998 
1999         case 4:
                 /* Besides the continuation-byte checks, the second byte is
                    range-checked for lead bytes 0xF0 and 0xF4. */
2000             if ((s[1] & 0xc0) != 0x80 ||
2001                 (s[2] & 0xc0) != 0x80 ||
2002                 (s[3] & 0xc0) != 0x80 ||
2003                 ((unsigned char)s[0] == 0xF0 &&
2004                  (unsigned char)s[1] < 0x90) ||
2005                 ((unsigned char)s[0] == 0xF4 &&
2006                  (unsigned char)s[1] > 0x8F)) {
2007                 errmsg = "invalid continuation byte";
2008                 startinpos = s-starts;
2009                 endinpos = startinpos + 1;
2010                 if ((s[1] & 0xC0) == 0x80) {
2011                     endinpos++;
2012                     if ((s[2] & 0xC0) == 0x80)
2013                         endinpos++;
2014                 }
2015                 goto utf8Error;
2016             }
2017             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2018                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2019             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2020 
2021 #ifdef Py_UNICODE_WIDE
2022             *p++ = (Py_UNICODE)ch;
2023 #else
2024             /*  compute and append the two surrogates: */
2025 
2026             /*  translate from 10000..10FFFF to 0..FFFF */
2027             ch -= 0x10000;
2028 
2029             /*  high surrogate = top 10 bits added to D800 */
2030             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2031 
2032             /*  low surrogate = bottom 10 bits added to DC00 */
2033             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2034 #endif
2035             break;
2036         }
2037         s += n;
2038         continue;
2039 
           /* Error path: the handler is given the addresses of s, unicode,
              outpos and p; a non-zero return aborts the decode. */
2040       utf8Error:
2041         outpos = p-PyUnicode_AS_UNICODE(unicode);
2042         if (unicode_decode_call_errorhandler(
2043                 errors, &errorHandler,
2044                 "utf8", errmsg,
2045                 starts, size, &startinpos, &endinpos, &exc, &s,
2046                 &unicode, &outpos, &p))
2047             goto onError;
2048     }
2049     if (consumed)
2050         *consumed = s-starts;
2051 
2052     /* Adjust length */
2053     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2054         goto onError;
2055 
2056     Py_XDECREF(errorHandler);
2057     Py_XDECREF(exc);
2058     return (PyObject *)unicode;
2059 
2060   onError:
2061     Py_XDECREF(errorHandler);
2062     Py_XDECREF(exc);
2063     Py_DECREF(unicode);
2064     return NULL;
2065 }
2066 
/* Encode the `size` Py_UNICODE items at `s` as UTF-8 and return a new
   string object, or NULL on failure (PyErr_NoMemory() on size overflow, or
   a failed allocation/resize).  `errors` is not referenced in the body.
   A high surrogate directly followed by a low surrogate is combined into
   one 4-byte sequence; any other surrogate is written as a 3-byte
   sequence. */
2067 /* Allocation strategy:  if the string is short, convert into a stack buffer
2068    and allocate exactly as much space needed at the end.  Else allocate the
2069    maximum possible needed (4 result bytes per Unicode character), and return
2070    the excess memory at the end.
2071 */
2072 PyObject *
2073 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2074                      Py_ssize_t size,
2075                      const char *errors)
2076 {
2077 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2078 
2079     Py_ssize_t i;           /* index into s of next input byte */
2080     PyObject *v;        /* result string object */
2081     char *p;            /* next free byte in output buffer */
2082     Py_ssize_t nallocated;  /* number of result bytes allocated */
2083     Py_ssize_t nneeded;        /* number of result bytes needed */
2084     char stackbuf[MAX_SHORT_UNICHARS * 4];
2085 
2086     assert(s != NULL);
2087     assert(size >= 0);
2088 
2089     if (size <= MAX_SHORT_UNICHARS) {
2090         /* Write into the stack buffer; nallocated can't overflow.
2091          * At the end, we'll allocate exactly as much heap space as it
2092          * turns out we need.
2093          */
             /* On this path nallocated is read only by the
                assert(nneeded <= nallocated) further down, so the store is
                dead whenever assert() is compiled out -- that is what the
                analyzer note below reports. */
2094         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
   Value stored to 'nallocated' is never read
   (emitted by clang-analyzer)
TODO: a detailed trace is available in the data model (not yet rendered in this report)2095         v = NULL;   /* will allocate after we're done */
2096         p = stackbuf;
2097     }
2098     else {
2099         /* Overallocate on the heap, and give the excess back at the end. */
2100         nallocated = size * 4;
2101         if (nallocated / 4 != size)  /* overflow! */
2102             return PyErr_NoMemory();
2103         v = PyString_FromStringAndSize(NULL, nallocated);
2104         if (v == NULL)
2105             return NULL;
2106         p = PyString_AS_STRING(v);
2107     }
2108 
2109     for (i = 0; i < size;) {
2110         Py_UCS4 ch = s[i++];
2111 
2112         if (ch < 0x80)
2113             /* Encode ASCII */
2114             *p++ = (char) ch;
2115 
2116         else if (ch < 0x0800) {
2117             /* Encode as two bytes (0x80 <= ch < 0x0800) */
2118             *p++ = (char)(0xc0 | (ch >> 6));
2119             *p++ = (char)(0x80 | (ch & 0x3f));
2120         }
2121         else {
2122             /* Encode UCS2 Unicode ordinals */
2123             if (ch < 0x10000) {
2124                 /* Special case: check for high surrogate */
2125                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126                     Py_UCS4 ch2 = s[i];
2127                     /* Check for low surrogate and combine the two to
2128                        form a UCS4 value */
2129                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2130                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2131                         i++;
2132                         goto encodeUCS4;
2133                     }
2134                     /* Fall through: handles isolated high surrogates */
2135                 }
2136                 *p++ = (char)(0xe0 | (ch >> 12));
2137                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138                 *p++ = (char)(0x80 | (ch & 0x3f));
2139                 continue;
2140             }
2141           encodeUCS4:
2142             /* Encode UCS4 Unicode ordinals */
2143             *p++ = (char)(0xf0 | (ch >> 18));
2144             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146             *p++ = (char)(0x80 | (ch & 0x3f));
2147         }
2148     }
2149 
2150     if (v == NULL) {
2151         /* This was stack allocated. */
2152         nneeded = p - stackbuf;
2153         assert(nneeded <= nallocated);
2154         v = PyString_FromStringAndSize(stackbuf, nneeded);
2155     }
2156     else {
2157         /* Cut back to size actually needed. */
2158         nneeded = p - PyString_AS_STRING(v);
2159         assert(nneeded <= nallocated);
2160         if (_PyString_Resize(&v, nneeded))
2161             return NULL;
2162     }
2163     return v;
2164 
2165 #undef MAX_SHORT_UNICHARS
2166 }
2167 
/* Encode a unicode object as UTF-8.  If `unicode` fails PyUnicode_Check,
   PyErr_BadArgument() is called and NULL is returned; otherwise the
   object's buffer and length are passed to PyUnicode_EncodeUTF8 with
   errors == NULL and that result is returned. */
2168 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2169 {
2170     if (!PyUnicode_Check(unicode)) {
2171         PyErr_BadArgument();
2172         return NULL;
2173     }
2174     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2175                                 PyUnicode_GET_SIZE(unicode),
2176                                 NULL);
2177 }
2178 
2179 /* --- UTF-32 Codec ------------------------------------------------------- */
2180 
/* Decode `size` bytes at `s` as UTF-32.  Forwards to
   PyUnicode_DecodeUTF32Stateful with consumed == NULL and returns its
   result unchanged. */
2181 PyObject *
2182 PyUnicode_DecodeUTF32(const char *s,
2183                       Py_ssize_t size,
2184                       const char *errors,
2185                       int *byteorder)
2186 {
2187     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188 }
2189 
/* Decode `size` bytes at `s` as UTF-32 into a new unicode object.

   errors     passed through to unicode_decode_call_errorhandler.
   byteorder  may be NULL.  When non-NULL, *byteorder gives the initial
              byte order (-1 little endian, 1 big endian, 0 native order
              with BOM detection) and, on the normal exit path, receives
              the final value of bo, which changes only when a BOM was
              recognised.
   consumed   may be NULL.  When non-NULL, fewer than 4 trailing bytes are
              left undecoded instead of being reported as "truncated
              data", and *consumed is set to the number of bytes processed.

   Returns the new object, or NULL if the allocation fails, the error
   handler call returns non-zero, or the final resize fails.
   NOTE(review): for size == 0 the function returns before *byteorder and
   *consumed are written -- confirm callers pre-initialise them. */
2190 PyObject *
2191 PyUnicode_DecodeUTF32Stateful(const char *s,
2192                               Py_ssize_t size,
2193                               const char *errors,
2194                               int *byteorder,
2195                               Py_ssize_t *consumed)
2196 {
2197     const char *starts = s;
2198     Py_ssize_t startinpos;
2199     Py_ssize_t endinpos;
2200     Py_ssize_t outpos;
2201     PyUnicodeObject *unicode;
2202     Py_UNICODE *p;
2203 #ifndef Py_UNICODE_WIDE
2204     int pairs = 0;
2205     const unsigned char *qq;
2206 #else
2207     const int pairs = 0;
2208 #endif
2209     const unsigned char *q, *e;
2210     int bo = 0;       /* assume native ordering by default */
2211     const char *errmsg = "";
2212     /* Offsets from q for retrieving bytes in the right order. */
2213 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2214     int iorder[] = {0, 1, 2, 3};
2215 #else
2216     int iorder[] = {3, 2, 1, 0};
2217 #endif
2218     PyObject *errorHandler = NULL;
2219     PyObject *exc = NULL;
2220     
2221     q = (unsigned char *)s;
2222     e = q + size;
2223 
2224     if (byteorder)
2225         bo = *byteorder;
2226 
2227     /* Check for BOM marks (U+FEFF) in the input and adjust current
2228        byte order setting accordingly. In native mode, the leading BOM
2229        mark is skipped, in all other modes, it is copied to the output
2230        stream as-is (giving a ZWNBSP character). */
2231     if (bo == 0) {
2232         if (size >= 4) {
                 /* NOTE(review): q[...] is an unsigned char that is promoted
                    to int before the shift, so << 24 of a byte >= 0x80 does
                    not fit in a 32-bit int -- consider casting to Py_UCS4
                    first (same pattern in the main loop below). */
2233             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2234                 (q[iorder[1]] << 8) | q[iorder[0]];
2235 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2236             if (bom == 0x0000FEFF) {
2237                 q += 4;
2238                 bo = -1;
2239             }
2240             else if (bom == 0xFFFE0000) {
2241                 q += 4;
2242                 bo = 1;
2243             }
2244 #else
2245             if (bom == 0x0000FEFF) {
2246                 q += 4;
2247                 bo = 1;
2248             }
2249             else if (bom == 0xFFFE0000) {
2250                 q += 4;
2251                 bo = -1;
2252             }
2253 #endif
2254         }
2255     }
2256 
2257     if (bo == -1) {
2258         /* force LE */
2259         iorder[0] = 0;
2260         iorder[1] = 1;
2261         iorder[2] = 2;
2262         iorder[3] = 3;
2263     }
2264     else if (bo == 1) {
2265         /* force BE */
2266         iorder[0] = 3;
2267         iorder[1] = 2;
2268         iorder[2] = 1;
2269         iorder[3] = 0;
2270     }
2271 
2272     /* On narrow builds we split characters outside the BMP into two
2273        codepoints => count how much extra space we need. */
         /* NOTE(review): the loop runs while qq < e in steps of 4.  If the
            remaining length is not a multiple of 4, the last iteration can
            index qq[iorder[2]] / qq[iorder[3]] past e -- verify; a bound of
            e - qq >= 4 would avoid that. */
2274 #ifndef Py_UNICODE_WIDE
2275     for (qq = q; qq < e; qq += 4)
2276         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2277             pairs++;
2278 #endif
2279 
2280     /* This might be one too many, because of a BOM */
2281     unicode = _PyUnicode_New((size+3)/4+pairs);
2282     if (!unicode)
2283         return NULL;
2284     if (size == 0)
2285         return (PyObject *)unicode;
2286 
2287     /* Unpack UTF-32 encoded data */
2288     p = unicode->str;
2289 
2290     while (q < e) {
2291         Py_UCS4 ch;
2292         /* remaining bytes at the end? (size should be divisible by 4) */
2293         if (e-q<4) {
2294             if (consumed)
2295                 break;
2296             errmsg = "truncated data";
2297             startinpos = ((const char *)q)-starts;
2298             endinpos = ((const char *)e)-starts;
2299             goto utf32Error;
2300             /* The remaining input chars are ignored if the callback
2301                chooses to skip the input */
2302         }
2303         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2304             (q[iorder[1]] << 8) | q[iorder[0]];
2305 
2306         if (ch >= 0x110000)
2307         {
2308             errmsg = "codepoint not in range(0x110000)";
2309             startinpos = ((const char *)q)-starts;
2310             endinpos = startinpos+4;
2311             goto utf32Error;
2312         }
2313 #ifndef Py_UNICODE_WIDE
             /* Narrow build: store non-BMP code points as a surrogate pair. */
2314         if (ch >= 0x10000)
2315         {
2316             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2317             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318         }
2319         else
2320 #endif
2321             *p++ = ch;
2322         q += 4;
2323         continue;
           /* Error path: the handler is given the addresses of q, unicode,
              outpos and p; a non-zero return aborts the decode. */
2324       utf32Error:
2325         outpos = p-PyUnicode_AS_UNICODE(unicode);
2326         if (unicode_decode_call_errorhandler(
2327                 errors, &errorHandler,
2328                 "utf32", errmsg,
2329                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2330                 &unicode, &outpos, &p))
2331             goto onError;
2332     }
2333 
2334     if (byteorder)
2335         *byteorder = bo;
2336 
2337     if (consumed)
2338         *consumed = (const char *)q-starts;
2339 
2340     /* Adjust length */
2341     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2342         goto onError;
2343 
2344     Py_XDECREF(errorHandler);
2345     Py_XDECREF(exc);
2346     return (PyObject *)unicode;
2347 
2348   onError:
2349     Py_DECREF(unicode);
2350     Py_XDECREF(errorHandler);
2351     Py_XDECREF(exc);
2352     return NULL;
2353 }
2354 
/* Encode the `size` Py_UNICODE items at `s` as UTF-32 into a new string
   object.

   errors     not referenced in the body.
   byteorder  0 writes a BOM (U+FEFF) first and keeps the native byte
              order; -1 forces little endian and 1 forces big endian,
              neither with a BOM.

   On narrow builds a high surrogate directly followed by a low surrogate
   is combined into a single 4-byte code point.  Returns the new string,
   or NULL on failure (PyErr_NoMemory() when the byte size overflows, or a
   failed allocation). */
2355 PyObject *
2356 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2357                       Py_ssize_t size,
2358                       const char *errors,
2359                       int byteorder)
2360 {
2361     PyObject *v;
2362     unsigned char *p;
2363     Py_ssize_t nsize, bytesize;
2364 #ifndef Py_UNICODE_WIDE
2365     Py_ssize_t i, pairs;
2366 #else
2367     const int pairs = 0;
2368 #endif
2369     /* Offsets from p for storing byte pairs in the right order. */
2370 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2371     int iorder[] = {0, 1, 2, 3};
2372 #else
2373     int iorder[] = {3, 2, 1, 0};
2374 #endif
2375 
     /* Store the four bytes of CH at p in the order given by iorder and
        advance p by 4. */
2376 #define STORECHAR(CH)                           \
2377     do {                                        \
2378         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2379         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2380         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2381         p[iorder[0]] = (CH) & 0xff;             \
2382         p += 4;                                 \
2383     } while(0)
2384 
2385     /* In narrow builds we can output surrogate pairs as one codepoint,
2386        so we need less space. */
2387 #ifndef Py_UNICODE_WIDE
2388     for (i = pairs = 0; i < size-1; i++)
2389         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2390             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2391             pairs++;
2392 #endif
         /* One extra code unit is reserved for the BOM when byteorder == 0.
            NOTE(review): only the multiplication by 4 is overflow-checked;
            the addition that forms nsize is not -- confirm that is
            acceptable. */
2393     nsize = (size - pairs + (byteorder == 0));
2394     bytesize = nsize * 4;
2395     if (bytesize / 4 != nsize)
2396         return PyErr_NoMemory();
2397     v = PyString_FromStringAndSize(NULL, bytesize);
2398     if (v == NULL)
2399         return NULL;
2400 
2401     p = (unsigned char *)PyString_AS_STRING(v);
         /* The BOM is written while iorder still holds the native order;
            iorder is only changed below for byteorder -1 or 1. */
2402     if (byteorder == 0)
2403         STORECHAR(0xFEFF);
2404     if (size == 0)
2405         return v;
2406 
2407     if (byteorder == -1) {
2408         /* force LE */
2409         iorder[0] = 0;
2410         iorder[1] = 1;
2411         iorder[2] = 2;
2412         iorder[3] = 3;
2413     }
2414     else if (byteorder == 1) {
2415         /* force BE */
2416         iorder[0] = 3;
2417         iorder[1] = 2;
2418         iorder[2] = 1;
2419         iorder[3] = 0;
2420     }
2421 
2422     while (size-- > 0) {
2423         Py_UCS4 ch = *s++;
2424 #ifndef Py_UNICODE_WIDE
             /* Narrow build: merge a high/low surrogate pair into one code
                point and consume both input items. */
2425         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2426             Py_UCS4 ch2 = *s;
2427             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2428                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2429                 s++;
2430                 size--;
2431             }
2432         }
2433 #endif
2434         STORECHAR(ch);
2435     }
2436     return v;
2437 #undef STORECHAR
2438 }
2439 
/* Encode a unicode object as UTF-32.  If `unicode` fails PyUnicode_Check,
   PyErr_BadArgument() is called and NULL is returned; otherwise the
   object's buffer and length are passed to PyUnicode_EncodeUTF32 with
   errors == NULL and byteorder == 0, and that result is returned. */
2440 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2441 {
2442     if (!PyUnicode_Check(unicode)) {
2443         PyErr_BadArgument();
2444         return NULL;
2445     }
2446     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2447                                  PyUnicode_GET_SIZE(unicode),
2448                                  NULL,
2449                                  0);
2450 }
2451 
2452 /* --- UTF-16 Codec ------------------------------------------------------- */
2453 
/* Decode `size` bytes at `s` as UTF-16.  Forwards to
   PyUnicode_DecodeUTF16Stateful with consumed == NULL and returns its
   result unchanged. */
2454 PyObject *
2455 PyUnicode_DecodeUTF16(const char *s,
2456                       Py_ssize_t size,
2457                       const char *errors,
2458                       int *byteorder)
2459 {
2460     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2461 }
2462 
/* Decode `size` bytes at `s` as UTF-16 into a new unicode object.

   errors     passed through to unicode_decode_call_errorhandler.
   byteorder  may be NULL.  When non-NULL, *byteorder gives the initial
              byte order (-1 little endian, 1 big endian, 0 native order
              with BOM detection) and, on the normal exit path, receives
              the final value of bo, which changes only when a BOM was
              recognised.
   consumed   may be NULL.  When non-NULL, a single trailing byte is left
              undecoded instead of being reported as "truncated data", and
              *consumed is set to the number of bytes processed.

   Returns the new object, or NULL if the allocation fails, the error
   handler call returns non-zero, or the final resize fails.
   NOTE(review): for size == 0 the function returns before *byteorder and
   *consumed are written -- confirm callers pre-initialise them. */
2463 PyObject *
2464 PyUnicode_DecodeUTF16Stateful(const char *s,
2465                               Py_ssize_t size,
2466                               const char *errors,
2467                               int *byteorder,
2468                               Py_ssize_t *consumed)
2469 {
2470     const char *starts = s;
2471     Py_ssize_t startinpos;
2472     Py_ssize_t endinpos;
2473     Py_ssize_t outpos;
2474     PyUnicodeObject *unicode;
2475     Py_UNICODE *p;
2476     const unsigned char *q, *e;
2477     int bo = 0;       /* assume native ordering by default */
2478     const char *errmsg = "";
2479     /* Offsets from q for retrieving byte pairs in the right order. */
2480 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2481     int ihi = 1, ilo = 0;
2482 #else
2483     int ihi = 0, ilo = 1;
2484 #endif
2485     PyObject *errorHandler = NULL;
2486     PyObject *exc = NULL;
2487 
2488     /* Note: size will always be longer than the resulting Unicode
2489        character count */
2490     unicode = _PyUnicode_New(size);
2491     if (!unicode)
2492         return NULL;
2493     if (size == 0)
2494         return (PyObject *)unicode;
2495 
2496     /* Unpack UTF-16 encoded data */
2497     p = unicode->str;
2498     q = (unsigned char *)s;
2499     e = q + size;
2500 
2501     if (byteorder)
2502         bo = *byteorder;
2503 
2504     /* Check for BOM marks (U+FEFF) in the input and adjust current
2505        byte order setting accordingly. In native mode, the leading BOM
2506        mark is skipped, in all other modes, it is copied to the output
2507        stream as-is (giving a ZWNBSP character). */
2508     if (bo == 0) {
2509         if (size >= 2) {
2510             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2511 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2512             if (bom == 0xFEFF) {
2513                 q += 2;
2514                 bo = -1;
2515             }
2516             else if (bom == 0xFFFE) {
2517                 q += 2;
2518                 bo = 1;
2519             }
2520 #else
2521             if (bom == 0xFEFF) {
2522                 q += 2;
2523                 bo = 1;
2524             }
2525             else if (bom == 0xFFFE) {
2526                 q += 2;
2527                 bo = -1;
2528             }
2529 #endif
2530         }
2531     }
2532 
2533     if (bo == -1) {
2534         /* force LE */
2535         ihi = 1;
2536         ilo = 0;
2537     }
2538     else if (bo == 1) {
2539         /* force BE */
2540         ihi = 0;
2541         ilo = 1;
2542     }
2543 
2544     while (q < e) {
2545         Py_UNICODE ch;
2546         /* remaining bytes at the end? (size should be even) */
2547         if (e-q<2) {
2548             if (consumed)
2549                 break;
2550             errmsg = "truncated data";
2551             startinpos = ((const char *)q)-starts;
2552             endinpos = ((const char *)e)-starts;
2553             goto utf16Error;
2554             /* The remaining input chars are ignored if the callback
2555                chooses to skip the input */
2556         }
2557         ch = (q[ihi] << 8) | q[ilo];
2558 
2559         q += 2;
2560 
             /* Anything outside the surrogate range is stored as-is. */
2561         if (ch < 0xD800 || ch > 0xDFFF) {
2562             *p++ = ch;
2563             continue;
2564         }
2565 
2566         /* UTF-16 code pair: */
             /* NOTE(review): this only catches q == e.  If exactly one byte
                is left (e - q == 1), the ch2 read below indexes q[ihi] or
                q[ilo] one byte past e -- verify.  Also, unlike the
                truncated-data case above, a pair cut off at the end is
                reported as an error even when `consumed` is non-NULL. */
2567         if (q >= e) {
2568             errmsg = "unexpected end of data";
2569             startinpos = (((const char *)q)-2)-starts;
2570             endinpos = ((const char *)e)-starts;
2571             goto utf16Error;
2572         }
2573         if (0xD800 <= ch && ch <= 0xDBFF) {
2574             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2575             q += 2;
2576             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2577 #ifndef Py_UNICODE_WIDE
2578                 *p++ = ch;
2579                 *p++ = ch2;
2580 #else
2581                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2582 #endif
2583                 continue;
2584             }
2585             else {
2586                 errmsg = "illegal UTF-16 surrogate";
2587                 startinpos = (((const char *)q)-4)-starts;
2588                 endinpos = startinpos+2;
2589                 goto utf16Error;
2590             }
2591 
2592         }
             /* Reached only for a low surrogate (0xDC00-0xDFFF) that does
                not follow a high surrogate. */
2593         errmsg = "illegal encoding";
2594         startinpos = (((const char *)q)-2)-starts;
2595         endinpos = startinpos+2;
2596         /* Fall through to report the error */
2597 
2598       utf16Error:
2599         outpos = p-PyUnicode_AS_UNICODE(unicode);
2600         if (unicode_decode_call_errorhandler(
2601                 errors, &errorHandler,
2602                 "utf16", errmsg,
2603                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2604                 &unicode, &outpos, &p))
2605             goto onError;
2606     }
2607 
2608     if (byteorder)
2609         *byteorder = bo;
2610 
2611     if (consumed)
2612         *consumed = (const char *)q-starts;
2613 
2614     /* Adjust length */
2615     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2616         goto onError;
2617 
2618     Py_XDECREF(errorHandler);
2619     Py_XDECREF(exc);
2620     return (PyObject *)unicode;
2621 
2622   onError:
2623     Py_DECREF(unicode);
2624     Py_XDECREF(errorHandler);
2625     Py_XDECREF(exc);
2626     return NULL;
2627 }
2628 
/* Encode the `size` Py_UNICODE items at `s` as UTF-16 into a new string
   object.

   errors     not referenced in the body.
   byteorder  0 writes a BOM (U+FEFF) first and keeps the native byte
              order; -1 forces little endian and 1 forces big endian,
              neither with a BOM.

   On wide builds every code point >= 0x10000 is written as a surrogate
   pair.  Returns the new string, or NULL on failure (PyErr_NoMemory()
   when the size computation overflows, or a failed allocation). */
2629 PyObject *
2630 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2631                       Py_ssize_t size,
2632                       const char *errors,
2633                       int byteorder)
2634 {
2635     PyObject *v;
2636     unsigned char *p;
2637     Py_ssize_t nsize, bytesize;
2638 #ifdef Py_UNICODE_WIDE
2639     Py_ssize_t i, pairs;
2640 #else
2641     const int pairs = 0;
2642 #endif
2643     /* Offsets from p for storing byte pairs in the right order. */
2644 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2645     int ihi = 1, ilo = 0;
2646 #else
2647     int ihi = 0, ilo = 1;
2648 #endif
2649 
     /* Store the two bytes of CH at p in the order given by ihi/ilo and
        advance p by 2. */
2650 #define STORECHAR(CH)                           \
2651     do {                                        \
2652         p[ihi] = ((CH) >> 8) & 0xff;            \
2653         p[ilo] = (CH) & 0xff;                   \
2654         p += 2;                                 \
2655     } while(0)
2656 
         /* Wide build: each code point >= 0x10000 needs one extra 16-bit unit. */
2657 #ifdef Py_UNICODE_WIDE
2658     for (i = pairs = 0; i < size; i++)
2659         if (s[i] >= 0x10000)
2660             pairs++;
2661 #endif
2662     /* 2 * (size + pairs + (byteorder == 0)) */
         /* NOTE(review): the first clause presumably can never be true for a
            Py_ssize_t value; the second clause is the effective guard for
            the addition below. */
2663     if (size > PY_SSIZE_T_MAX ||
2664         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2665         return PyErr_NoMemory();
2666     nsize = size + pairs + (byteorder == 0);
2667     bytesize = nsize * 2;
2668     if (bytesize / 2 != nsize)
2669         return PyErr_NoMemory();
2670     v = PyString_FromStringAndSize(NULL, bytesize);
2671     if (v == NULL)
2672         return NULL;
2673 
2674     p = (unsigned char *)PyString_AS_STRING(v);
         /* The BOM is written while ihi/ilo still hold the native order;
            they are only changed below for byteorder -1 or 1. */
2675     if (byteorder == 0)
2676         STORECHAR(0xFEFF);
2677     if (size == 0)
2678         return v;
2679 
2680     if (byteorder == -1) {
2681         /* force LE */
2682         ihi = 1;
2683         ilo = 0;
2684     }
2685     else if (byteorder == 1) {
2686         /* force BE */
2687         ihi = 0;
2688         ilo = 1;
2689     }
2690 
2691     while (size-- > 0) {
2692         Py_UNICODE ch = *s++;
             /* ch2 stays 0 unless ch is split into a surrogate pair below. */
2693         Py_UNICODE ch2 = 0;
2694 #ifdef Py_UNICODE_WIDE
2695         if (ch >= 0x10000) {
2696             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2697             ch  = 0xD800 | ((ch-0x10000) >> 10);
2698         }
2699 #endif
2700         STORECHAR(ch);
2701         if (ch2)
2702             STORECHAR(ch2);
2703     }
2704     return v;
2705 #undef STORECHAR
2706 }
2707 
2708 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2709 {
2710     if (!PyUnicode_Check(unicode)) {
2711         PyErr_BadArgument();
2712         return NULL;
2713     }
2714     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2715                                  PyUnicode_GET_SIZE(unicode),
2716                                  NULL,
2717                                  0);
2718 }
2719 
2720 /* --- Unicode Escape Codec ----------------------------------------------- */
2721 
2722 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2723 
2724 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2725                                         Py_ssize_t size,
2726                                         const char *errors)
2727 {
2728     const char *starts = s;
2729     Py_ssize_t startinpos;
2730     Py_ssize_t endinpos;
2731     Py_ssize_t outpos;
2732     int i;
2733     PyUnicodeObject *v;
2734     Py_UNICODE *p;
2735     const char *end;
2736     char* message;
2737     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2738     PyObject *errorHandler = NULL;
2739     PyObject *exc = NULL;
2740 
2741     /* Escaped strings will always be longer than the resulting
2742        Unicode string, so we start with size here and then reduce the
2743        length after conversion to the true value.
2744        (but if the error callback returns a long replacement string
2745        we'll have to allocate more space) */
2746     v = _PyUnicode_New(size);
2747     if (v == NULL)
2748         goto onError;
2749     if (size == 0)
2750         return (PyObject *)v;
2751 
2752     p = PyUnicode_AS_UNICODE(v);
2753     end = s + size;
2754 
2755     while (s < end) {
2756         unsigned char c;
2757         Py_UNICODE x;
2758         int digits;
2759 
2760         /* Non-escape characters are interpreted as Unicode ordinals */
2761         if (*s != '\\') {
2762             *p++ = (unsigned char) *s++;
2763             continue;
2764         }
2765 
2766         startinpos = s-starts;
2767         /* \ - Escapes */
2768         s++;
2769         c = *s++;
2770         if (s > end)
2771             c = '\0'; /* Invalid after \ */
2772         switch (c) {
2773 
2774             /* \x escapes */
2775         case '\n': break;
2776         case '\\': *p++ = '\\'; break;
2777         case '\'': *p++ = '\''; break;
2778         case '\"': *p++ = '\"'; break;
2779         case 'b': *p++ = '\b'; break;
2780         case 'f': *p++ = '\014'; break; /* FF */
2781         case 't': *p++ = '\t'; break;
2782         case 'n': *p++ = '\n'; break;
2783         case 'r': *p++ = '\r'; break;
2784         case 'v': *p++ = '\013'; break; /* VT */
2785         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2786 
2787             /* \OOO (octal) escapes */
2788         case '0': case '1': case '2': case '3':
2789         case '4': case '5': case '6': case '7':
2790             x = s[-1] - '0';
2791             if (s < end && '0' <= *s && *s <= '7') {
2792                 x = (x<<3) + *s++ - '0';
2793                 if (s < end && '0' <= *s && *s <= '7')
2794                     x = (x<<3) + *s++ - '0';
2795             }
2796             *p++ = x;
2797             break;
2798 
2799             /* hex escapes */
2800             /* \xXX */
2801         case 'x':
2802             digits = 2;
2803             message = "truncated \\xXX escape";
2804             goto hexescape;
2805 
2806             /* \uXXXX */
2807         case 'u':
2808             digits = 4;
2809             message = "truncated \\uXXXX escape";
2810             goto hexescape;
2811 
2812             /* \UXXXXXXXX */
2813         case 'U':
2814             digits = 8;
2815             message = "truncated \\UXXXXXXXX escape";
2816         hexescape:
2817             chr = 0;
2818             outpos = p-PyUnicode_AS_UNICODE(v);
2819             if (s+digits>end) {
2820                 endinpos = size;
2821                 if (unicode_decode_call_errorhandler(
2822                         errors, &errorHandler,
2823                         "unicodeescape", "end of string in escape sequence",
2824                         starts, size, &startinpos, &endinpos, &exc, &s,
2825                         &v, &outpos, &p))
2826                     goto onError;
2827                 goto nextByte;
2828             }
2829             for (i = 0; i < digits; ++i) {
2830                 c = (unsigned char) s[i];
2831                 if (!isxdigit(c)) {
2832                     endinpos = (s+i+1)-starts;
2833                     if (unicode_decode_call_errorhandler(
2834                             errors, &errorHandler,
2835                             "unicodeescape", message,
2836                             starts, size, &startinpos, &endinpos, &exc, &s,
2837                             &v, &outpos, &p))
2838                         goto onError;
2839                     goto nextByte;
2840                 }
2841                 chr = (chr<<4) & ~0xF;
2842                 if (c >= '0' && c <= '9')
2843                     chr += c - '0';
2844                 else if (c >= 'a' && c <= 'f')
2845                     chr += 10 + c - 'a';
2846                 else
2847                     chr += 10 + c - 'A';
2848             }
2849             s += i;
2850             if (chr == 0xffffffff && PyErr_Occurred())
2851                 /* _decoding_error will have already written into the
2852                    target buffer. */
2853                 break;
2854         store:
2855             /* when we get here, chr is a 32-bit unicode character */
2856             if (chr <= 0xffff)
2857                 /* UCS-2 character */
2858                 *p++ = (Py_UNICODE) chr;
2859             else if (chr <= 0x10ffff) {
2860                 /* UCS-4 character. Either store directly, or as
2861                    surrogate pair. */
2862 #ifdef Py_UNICODE_WIDE
2863                 *p++ = chr;
2864 #else
2865                 chr -= 0x10000L;
2866                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2867                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2868 #endif
2869             } else {
2870                 endinpos = s-starts;
2871                 outpos = p-PyUnicode_AS_UNICODE(v);
2872                 if (unicode_decode_call_errorhandler(
2873                         errors, &errorHandler,
2874                         "unicodeescape", "illegal Unicode character",
2875                         starts, size, &startinpos, &endinpos, &exc, &s,
2876                         &v, &outpos, &p))
2877                     goto onError;
2878             }
2879             break;
2880 
2881             /* \N{name} */
2882         case 'N':
2883             message = "malformed \\N character escape";
2884             if (ucnhash_CAPI == NULL) {
2885                 /* load the unicode data module */
2886                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2887                 if (ucnhash_CAPI == NULL)
2888                     goto ucnhashError;
2889             }
2890             if (*s == '{') {
2891                 const char *start = s+1;
2892                 /* look for the closing brace */
2893                 while (*s != '}' && s < end)
2894                     s++;
2895                 if (s > start && s < end && *s == '}') {
2896                     /* found a name.  look it up in the unicode database */
2897                     message = "unknown Unicode character name";
2898                     s++;
2899                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2900                         goto store;
2901                 }
2902             }
2903             endinpos = s-starts;
2904             outpos = p-PyUnicode_AS_UNICODE(v);
2905             if (unicode_decode_call_errorhandler(
2906                     errors, &errorHandler,
2907                     "unicodeescape", message,
2908                     starts, size, &startinpos, &endinpos, &exc, &s,
2909                     &v, &outpos, &p))
2910                 goto onError;
2911             break;
2912 
2913         default:
2914             if (s > end) {
2915                 message = "\\ at end of string";
2916                 s--;
2917                 endinpos = s-starts;
2918                 outpos = p-PyUnicode_AS_UNICODE(v);
2919                 if (unicode_decode_call_errorhandler(
2920                         errors, &errorHandler,
2921                         "unicodeescape", message,
2922                         starts, size, &startinpos, &endinpos, &exc, &s,
2923                         &v, &outpos, &p))
2924                     goto onError;
2925             }
2926             else {
2927                 *p++ = '\\';
2928                 *p++ = (unsigned char)s[-1];
2929             }
2930             break;
2931         }
2932       nextByte:
2933         ;
2934     }
2935     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2936         goto onError;
2937     Py_XDECREF(errorHandler);
2938     Py_XDECREF(exc);
2939     return (PyObject *)v;
2940 
2941   ucnhashError:
2942     PyErr_SetString(
2943         PyExc_UnicodeError,
2944         "\\N escapes not supported (can't load unicodedata module)"
2945         );
2946     Py_XDECREF(v);
2947     Py_XDECREF(errorHandler);
2948     Py_XDECREF(exc);
2949     return NULL;
2950 
2951   onError:
2952     Py_XDECREF(v);
2953     Py_XDECREF(errorHandler);
2954     Py_XDECREF(exc);
2955     return NULL;
2956 }
2957 
/* The helpers below (findchar and unicodeescape_string) return a
   Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2964 
2965 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2966                                              Py_ssize_t size,
2967                                              Py_UNICODE ch)
2968 {
2969     /* like wcschr, but doesn't stop at NULL characters */
2970 
2971     while (size-- > 0) {
2972         if (*s == ch)
2973             return s;
2974         s++;
2975     }
2976 
2977     return NULL;
2978 }
2979 
2980 static
2981 PyObject *unicodeescape_string(const Py_UNICODE *s,
2982                                Py_ssize_t size,
2983                                int quotes)
2984 {
2985     PyObject *repr;
2986     char *p;
2987 
2988     static const char *hexdigit = "0123456789abcdef";
2989 #ifdef Py_UNICODE_WIDE
2990     const Py_ssize_t expandsize = 10;
2991 #else
2992     const Py_ssize_t expandsize = 6;
2993 #endif
2994 
2995     /* XXX(nnorwitz): rather than over-allocating, it would be
2996        better to choose a different scheme.  Perhaps scan the
2997        first N-chars of the string and allocate based on that size.
2998     */
2999     /* Initial allocation is based on the longest-possible unichr
3000        escape.
3001 
3002        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3003        unichr, so in this case it's the longest unichr escape. In
3004        narrow (UTF-16) builds this is five chars per source unichr
3005        since there are two unichrs in the surrogate pair, so in narrow
3006        (UTF-16) builds it's not the longest unichr escape.
3007 
3008        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3009        so in the narrow (UTF-16) build case it's the longest unichr
3010        escape.
3011     */
3012 
3013     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3014         return PyErr_NoMemory();
3015 
3016     repr = PyString_FromStringAndSize(NULL,
3017                                       2
3018                                       + expandsize*size
3019                                       + 1);
3020     if (repr == NULL)
3021         return NULL;
3022 
3023     p = PyString_AS_STRING(repr);
3024 
3025     if (quotes) {
3026         *p++ = 'u';
3027         *p++ = (findchar(s, size, '\'') &&
3028                 !findchar(s, size, '"')) ? '"' : '\'';
3029     }
3030     while (size-- > 0) {
3031         Py_UNICODE ch = *s++;
3032 
3033         /* Escape quotes and backslashes */
3034         if ((quotes &&
3035              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3036             *p++ = '\\';
3037             *p++ = (char) ch;
3038             continue;
3039         }
3040 
3041 #ifdef Py_UNICODE_WIDE
3042         /* Map 21-bit characters to '\U00xxxxxx' */
3043         else if (ch >= 0x10000) {
3044             *p++ = '\\';
3045             *p++ = 'U';
3046             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3047             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3048             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3049             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3050             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3053             *p++ = hexdigit[ch & 0x0000000F];
3054             continue;
3055         }
3056 #else
3057         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3058         else if (ch >= 0xD800 && ch < 0xDC00) {
3059             Py_UNICODE ch2;
3060             Py_UCS4 ucs;
3061 
3062             ch2 = *s++;
3063             size--;
3064             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3065                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3066                 *p++ = '\\';
3067                 *p++ = 'U';
3068                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3069                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3070                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3071                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3072                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3075                 *p++ = hexdigit[ucs & 0x0000000F];
3076                 continue;
3077             }
3078             /* Fall through: isolated surrogates are copied as-is */
3079             s--;
3080             size++;
3081         }
3082 #endif
3083 
3084         /* Map 16-bit characters to '\uxxxx' */
3085         if (ch >= 256) {
3086             *p++ = '\\';
3087             *p++ = 'u';
3088             *p++ = hexdigit[(ch >> 12) & 0x000F];
3089             *p++ = hexdigit[(ch >> 8) & 0x000F];
3090             *p++ = hexdigit[(ch >> 4) & 0x000F];
3091             *p++ = hexdigit[ch & 0x000F];
3092         }
3093 
3094         /* Map special whitespace to '\t', \n', '\r' */
3095         else if (ch == '\t') {
3096             *p++ = '\\';
3097             *p++ = 't';
3098         }
3099         else if (ch == '\n') {
3100             *p++ = '\\';
3101             *p++ = 'n';
3102         }
3103         else if (ch == '\r') {
3104             *p++ = '\\';
3105             *p++ = 'r';
3106         }
3107 
3108         /* Map non-printable US ASCII to '\xhh' */
3109         else if (ch < ' ' || ch >= 0x7F) {
3110             *p++ = '\\';
3111             *p++ = 'x';
3112             *p++ = hexdigit[(ch >> 4) & 0x000F];
3113             *p++ = hexdigit[ch & 0x000F];
3114         }
3115 
3116         /* Copy everything else as-is */
3117         else
3118             *p++ = (char) ch;
3119     }
3120     if (quotes)
3121         *p++ = PyString_AS_STRING(repr)[1];
3122 
3123     *p = '\0';
3124     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3125         return NULL;
3126     return repr;
3127 }
3128 
3129 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3130                                         Py_ssize_t size)
3131 {
3132     return unicodeescape_string(s, size, 0);
3133 }
3134 
3135 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3136 {
3137     if (!PyUnicode_Check(unicode)) {
3138         PyErr_BadArgument();
3139         return NULL;
3140     }
3141     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3142                                          PyUnicode_GET_SIZE(unicode));
3143 }
3144 
3145 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3146 
3147 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3148                                            Py_ssize_t size,
3149                                            const char *errors)
3150 {
3151     const char *starts = s;
3152     Py_ssize_t startinpos;
3153     Py_ssize_t endinpos;
3154     Py_ssize_t outpos;
3155     PyUnicodeObject *v;
3156     Py_UNICODE *p;
3157     const char *end;
3158     const char *bs;
3159     PyObject *errorHandler = NULL;
3160     PyObject *exc = NULL;
3161 
3162     /* Escaped strings will always be longer than the resulting
3163        Unicode string, so we start with size here and then reduce the
3164        length after conversion to the true value. (But decoding error
3165        handler might have to resize the string) */
3166     v = _PyUnicode_New(size);
3167     if (v == NULL)
3168         goto onError;
3169     if (size == 0)
3170         return (PyObject *)v;
3171     p = PyUnicode_AS_UNICODE(v);
3172     end = s + size;
3173     while (s < end) {
3174         unsigned char c;
3175         Py_UCS4 x;
3176         int i;
3177         int count;
3178 
3179         /* Non-escape characters are interpreted as Unicode ordinals */
3180         if (*s != '\\') {
3181             *p++ = (unsigned char)*s++;
3182             continue;
3183         }
3184         startinpos = s-starts;
3185 
3186         /* \u-escapes are only interpreted iff the number of leading
3187            backslashes if odd */
3188         bs = s;
3189         for (;s < end;) {
3190             if (*s != '\\')
3191                 break;
3192             *p++ = (unsigned char)*s++;
3193         }
3194         if (((s - bs) & 1) == 0 ||
3195             s >= end ||
3196             (*s != 'u' && *s != 'U')) {
3197             continue;
3198         }
3199         p--;
3200         count = *s=='u' ? 4 : 8;
3201         s++;
3202 
3203         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3204         outpos = p-PyUnicode_AS_UNICODE(v);
3205         for (x = 0, i = 0; i < count; ++i, ++s) {
3206             c = (unsigned char)*s;
3207             if (!isxdigit(c)) {
3208                 endinpos = s-starts;
3209                 if (unicode_decode_call_errorhandler(
3210                         errors, &errorHandler,
3211                         "rawunicodeescape", "truncated \\uXXXX",
3212                         starts, size, &startinpos, &endinpos, &exc, &s,
3213                         &v, &outpos, &p))
3214                     goto onError;
3215                 goto nextByte;
3216             }
3217             x = (x<<4) & ~0xF;
3218             if (c >= '0' && c <= '9')
3219                 x += c - '0';
3220             else if (c >= 'a' && c <= 'f')
3221                 x += 10 + c - 'a';
3222             else
3223                 x += 10 + c - 'A';
3224         }
3225         if (x <= 0xffff)
3226             /* UCS-2 character */
3227             *p++ = (Py_UNICODE) x;
3228         else if (x <= 0x10ffff) {
3229             /* UCS-4 character. Either store directly, or as
3230                surrogate pair. */
3231 #ifdef Py_UNICODE_WIDE
3232             *p++ = (Py_UNICODE) x;
3233 #else
3234             x -= 0x10000L;
3235             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3236             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3237 #endif
3238         } else {
3239             endinpos = s-starts;
3240             outpos = p-PyUnicode_AS_UNICODE(v);
3241             if (unicode_decode_call_errorhandler(
3242                     errors, &errorHandler,
3243                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3244                     starts, size, &startinpos, &endinpos, &exc, &s,
3245                     &v, &outpos, &p))
3246                 goto onError;
3247         }
3248       nextByte:
3249         ;
3250     }
3251     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3252         goto onError;
3253     Py_XDECREF(errorHandler);
3254     Py_XDECREF(exc);
3255     return (PyObject *)v;
3256 
3257   onError:
3258     Py_XDECREF(v);
3259     Py_XDECREF(errorHandler);
3260     Py_XDECREF(exc);
3261     return NULL;
3262 }
3263 
3264 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3265                                            Py_ssize_t size)
3266 {
3267     PyObject *repr;
3268     char *p;
3269     char *q;
3270 
3271     static const char *hexdigit = "0123456789abcdef";
3272 #ifdef Py_UNICODE_WIDE
3273     const Py_ssize_t expandsize = 10;
3274 #else
3275     const Py_ssize_t expandsize = 6;
3276 #endif
3277 
3278     if (size > PY_SSIZE_T_MAX / expandsize)
3279         return PyErr_NoMemory();
3280 
3281     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3282     if (repr == NULL)
3283         return NULL;
3284     if (size == 0)
3285         return repr;
3286 
3287     p = q = PyString_AS_STRING(repr);
3288     while (size-- > 0) {
3289         Py_UNICODE ch = *s++;
3290 #ifdef Py_UNICODE_WIDE
3291         /* Map 32-bit characters to '\Uxxxxxxxx' */
3292         if (ch >= 0x10000) {
3293             *p++ = '\\';
3294             *p++ = 'U';
3295             *p++ = hexdigit[(ch >> 28) & 0xf];
3296             *p++ = hexdigit[(ch >> 24) & 0xf];
3297             *p++ = hexdigit[(ch >> 20) & 0xf];
3298             *p++ = hexdigit[(ch >> 16) & 0xf];
3299             *p++ = hexdigit[(ch >> 12) & 0xf];
3300             *p++ = hexdigit[(ch >> 8) & 0xf];
3301             *p++ = hexdigit[(ch >> 4) & 0xf];
3302             *p++ = hexdigit[ch & 15];
3303         }
3304         else
3305 #else
3306             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3307             if (ch >= 0xD800 && ch < 0xDC00) {
3308                 Py_UNICODE ch2;
3309                 Py_UCS4 ucs;
3310 
3311                 ch2 = *s++;
3312                 size--;
3313                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3314                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3315                     *p++ = '\\';
3316                     *p++ = 'U';
3317                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3318                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3319                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3320                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3321                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3324                     *p++ = hexdigit[ucs & 0xf];
3325                     continue;
3326                 }
3327                 /* Fall through: isolated surrogates are copied as-is */
3328                 s--;
3329                 size++;
3330             }
3331 #endif
3332         /* Map 16-bit characters to '\uxxxx' */
3333         if (ch >= 256) {
3334             *p++ = '\\';
3335             *p++ = 'u';
3336             *p++ = hexdigit[(ch >> 12) & 0xf];
3337             *p++ = hexdigit[(ch >> 8) & 0xf];
3338             *p++ = hexdigit[(ch >> 4) & 0xf];
3339             *p++ = hexdigit[ch & 15];
3340         }
3341         /* Copy everything else as-is */
3342         else
3343             *p++ = (char) ch;
3344     }
3345     *p = '\0';
3346     if (_PyString_Resize(&repr, p - q))
3347         return NULL;
3348     return repr;
3349 }
3350 
3351 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3352 {
3353     if (!PyUnicode_Check(unicode)) {
3354         PyErr_BadArgument();
3355         return NULL;
3356     }
3357     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3358                                             PyUnicode_GET_SIZE(unicode));
3359 }
3360 
3361 /* --- Unicode Internal Codec ------------------------------------------- */
3362 
3363 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3364                                            Py_ssize_t size,
3365                                            const char *errors)
3366 {
3367     const char *starts = s;
3368     Py_ssize_t startinpos;
3369     Py_ssize_t endinpos;
3370     Py_ssize_t outpos;
3371     PyUnicodeObject *v;
3372     Py_UNICODE *p;
3373     const char *end;
3374     const char *reason;
3375     PyObject *errorHandler = NULL;
3376     PyObject *exc = NULL;
3377 
3378 #ifdef Py_UNICODE_WIDE
3379     Py_UNICODE unimax = PyUnicode_GetMax();
3380 #endif
3381 
3382     /* XXX overflow detection missing */
3383     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3384     if (v == NULL)
3385         goto onError;
3386     if (PyUnicode_GetSize((PyObject *)v) == 0)
3387         return (PyObject *)v;
3388     p = PyUnicode_AS_UNICODE(v);
3389     end = s + size;
3390 
3391     while (s < end) {
3392         memcpy(p, s, sizeof(Py_UNICODE));
3393         /* We have to sanity check the raw data, otherwise doom looms for
3394            some malformed UCS-4 data. */
3395         if (
3396 #ifdef Py_UNICODE_WIDE
3397             *p > unimax || *p < 0 ||
3398 #endif
3399             end-s < Py_UNICODE_SIZE
3400             )
3401         {
3402             startinpos = s - starts;
3403             if (end-s < Py_UNICODE_SIZE) {
3404                 endinpos = end-starts;
3405                 reason = "truncated input";
3406             }
3407             else {
3408                 endinpos = s - starts + Py_UNICODE_SIZE;
3409                 reason = "illegal code point (> 0x10FFFF)";
3410             }
3411             outpos = p - PyUnicode_AS_UNICODE(v);
3412             if (unicode_decode_call_errorhandler(
3413                     errors, &errorHandler,
3414                     "unicode_internal", reason,
3415                     starts, size, &startinpos, &endinpos, &exc, &s,
3416                     &v, &outpos, &p)) {
3417                 goto onError;
3418             }
3419         }
3420         else {
3421             p++;
3422             s += Py_UNICODE_SIZE;
3423         }
3424     }
3425 
3426     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3427         goto onError;
3428     Py_XDECREF(errorHandler);
3429     Py_XDECREF(exc);
3430     return (PyObject *)v;
3431 
3432   onError:
3433     Py_XDECREF(v);
3434     Py_XDECREF(errorHandler);
3435     Py_XDECREF(exc);
3436     return NULL;
3437 }
3438 
3439 /* --- Latin-1 Codec ------------------------------------------------------ */
3440 
3441 PyObject *PyUnicode_DecodeLatin1(const char *s,
3442                                  Py_ssize_t size,
3443                                  const char *errors)
3444 {
3445     PyUnicodeObject *v;
3446     Py_UNICODE *p;
3447 
3448     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3449     if (size == 1) {
3450         Py_UNICODE r = *(unsigned char*)s;
3451         return PyUnicode_FromUnicode(&r, 1);
3452     }
3453 
3454     v = _PyUnicode_New(size);
3455     if (v == NULL)
3456         goto onError;
3457     if (size == 0)
3458         return (PyObject *)v;
3459     p = PyUnicode_AS_UNICODE(v);
3460     while (size-- > 0)
3461         *p++ = (unsigned char)*s++;
3462     return (PyObject *)v;
3463 
3464   onError:
3465     Py_XDECREF(v);
3466     return NULL;
3467 }
3468 
3469 /* create or adjust a UnicodeEncodeError */
3470 static void make_encode_exception(PyObject **exceptionObject,
3471                                   const char *encoding,
3472                                   const Py_UNICODE *unicode, Py_ssize_t size,
3473                                   Py_ssize_t startpos, Py_ssize_t endpos,
3474                                   const char *reason)
3475 {
3476     if (*exceptionObject == NULL) {
3477         *exceptionObject = PyUnicodeEncodeError_Create(
3478             encoding, unicode, size, startpos, endpos, reason);
3479     }
3480     else {
3481         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3482             goto onError;
3483         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3484             goto onError;
3485         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3486             goto onError;
3487         return;
3488       onError:
3489         Py_DECREF(*exceptionObject);
3490         *exceptionObject = NULL;
3491     }
3492 }
3493 
3494 /* raises a UnicodeEncodeError */
3495 static void raise_encode_exception(PyObject **exceptionObject,
3496                                    const char *encoding,
3497                                    const Py_UNICODE *unicode, Py_ssize_t size,
3498                                    Py_ssize_t startpos, Py_ssize_t endpos,
3499                                    const char *reason)
3500 {
3501     make_encode_exception(exceptionObject,
3502                           encoding, unicode, size, startpos, endpos, reason);
3503     if (*exceptionObject != NULL)
3504         PyCodec_StrictErrors(*exceptionObject);
3505 }
3506 
3507 /* error handling callback helper:
3508    build arguments, call the callback and check the arguments,
3509    put the result into newpos and return the replacement string, which
3510    has to be freed by the caller */
3511 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3512                                                   PyObject **errorHandler,
3513                                                   const char *encoding, const char *reason,
3514                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3515                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3516                                                   Py_ssize_t *newpos)
3517 {
3518     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3519 
3520     PyObject *restuple;
3521     PyObject *resunicode;
3522 
3523     if (*errorHandler == NULL) {
3524         *errorHandler = PyCodec_LookupError(errors);
3525         if (*errorHandler == NULL)
3526             return NULL;
3527     }
3528 
3529     make_encode_exception(exceptionObject,
3530                           encoding, unicode, size, startpos, endpos, reason);
3531     if (*exceptionObject == NULL)
3532         return NULL;
3533 
3534     restuple = PyObject_CallFunctionObjArgs(
3535         *errorHandler, *exceptionObject, NULL);
3536     if (restuple == NULL)
3537         return NULL;
3538     if (!PyTuple_Check(restuple)) {
3539         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3540         Py_DECREF(restuple);
3541         return NULL;
3542     }
3543     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3544                           &resunicode, newpos)) {
3545         Py_DECREF(restuple);
3546         return NULL;
3547     }
3548     if (*newpos<0)
3549         *newpos = size+*newpos;
3550     if (*newpos<0 || *newpos>size) {
3551         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3552         Py_DECREF(restuple);
3553         return NULL;
3554     }
3555     Py_INCREF(resunicode);
3556     Py_DECREF(restuple);
3557     return resunicode;
3558 }
3559 
3560 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3561                                      Py_ssize_t size,
3562                                      const char *errors,
3563                                      int limit)
3564 {
3565     /* output object */
3566     PyObject *res;
3567     /* pointers to the beginning and end+1 of input */
3568     const Py_UNICODE *startp = p;
3569     const Py_UNICODE *endp = p + size;
3570     /* pointer to the beginning of the unencodable characters */
3571     /* const Py_UNICODE *badp = NULL; */
3572     /* pointer into the output */
3573     char *str;
3574     /* current output position */
3575     Py_ssize_t respos = 0;
3576     Py_ssize_t ressize;
3577     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3578     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3579     PyObject *errorHandler = NULL;
3580     PyObject *exc = NULL;
3581     /* the following variable is used for caching string comparisons
3582      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3583     int known_errorHandler = -1;
3584 
3585     /* allocate enough for a simple encoding without
3586        replacements, if we need more, we'll resize */
3587     res = PyString_FromStringAndSize(NULL, size);
3588     if (res == NULL)
3589         goto onError;
3590     if (size == 0)
3591         return res;
3592     str = PyString_AS_STRING(res);
3593     ressize = size;
3594 
3595     while (p<endp) {
3596         Py_UNICODE c = *p;
3597 
3598         /* can we encode this? */
3599         if (c<limit) {
3600             /* no overflow check, because we know that the space is enough */
3601             *str++ = (char)c;
3602             ++p;
3603         }
3604         else {
3605             Py_ssize_t unicodepos = p-startp;
3606             Py_ssize_t requiredsize;
3607             PyObject *repunicode;
3608             Py_ssize_t repsize;
3609             Py_ssize_t newpos;
3610             Py_ssize_t respos;
3611             Py_UNICODE *uni2;
3612             /* startpos for collecting unencodable chars */
3613             const Py_UNICODE *collstart = p;
3614             const Py_UNICODE *collend = p;
3615             /* find all unecodable characters */
3616             while ((collend < endp) && ((*collend)>=limit))
3617                 ++collend;
3618             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3619             if (known_errorHandler==-1) {
3620                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3621                     known_errorHandler = 1;
3622                 else if (!strcmp(errors, "replace"))
3623                     known_errorHandler = 2;
3624                 else if (!strcmp(errors, "ignore"))
3625                     known_errorHandler = 3;
3626                 else if (!strcmp(errors, "xmlcharrefreplace"))
3627                     known_errorHandler = 4;
3628                 else
3629                     known_errorHandler = 0;
3630             }
3631             switch (known_errorHandler) {
3632             case 1: /* strict */
3633                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3634                 goto onError;
3635             case 2: /* replace */
3636                 while (collstart++<collend)
3637                     *str++ = '?'; /* fall through */
3638             case 3: /* ignore */
3639                 p = collend;
3640                 break;
3641             case 4: /* xmlcharrefreplace */
3642                 respos = str-PyString_AS_STRING(res);
3643                 /* determine replacement size (temporarily (mis)uses p) */
3644                 for (p = collstart, repsize = 0; p < collend; ++p) {
3645                     if (*p<10)
3646                         repsize += 2+1+1;
3647                     else if (*p<100)
3648                         repsize += 2+2+1;
3649                     else if (*p<1000)
3650                         repsize += 2+3+1;
3651                     else if (*p<10000)
3652                         repsize += 2+4+1;
3653 #ifndef Py_UNICODE_WIDE
3654                     else
3655                         repsize += 2+5+1;
3656 #else
3657                     else if (*p<100000)
3658                         repsize += 2+5+1;
3659                     else if (*p<1000000)
3660                         repsize += 2+6+1;
3661                     else
3662                         repsize += 2+7+1;
3663 #endif
3664                 }
3665                 requiredsize = respos+repsize+(endp-collend);
3666                 if (requiredsize > ressize) {
3667                     if (requiredsize<2*ressize)
3668                         requiredsize = 2*ressize;
3669                     if (_PyString_Resize(&res, requiredsize))
3670                         goto onError;
3671                     str = PyString_AS_STRING(res) + respos;
3672                     ressize = requiredsize;
3673                 }
3674                 /* generate replacement (temporarily (mis)uses p) */
3675                 for (p = collstart; p < collend; ++p) {
3676                     str += sprintf(str, "&#%d;", (int)*p);
3677                 }
3678                 p = collend;
3679                 break;
3680             default:
3681                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3682                                                               encoding, reason, startp, size, &exc,
3683                                                               collstart-startp, collend-startp, &newpos);
3684                 if (repunicode == NULL)
3685                     goto onError;
3686                 /* need more space? (at least enough for what we have+the
3687                    replacement+the rest of the string, so we won't have to
3688                    check space for encodable characters) */
3689                 respos = str-PyString_AS_STRING(res);
3690                 repsize = PyUnicode_GET_SIZE(repunicode);
3691                 requiredsize = respos+repsize+(endp-collend);
3692                 if (requiredsize > ressize) {
3693                     if (requiredsize<2*ressize)
3694                         requiredsize = 2*ressize;
3695                     if (_PyString_Resize(&res, requiredsize)) {
3696                         Py_DECREF(repunicode);
3697                         goto onError;
3698                     }
3699                     str = PyString_AS_STRING(res) + respos;
3700                     ressize = requiredsize;
3701                 }
3702                 /* check if there is anything unencodable in the replacement
3703                    and copy it to the output */
3704                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3705                     c = *uni2;
3706                     if (c >= limit) {
3707                         raise_encode_exception(&exc, encoding, startp, size,
3708                                                unicodepos, unicodepos+1, reason);
3709                         Py_DECREF(repunicode);
3710                         goto onError;
3711                     }
3712                     *str = (char)c;
3713                 }
3714                 p = startp + newpos;
3715                 Py_DECREF(repunicode);
3716             }
3717         }
3718     }
3719     /* Resize if we allocated to much */
3720     respos = str-PyString_AS_STRING(res);
3721     if (respos<ressize)
3722         /* If this falls res will be NULL */
3723         _PyString_Resize(&res, respos);
3724     Py_XDECREF(errorHandler);
3725     Py_XDECREF(exc);
3726     return res;
3727 
3728   onError:
3729     Py_XDECREF(res);
3730     Py_XDECREF(errorHandler);
3731     Py_XDECREF(exc);
3732     return NULL;
3733 }
3734 
3735 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3736                                  Py_ssize_t size,
3737                                  const char *errors)
3738 {
3739     return unicode_encode_ucs1(p, size, errors, 256);
3740 }
3741 
3742 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3743 {
3744     if (!PyUnicode_Check(unicode)) {
3745         PyErr_BadArgument();
3746         return NULL;
3747     }
3748     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3749                                   PyUnicode_GET_SIZE(unicode),
3750                                   NULL);
3751 }
3752 
3753 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3754 
3755 PyObject *PyUnicode_DecodeASCII(const char *s,
3756                                 Py_ssize_t size,
3757                                 const char *errors)
3758 {
3759     const char *starts = s;
3760     PyUnicodeObject *v;
3761     Py_UNICODE *p;
3762     Py_ssize_t startinpos;
3763     Py_ssize_t endinpos;
3764     Py_ssize_t outpos;
3765     const char *e;
3766     PyObject *errorHandler = NULL;
3767     PyObject *exc = NULL;
3768 
3769     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3770     if (size == 1 && *(unsigned char*)s < 128) {
3771         Py_UNICODE r = *(unsigned char*)s;
3772         return PyUnicode_FromUnicode(&r, 1);
3773     }
3774 
3775     v = _PyUnicode_New(size);
3776     if (v == NULL)
3777         goto onError;
3778     if (size == 0)
3779         return (PyObject *)v;
3780     p = PyUnicode_AS_UNICODE(v);
3781     e = s + size;
3782     while (s < e) {
3783         register unsigned char c = (unsigned char)*s;
3784         if (c < 128) {
3785             *p++ = c;
3786             ++s;
3787         }
3788         else {
3789             startinpos = s-starts;
3790             endinpos = startinpos + 1;
3791             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3792             if (unicode_decode_call_errorhandler(
3793                     errors, &errorHandler,
3794                     "ascii", "ordinal not in range(128)",
3795                     starts, size, &startinpos, &endinpos, &exc, &s,
3796                     &v, &outpos, &p))
3797                 goto onError;
3798         }
3799     }
3800     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3801         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3802             goto onError;
3803     Py_XDECREF(errorHandler);
3804     Py_XDECREF(exc);
3805     return (PyObject *)v;
3806 
3807   onError:
3808     Py_XDECREF(v);
3809     Py_XDECREF(errorHandler);
3810     Py_XDECREF(exc);
3811     return NULL;
3812 }
3813 
3814 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3815                                 Py_ssize_t size,
3816                                 const char *errors)
3817 {
3818     return unicode_encode_ucs1(p, size, errors, 128);
3819 }
3820 
3821 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3822 {
3823     if (!PyUnicode_Check(unicode)) {
3824         PyErr_BadArgument();
3825         return NULL;
3826     }
3827     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3828                                  PyUnicode_GET_SIZE(unicode),
3829                                  NULL);
3830 }
3831 
3832 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3833 
3834 /* --- MBCS codecs for Windows -------------------------------------------- */
3835 
3836 #if SIZEOF_INT < SIZEOF_SIZE_T
3837 #define NEED_RETRY
3838 #endif
3839 
3840 /* XXX This code is limited to "true" double-byte encodings, as
3841    a) it assumes an incomplete character consists of a single byte, and
3842    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3843    encodings, see IsDBCSLeadByteEx documentation. */
3844 
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when
   CharPrev() shows it is not itself the trail byte of the preceding
   character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3855 
3856 /*
3857  * Decode MBCS string into unicode object. If 'final' is set, converts
3858  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3859  */
3860 static int decode_mbcs(PyUnicodeObject **v,
3861                        const char *s, /* MBCS string */
3862                        int size, /* sizeof MBCS string */
3863                        int final)
3864 {
3865     Py_UNICODE *p;
3866     Py_ssize_t n = 0;
3867     int usize = 0;
3868 
3869     assert(size >= 0);
3870 
3871     /* Skip trailing lead-byte unless 'final' is set */
3872     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3873         --size;
3874 
3875     /* First get the size of the result */
3876     if (size > 0) {
3877         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3878         if (usize == 0) {
3879             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3880             return -1;
3881         }
3882     }
3883 
3884     if (*v == NULL) {
3885         /* Create unicode object */
3886         *v = _PyUnicode_New(usize);
3887         if (*v == NULL)
3888             return -1;
3889     }
3890     else {
3891         /* Extend unicode object */
3892         n = PyUnicode_GET_SIZE(*v);
3893         if (_PyUnicode_Resize(v, n + usize) < 0)
3894             return -1;
3895     }
3896 
3897     /* Do the conversion */
3898     if (size > 0) {
3899         p = PyUnicode_AS_UNICODE(*v) + n;
3900         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3901             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3902             return -1;
3903         }
3904     }
3905 
3906     return size;
3907 }
3908 
3909 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3910                                        Py_ssize_t size,
3911                                        const char *errors,
3912                                        Py_ssize_t *consumed)
3913 {
3914     PyUnicodeObject *v = NULL;
3915     int done;
3916 
3917     if (consumed)
3918         *consumed = 0;
3919 
3920 #ifdef NEED_RETRY
3921   retry:
3922     if (size > INT_MAX)
3923         done = decode_mbcs(&v, s, INT_MAX, 0);
3924     else
3925 #endif
3926         done = decode_mbcs(&v, s, (int)size, !consumed);
3927 
3928     if (done < 0) {
3929         Py_XDECREF(v);
3930         return NULL;
3931     }
3932 
3933     if (consumed)
3934         *consumed += done;
3935 
3936 #ifdef NEED_RETRY
3937     if (size > INT_MAX) {
3938         s += done;
3939         size -= done;
3940         goto retry;
3941     }
3942 #endif
3943 
3944     return (PyObject *)v;
3945 }
3946 
3947 PyObject *PyUnicode_DecodeMBCS(const char *s,
3948                                Py_ssize_t size,
3949                                const char *errors)
3950 {
3951     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3952 }
3953 
3954 /*
3955  * Convert unicode into string object (MBCS).
3956  * Returns 0 if succeed, -1 otherwise.
3957  */
3958 static int encode_mbcs(PyObject **repr,
3959                        const Py_UNICODE *p, /* unicode */
3960                        int size) /* size of unicode */
3961 {
3962     int mbcssize = 0;
3963     Py_ssize_t n = 0;
3964 
3965     assert(size >= 0);
3966 
3967     /* First get the size of the result */
3968     if (size > 0) {
3969         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3970         if (mbcssize == 0) {
3971             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3972             return -1;
3973         }
3974     }
3975 
3976     if (*repr == NULL) {
3977         /* Create string object */
3978         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3979         if (*repr == NULL)
3980             return -1;
3981     }
3982     else {
3983         /* Extend string object */
3984         n = PyString_Size(*repr);
3985         if (_PyString_Resize(repr, n + mbcssize) < 0)
3986             return -1;
3987     }
3988 
3989     /* Do the conversion */
3990     if (size > 0) {
3991         char *s = PyString_AS_STRING(*repr) + n;
3992         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3993             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3994             return -1;
3995         }
3996     }
3997 
3998     return 0;
3999 }
4000 
4001 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4002                                Py_ssize_t size,
4003                                const char *errors)
4004 {
4005     PyObject *repr = NULL;
4006     int ret;
4007 
4008 #ifdef NEED_RETRY
4009   retry:
4010     if (size > INT_MAX)
4011         ret = encode_mbcs(&repr, p, INT_MAX);
4012     else
4013 #endif
4014         ret = encode_mbcs(&repr, p, (int)size);
4015 
4016     if (ret < 0) {
4017         Py_XDECREF(repr);
4018         return NULL;
4019     }
4020 
4021 #ifdef NEED_RETRY
4022     if (size > INT_MAX) {
4023         p += INT_MAX;
4024         size -= INT_MAX;
4025         goto retry;
4026     }
4027 #endif
4028 
4029     return repr;
4030 }
4031 
4032 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4033 {
4034     if (!PyUnicode_Check(unicode)) {
4035         PyErr_BadArgument();
4036         return NULL;
4037     }
4038     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4039                                 PyUnicode_GET_SIZE(unicode),
4040                                 NULL);
4041 }
4042 
4043 #undef NEED_RETRY
4044 
4045 #endif /* MS_WINDOWS */
4046 
4047 /* --- Character Mapping Codec -------------------------------------------- */
4048 
4049 PyObject *PyUnicode_DecodeCharmap(const char *s,
4050                                   Py_ssize_t size,
4051                                   PyObject *mapping,
4052                                   const char *errors)
4053 {
4054     const char *starts = s;
4055     Py_ssize_t startinpos;
4056     Py_ssize_t endinpos;
4057     Py_ssize_t outpos;
4058     const char *e;
4059     PyUnicodeObject *v;
4060     Py_UNICODE *p;
4061     Py_ssize_t extrachars = 0;
4062     PyObject *errorHandler = NULL;
4063     PyObject *exc = NULL;
4064     Py_UNICODE *mapstring = NULL;
4065     Py_ssize_t maplen = 0;
4066 
4067     /* Default to Latin-1 */
4068     if (mapping == NULL)
4069         return PyUnicode_DecodeLatin1(s, size, errors);
4070 
4071     v = _PyUnicode_New(size);
4072     if (v == NULL)
4073         goto onError;
4074     if (size == 0)
4075         return (PyObject *)v;
4076     p = PyUnicode_AS_UNICODE(v);
4077     e = s + size;
4078     if (PyUnicode_CheckExact(mapping)) {
4079         mapstring = PyUnicode_AS_UNICODE(mapping);
4080         maplen = PyUnicode_GET_SIZE(mapping);
4081         while (s < e) {
4082             unsigned char ch = *s;
4083             Py_UNICODE x = 0xfffe; /* illegal value */
4084 
4085             if (ch < maplen)
4086                 x = mapstring[ch];
4087 
4088             if (x == 0xfffe) {
4089                 /* undefined mapping */
4090                 outpos = p-PyUnicode_AS_UNICODE(v);
4091                 startinpos = s-starts;
4092                 endinpos = startinpos+1;
4093                 if (unicode_decode_call_errorhandler(
4094                         errors, &errorHandler,
4095                         "charmap", "character maps to <undefined>",
4096                         starts, size, &startinpos, &endinpos, &exc, &s,
4097                         &v, &outpos, &p)) {
4098                     goto onError;
4099                 }
4100                 continue;
4101             }
4102             *p++ = x;
4103             ++s;
4104         }
4105     }
4106     else {
4107         while (s < e) {
4108             unsigned char ch = *s;
4109             PyObject *w, *x;
4110 
4111             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4112             w = PyInt_FromLong((long)ch);
4113             if (w == NULL)
4114                 goto onError;
4115             x = PyObject_GetItem(mapping, w);
4116             Py_DECREF(w);
4117             if (x == NULL) {
4118                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4119                     /* No mapping found means: mapping is undefined. */
4120                     PyErr_Clear();
4121                     x = Py_None;
4122                     Py_INCREF(x);
4123                 } else
4124                     goto onError;
4125             }
4126 
4127             /* Apply mapping */
4128             if (PyInt_Check(x)) {
4129                 long value = PyInt_AS_LONG(x);
4130                 if (value < 0 || value > 65535) {
4131                     PyErr_SetString(PyExc_TypeError,
4132                                     "character mapping must be in range(65536)");
4133                     Py_DECREF(x);
4134                     goto onError;
4135                 }
4136                 *p++ = (Py_UNICODE)value;
4137             }
4138             else if (x == Py_None) {
4139                 /* undefined mapping */
4140                 outpos = p-PyUnicode_AS_UNICODE(v);
4141                 startinpos = s-starts;
4142                 endinpos = startinpos+1;
4143                 if (unicode_decode_call_errorhandler(
4144                         errors, &errorHandler,
4145                         "charmap", "character maps to <undefined>",
4146                         starts, size, &startinpos, &endinpos, &exc, &s,
4147                         &v, &outpos, &p)) {
4148                     Py_DECREF(x);
4149                     goto onError;
4150                 }
4151                 Py_DECREF(x);
4152                 continue;
4153             }
4154             else if (PyUnicode_Check(x)) {
4155                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4156 
4157                 if (targetsize == 1)
4158                     /* 1-1 mapping */
4159                     *p++ = *PyUnicode_AS_UNICODE(x);
4160 
4161                 else if (targetsize > 1) {
4162                     /* 1-n mapping */
4163                     if (targetsize > extrachars) {
4164                         /* resize first */
4165                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4166                         Py_ssize_t needed = (targetsize - extrachars) + \
4167                             (targetsize << 2);
4168                         extrachars += needed;
4169                         /* XXX overflow detection missing */
4170                         if (_PyUnicode_Resize(&v,
4171                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4172                             Py_DECREF(x);
4173                             goto onError;
4174                         }
4175                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4176                     }
4177                     Py_UNICODE_COPY(p,
4178                                     PyUnicode_AS_UNICODE(x),
4179                                     targetsize);
4180                     p += targetsize;
4181                     extrachars -= targetsize;
4182                 }
4183                 /* 1-0 mapping: skip the character */
4184             }
4185             else {
4186                 /* wrong return value */
4187                 PyErr_SetString(PyExc_TypeError,
4188                                 "character mapping must return integer, None or unicode");
4189                 Py_DECREF(x);
4190                 goto onError;
4191             }
4192             Py_DECREF(x);
4193             ++s;
4194         }
4195     }
4196     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4197         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4198             goto onError;
4199     Py_XDECREF(errorHandler);
4200     Py_XDECREF(exc);
4201     return (PyObject *)v;
4202 
4203   onError:
4204     Py_XDECREF(errorHandler);
4205     Py_XDECREF(exc);
4206     Py_XDECREF(v);
4207     return NULL;
4208 }
4209 
4210 /* Charmap encoding: the lookup table */
4211 
4212 struct encoding_map{
4213     PyObject_HEAD
4214     unsigned char level1[32];
4215     int count2, count3;
4216     unsigned char level23[1];
4217 };
4218 
4219 static PyObject*
4220 encoding_map_size(PyObject *obj, PyObject* args)
4221 {
4222     struct encoding_map *map = (struct encoding_map*)obj;
4223     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4224                           128*map->count3);
4225 }
4226 
4227 static PyMethodDef encoding_map_methods[] = {
4228     {"size", encoding_map_size, METH_NOARGS,
4229      PyDoc_STR("Return the size (in bytes) of this object") },
4230     { 0 }
4231 };
4232 
4233 static void
4234 encoding_map_dealloc(PyObject* o)
4235 {
4236     PyObject_FREE(o);
4237 }
4238 
4239 static PyTypeObject EncodingMapType = {
4240     PyVarObject_HEAD_INIT(NULL, 0)
4241     "EncodingMap",          /*tp_name*/
4242     sizeof(struct encoding_map),   /*tp_basicsize*/
4243     0,                      /*tp_itemsize*/
4244     /* methods */
4245     encoding_map_dealloc,   /*tp_dealloc*/
4246     0,                      /*tp_print*/
4247     0,                      /*tp_getattr*/
4248     0,                      /*tp_setattr*/
4249     0,                      /*tp_compare*/
4250     0,                      /*tp_repr*/
4251     0,                      /*tp_as_number*/
4252     0,                      /*tp_as_sequence*/
4253     0,                      /*tp_as_mapping*/
4254     0,                      /*tp_hash*/
4255     0,                      /*tp_call*/
4256     0,                      /*tp_str*/
4257     0,                      /*tp_getattro*/
4258     0,                      /*tp_setattro*/
4259     0,                      /*tp_as_buffer*/
4260     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4261     0,                      /*tp_doc*/
4262     0,                      /*tp_traverse*/
4263     0,                      /*tp_clear*/
4264     0,                      /*tp_richcompare*/
4265     0,                      /*tp_weaklistoffset*/
4266     0,                      /*tp_iter*/
4267     0,                      /*tp_iternext*/
4268     encoding_map_methods,   /*tp_methods*/
4269     0,                      /*tp_members*/
4270     0,                      /*tp_getset*/
4271     0,                      /*tp_base*/
4272     0,                      /*tp_dict*/
4273     0,                      /*tp_descr_get*/
4274     0,                      /*tp_descr_set*/
4275     0,                      /*tp_dictoffset*/
4276     0,                      /*tp_init*/
4277     0,                      /*tp_alloc*/
4278     0,                      /*tp_new*/
4279     0,                      /*tp_free*/
4280     0,                      /*tp_is_gc*/
4281 };
4282 
4283 PyObject*
4284 PyUnicode_BuildEncodingMap(PyObject* string)
4285 {
4286     Py_UNICODE *decode;
4287     PyObject *result;
4288     struct encoding_map *mresult;
4289     int i;
4290     int need_dict = 0;
4291     unsigned char level1[32];
4292     unsigned char level2[512];
4293     unsigned char *mlevel1, *mlevel2, *mlevel3;
4294     int count2 = 0, count3 = 0;
4295 
4296     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4297         PyErr_BadArgument();
4298         return NULL;
4299     }
4300     decode = PyUnicode_AS_UNICODE(string);
4301     memset(level1, 0xFF, sizeof level1);
4302     memset(level2, 0xFF, sizeof level2);
4303 
4304     /* If there isn't a one-to-one mapping of NULL to \0,
4305        or if there are non-BMP characters, we need to use
4306        a mapping dictionary. */
4307     if (decode[0] != 0)
4308         need_dict = 1;
4309     for (i = 1; i < 256; i++) {
4310         int l1, l2;
4311         if (decode[i] == 0
4312 #ifdef Py_UNICODE_WIDE
4313             || decode[i] > 0xFFFF
4314 #endif
4315             ) {
4316             need_dict = 1;
4317             break;
4318         }
4319         if (decode[i] == 0xFFFE)
4320             /* unmapped character */
4321             continue;
4322         l1 = decode[i] >> 11;
4323         l2 = decode[i] >> 7;
4324         if (level1[l1] == 0xFF)
4325             level1[l1] = count2++;
4326         if (level2[l2] == 0xFF)
4327             level2[l2] = count3++;
4328     }
4329 
4330     if (count2 >= 0xFF || count3 >= 0xFF)
4331         need_dict = 1;
4332 
4333     if (need_dict) {
4334         PyObject *result = PyDict_New();
4335         PyObject *key, *value;
4336         if (!result)
4337             return NULL;
4338         for (i = 0; i < 256; i++) {
4339             value = NULL;
4340             key = PyInt_FromLong(decode[i]);
4341             value = PyInt_FromLong(i);
4342             if (!key || !value)
4343                 goto failed1;
4344             if (PyDict_SetItem(result, key, value) == -1)
4345                 goto failed1;
4346             Py_DECREF(key);
4347             Py_DECREF(value);
4348         }
4349         return result;
4350       failed1:
4351         Py_XDECREF(key);
4352         Py_XDECREF(value);
4353         Py_DECREF(result);
4354         return NULL;
4355     }
4356 
4357     /* Create a three-level trie */
4358     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4359                              16*count2 + 128*count3 - 1);
4360     if (!result)
4361         return PyErr_NoMemory();
4362     PyObject_Init(result, &EncodingMapType);
4363     mresult = (struct encoding_map*)result;
4364     mresult->count2 = count2;
4365     mresult->count3 = count3;
4366     mlevel1 = mresult->level1;
4367     mlevel2 = mresult->level23;
4368     mlevel3 = mresult->level23 + 16*count2;
4369     memcpy(mlevel1, level1, 32);
4370     memset(mlevel2, 0xFF, 16*count2);
4371     memset(mlevel3, 0, 128*count3);
4372     count3 = 0;
4373     for (i = 1; i < 256; i++) {
4374         int o1, o2, o3, i2, i3;
4375         if (decode[i] == 0xFFFE)
4376             /* unmapped character */
4377             continue;
4378         o1 = decode[i]>>11;
4379         o2 = (decode[i]>>7) & 0xF;
4380         i2 = 16*mlevel1[o1] + o2;
4381         if (mlevel2[i2] == 0xFF)
4382             mlevel2[i2] = count3++;
4383         o3 = decode[i] & 0x7F;
4384         i3 = 128*mlevel2[i2] + o3;
4385         mlevel3[i3] = i;
4386     }
4387     return result;
4388 }
4389 
4390 static int
4391 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4392 {
4393     struct encoding_map *map = (struct encoding_map*)mapping;
4394     int l1 = c>>11;
4395     int l2 = (c>>7) & 0xF;
4396     int l3 = c & 0x7F;
4397     int i;
4398 
4399 #ifdef Py_UNICODE_WIDE
4400     if (c > 0xFFFF) {
4401         return -1;
4402     }
4403 #endif
4404     if (c == 0)
4405         return 0;
4406     /* level 1*/
4407     i = map->level1[l1];
4408     if (i == 0xFF) {
4409         return -1;
4410     }
4411     /* level 2*/
4412     i = map->level23[16*i+l2];
4413     if (i == 0xFF) {
4414         return -1;
4415     }
4416     /* level 3 */
4417     i = map->level23[16*map->count2 + 128*i + l3];
4418     if (i == 0) {
4419         return -1;
4420     }
4421     return i;
4422 }
4423 
4424 /* Look up the character c in mapping, using c (as a Python int) as the
4425    key for PyObject_GetItem().  The result is handed to the caller, who
4426    must decref it, and is one of:
          - an int in range(256) or a str: the replacement for c;
          - Py_None: c is undefined in the mapping, either because the
            mapping returned None or because the lookup raised a
            LookupError (which is cleared here);
          - NULL: an error occurred (creating the key failed, the lookup
            raised something other than LookupError, or the mapped value is
            an int outside range(256) or an object of an unsupported type;
            for the last two a TypeError is set here). */
4427 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4428 {
4429     PyObject *w = PyInt_FromLong((long)c);
4430     PyObject *x;
4431 
4432     if (w == NULL)
4433         return NULL;
4434     x = PyObject_GetItem(mapping, w);
4435     Py_DECREF(w);
4436     if (x == NULL) {
4437         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4438             /* No mapping found means: mapping is undefined. */
4439             PyErr_Clear();
                 /* report "undefined" the same way as an explicit None entry */
4440             x = Py_None;
4441             Py_INCREF(x);
4442             return x;
4443         } else
4444             return NULL;
4445     }
4446     else if (x == Py_None)
4447         return x;
4448     else if (PyInt_Check(x)) {
                 /* an int result is later stored as a single output byte */
4449         long value = PyInt_AS_LONG(x);
4450         if (value < 0 || value > 255) {
4451             PyErr_SetString(PyExc_TypeError,
4452                             "character mapping must be in range(256)");
4453             Py_DECREF(x);
4454             return NULL;
4455         }
4456         return x;
4457     }
4458     else if (PyString_Check(x))
4459         return x;
4460     else {
4461         /* wrong return value */
4462         PyErr_SetString(PyExc_TypeError,
4463                         "character mapping must return integer, None or str");
4464         Py_DECREF(x);
4465         return NULL;
4466     }
4467 }
4468 
     /* Resize the string *outobj so that it holds at least requiredsize
        bytes.  To keep the number of reallocations low the new size is never
        less than twice the current size.  outpos is not used here.
        Returns 1 on success and 0 if _PyString_Resize() fails; note that
        this is the opposite of the 0/-1 convention used by the neighbouring
        helpers. */
4469 static int
4470 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4471 {
4472     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4473     /* exponentially overallocate to minimize reallocations */
4474     if (requiredsize < 2*outsize)
4475         requiredsize = 2*outsize;
4476     if (_PyString_Resize(outobj, requiredsize)) {
4477         return 0;
4478     }
4479     return 1;
4480 }
4481 
     /* Outcome of charmapencode_output():
        enc_SUCCESS   - the character was encoded into the output string,
        enc_FAILED    - the mapping is undefined for the character; nothing
                        was written,
        enc_EXCEPTION - the lookup or the resize of the output string
                        failed. */
4482 typedef enum charmapencode_result {
4483     enc_SUCCESS, enc_FAILED, enc_EXCEPTION
4484 }charmapencode_result;
4485 /* Look up the character c in mapping, store its encoding in the string
4486    *outobj at offset *outpos and advance *outpos by the number of bytes
4487    stored.  If *outobj is too small it is enlarged with
4488    charmapencode_resize(); the buffer pointer is re-read from *outobj
4489    afterwards.
4490    Returns enc_SUCCESS if c was encoded, enc_FAILED if the mapping is
        undefined for c (nothing is written) and enc_EXCEPTION if the lookup
        or the resize failed.  A mapping whose type is EncodingMapType is
        queried with encoding_map_lookup(); any other mapping goes through
        charmapencode_lookup(). */
4491 static
4492 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4493                                           PyObject **outobj, Py_ssize_t *outpos)
4494 {
4495     PyObject *rep;
4496     char *outstart;
4497     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4498 
         /* fast path: the table lookup yields the output byte directly, no
            Python objects are created */
4499     if (Py_TYPE(mapping) == &EncodingMapType) {
4500         int res = encoding_map_lookup(c, mapping);
4501         Py_ssize_t requiredsize = *outpos+1;
4502         if (res == -1)
4503             return enc_FAILED;
4504         if (outsize<requiredsize)
4505             if (!charmapencode_resize(outobj, outpos, requiredsize))
4506                 return enc_EXCEPTION;
4507         outstart = PyString_AS_STRING(*outobj);
4508         outstart[(*outpos)++] = (char)res;
4509         return enc_SUCCESS;
4510     }
4511 
         /* generic path: rep is a reference owned by this function and is
            released on every exit below */
4512     rep = charmapencode_lookup(c, mapping);
4513     if (rep==NULL)
4514         return enc_EXCEPTION;
4515     else if (rep==Py_None) {
4516         Py_DECREF(rep);
4517         return enc_FAILED;
4518     } else {
4519         if (PyInt_Check(rep)) {
                 /* an int stands for exactly one output byte */
4520             Py_ssize_t requiredsize = *outpos+1;
4521             if (outsize<requiredsize)
4522                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4523                     Py_DECREF(rep);
4524                     return enc_EXCEPTION;
4525                 }
4526             outstart = PyString_AS_STRING(*outobj);
4527             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4528         }
4529         else {
                 /* otherwise rep is a str (charmapencode_lookup() rejects
                    every other type); all of its bytes are copied */
4530             const char *repchars = PyString_AS_STRING(rep);
4531             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4532             Py_ssize_t requiredsize = *outpos+repsize;
4533             if (outsize<requiredsize)
4534                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4535                     Py_DECREF(rep);
4536                     return enc_EXCEPTION;
4537                 }
4538             outstart = PyString_AS_STRING(*outobj);
4539             memcpy(outstart + *outpos, repchars, repsize);
4540             *outpos += repsize;
4541         }
4542     }
4543     Py_DECREF(rep);
4544     return enc_SUCCESS;
4545 }
4546 
4547 /* handle an error in PyUnicode_EncodeCharmap
4548    Return 0 on success, -1 on error */
4549 static
4550 int charmap_encoding_error(
4551     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4552     PyObject **exceptionObject,
4553     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4554     PyObject **res, Py_ssize_t *respos)
4555 {
4556     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4557     Py_ssize_t repsize;
4558     Py_ssize_t newpos;
4559     Py_UNICODE *uni2;
4560     /* startpos for collecting unencodable chars */
4561     Py_ssize_t collstartpos = *inpos;
4562     Py_ssize_t collendpos = *inpos+1;
4563     Py_ssize_t collpos;
4564     char *encoding = "charmap";
4565     char *reason = "character maps to <undefined>";
4566     charmapencode_result x;
4567 
4568     /* find all unencodable characters */
4569     while (collendpos < size) {
4570         PyObject *rep;
4571         if (Py_TYPE(mapping) == &EncodingMapType) {
4572             int res = encoding_map_lookup(p[collendpos], mapping);
4573             if (res != -1)
4574                 break;
4575             ++collendpos;
4576             continue;
4577         }
4578 
4579         rep = charmapencode_lookup(p[collendpos], mapping);
4580         if (rep==NULL)
4581             return -1;
4582         else if (rep!=Py_None) {
4583             Py_DECREF(rep);
4584             break;
4585         }
4586         Py_DECREF(rep);
4587         ++collendpos;
4588     }
4589     /* cache callback name lookup
4590      * (if not done yet, i.e. it's the first error) */
4591     if (*known_errorHandler==-1) {
4592         if ((errors==NULL) || (!strcmp(errors, "strict")))
4593             *known_errorHandler = 1;
4594         else if (!strcmp(errors, "replace"))
4595             *known_errorHandler = 2;
4596         else if (!strcmp(errors, "ignore"))
4597             *known_errorHandler = 3;
4598         else if (!strcmp(errors, "xmlcharrefreplace"))
4599             *known_errorHandler = 4;
4600         else
4601             *known_errorHandler = 0;
4602     }
4603     switch (*known_errorHandler) {
4604     case 1: /* strict */
4605         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4606         return -1;
4607     case 2: /* replace */
4608         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4609             x = charmapencode_output('?', mapping, res, respos);
4610             if (x==enc_EXCEPTION) {
4611                 return -1;
4612             }
4613             else if (x==enc_FAILED) {
4614                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4615                 return -1;
4616             }
4617         }
4618         /* fall through */
4619     case 3: /* ignore */
4620         *inpos = collendpos;
4621         break;
4622     case 4: /* xmlcharrefreplace */
4623         /* generate replacement (temporarily (mis)uses p) */
4624         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4625             char buffer[2+29+1+1];
4626             char *cp;
4627             sprintf(buffer, "&#%d;", (int)p[collpos]);
4628             for (cp = buffer; *cp; ++cp) {
4629                 x = charmapencode_output(*cp, mapping, res, respos);
4630                 if (x==enc_EXCEPTION)
4631                     return -1;
4632                 else if (x==enc_FAILED) {
4633                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4634                     return -1;
4635                 }
4636             }
4637         }
4638         *inpos = collendpos;
4639         break;
4640     default:
4641         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4642                                                       encoding, reason, p, size, exceptionObject,
4643                                                       collstartpos, collendpos, &newpos);
4644         if (repunicode == NULL)
4645             return -1;
4646         /* generate replacement  */
4647         repsize = PyUnicode_GET_SIZE(repunicode);
4648         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4649             x = charmapencode_output(*uni2, mapping, res, respos);
4650             if (x==enc_EXCEPTION) {
4651                 return -1;
4652             }
4653             else if (x==enc_FAILED) {
4654                 Py_DECREF(repunicode);
4655                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4656                 return -1;
4657             }
4658         }
4659         *inpos = newpos;
4660         Py_DECREF(repunicode);
4661     }
4662     return 0;
4663 }
4664 
     /* Encode the size characters starting at p with the given mapping and
        return the result as a new str object, or NULL on error.
        A NULL mapping selects Latin-1 via PyUnicode_EncodeLatin1().
        Characters for which the mapping is undefined are passed to
        charmap_encoding_error(), which applies the error handler named by
        errors. */
4665 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4666                                   Py_ssize_t size,
4667                                   PyObject *mapping,
4668                                   const char *errors)
4669 {
4670     /* output object */
4671     PyObject *res = NULL;
4672     /* current input position */
4673     Py_ssize_t inpos = 0;
4674     /* current output position */
4675     Py_ssize_t respos = 0;
4676     PyObject *errorHandler = NULL;
4677     PyObject *exc = NULL;
4678     /* the following variable is used for caching string comparisons
4679      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4680      * 3=ignore, 4=xmlcharrefreplace */
4681     int known_errorHandler = -1;
4682 
4683     /* Default to Latin-1 */
4684     if (mapping == NULL)
4685         return PyUnicode_EncodeLatin1(p, size, errors);
4686 
4687     /* allocate enough for a simple encoding without
4688        replacements, if we need more, we'll resize */
4689     res = PyString_FromStringAndSize(NULL, size);
4690     if (res == NULL)
4691         goto onError;
4692     if (size == 0)
4693         return res;
4694 
4695     while (inpos<size) {
4696         /* try to encode it */
4697         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4698         if (x==enc_EXCEPTION) /* error */
4699             goto onError;
4700         if (x==enc_FAILED) { /* unencodable character */
                 /* the error handling code moves inpos forward itself */
4701             if (charmap_encoding_error(p, size, &inpos, mapping,
4702                                        &exc,
4703                                        &known_errorHandler, &errorHandler, errors,
4704                                        &res, &respos)) {
4705                 goto onError;
4706             }
4707         }
4708         else
4709             /* done with this character => adjust input position */
4710             ++inpos;
4711     }
4712 
4713     /* Resize if we allocated too much */
4714     if (respos<PyString_GET_SIZE(res)) {
4715         if (_PyString_Resize(&res, respos))
4716             goto onError;
4717     }
4718     Py_XDECREF(exc);
4719     Py_XDECREF(errorHandler);
4720     return res;
4721 
4722   onError:
4723     Py_XDECREF(res);
4724     Py_XDECREF(exc);
4725     Py_XDECREF(errorHandler);
4726     return NULL;
4727 }
4728 
     /* Encode the unicode object unicode with mapping by calling
        PyUnicode_EncodeCharmap() with errors set to NULL.  Calls
        PyErr_BadArgument() and returns NULL if unicode is not a unicode
        object or if mapping is NULL. */
4729 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4730                                     PyObject *mapping)
4731 {
4732     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4733         PyErr_BadArgument();
4734         return NULL;
4735     }
4736     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4737                                    PyUnicode_GET_SIZE(unicode),
4738                                    mapping,
4739                                    NULL);
4740 }
4741 
4742 /* Create or adjust a UnicodeTranslateError.
        If *exceptionObject is NULL it is set to the result of
        PyUnicodeTranslateError_Create() for the given string, range and
        reason.  Otherwise start, end and reason of the existing object are
        updated; if one of the setters fails the object is released and
        *exceptionObject is reset to NULL.  Nothing is returned: callers
        check *exceptionObject for NULL to detect failure. */
4743 static void make_translate_exception(PyObject **exceptionObject,
4744                                      const Py_UNICODE *unicode, Py_ssize_t size,
4745                                      Py_ssize_t startpos, Py_ssize_t endpos,
4746                                      const char *reason)
4747 {
4748     if (*exceptionObject == NULL) {
4749         *exceptionObject = PyUnicodeTranslateError_Create(
4750             unicode, size, startpos, endpos, reason);
4751     }
4752     else {
4753         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4754             goto onError;
4755         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4756             goto onError;
4757         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4758             goto onError;
4759         return;
4760       onError:
             /* drop the half-updated exception object */
4761         Py_DECREF(*exceptionObject);
4762         *exceptionObject = NULL;
4763     }
4764 }
4765 
4766 /* Raises a UnicodeTranslateError: *exceptionObject is created or updated
        with make_translate_exception() and, if that left an object in
        *exceptionObject, handed to PyCodec_StrictErrors(). */
4767 static void raise_translate_exception(PyObject **exceptionObject,
4768                                       const Py_UNICODE *unicode, Py_ssize_t size,
4769                                       Py_ssize_t startpos, Py_ssize_t endpos,
4770                                       const char *reason)
4771 {
4772     make_translate_exception(exceptionObject,
4773                              unicode, size, startpos, endpos, reason);
4774     if (*exceptionObject != NULL)
4775         PyCodec_StrictErrors(*exceptionObject);
4776 }
4777 
4778 /* Error handling callback helper for translating:
4779    look up the error handler named by errors (only if *errorHandler is
4780    still NULL, so the lookup result is reused by later calls), create or
4781    update *exceptionObject and call the handler with it.
        The handler has to return a (unicode, int) tuple.  The int is the
        position at which to continue; a negative value counts from the end
        of the input (size is added to it).  The position is stored in
        *newpos and must lie in the range 0..size, otherwise an IndexError
        is set.
        Returns a new reference to the replacement string, which has to be
        released by the caller, or NULL on error. */
4782 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4783                                                      PyObject **errorHandler,
4784                                                      const char *reason,
4785                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4786                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4787                                                      Py_ssize_t *newpos)
4788 {
         /* format for PyArg_ParseTuple(); the text after the ';' is used as
            the error message */
4789     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4790 
4791     Py_ssize_t i_newpos;
4792     PyObject *restuple;
4793     PyObject *resunicode;
4794 
4795     if (*errorHandler == NULL) {
4796         *errorHandler = PyCodec_LookupError(errors);
4797         if (*errorHandler == NULL)
4798             return NULL;
4799     }
4800 
4801     make_translate_exception(exceptionObject,
4802                              unicode, size, startpos, endpos, reason);
4803     if (*exceptionObject == NULL)
4804         return NULL;
4805 
4806     restuple = PyObject_CallFunctionObjArgs(
4807         *errorHandler, *exceptionObject, NULL);
4808     if (restuple == NULL)
4809         return NULL;
4810     if (!PyTuple_Check(restuple)) {
             /* &argparse[4] skips the "O!n;" prefix and leaves only the
                message text */
4811         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4812         Py_DECREF(restuple);
4813         return NULL;
4814     }
4815     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4816                           &resunicode, &i_newpos)) {
4817         Py_DECREF(restuple);
4818         return NULL;
4819     }
         /* a negative position is relative to the end of the input */
4820     if (i_newpos<0)
4821         *newpos = size+i_newpos;
4822     else
4823         *newpos = i_newpos;
4824     if (*newpos<0 || *newpos>size) {
4825         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4826         Py_DECREF(restuple);
4827         return NULL;
4828     }
         /* take our own reference to the replacement before the tuple that
            holds it is released */
4829     Py_INCREF(resunicode);
4830     Py_DECREF(restuple);
4831     return resunicode;
4832 }
4833 
4834 /* Look up the character c in mapping (PyObject_GetItem() with c as a
4835    Python int) and store the result in *result, which must be decrefed
4836    by the caller.  On success *result is one of:
          - NULL: the lookup raised a LookupError (cleared here), meaning
            that c maps to itself;
          - Py_None: the mapping returned None;
          - an int between 0 and PyUnicode_GetMax();
          - a unicode object.
        On error *result is left untouched; a TypeError is set here for an
        int outside that range and for any other result type.
        Return 0 on success, -1 on error */
4837 static
4838 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4839 {
4840     PyObject *w = PyInt_FromLong((long)c);
4841     PyObject *x;
4842 
4843     if (w == NULL)
4844         return -1;
4845     x = PyObject_GetItem(mapping, w);
4846     Py_DECREF(w);
4847     if (x == NULL) {
4848         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4849             /* No mapping found means: use 1:1 mapping. */
4850             PyErr_Clear();
4851             *result = NULL;
4852             return 0;
4853         } else
4854             return -1;
4855     }
4856     else if (x == Py_None) {
4857         *result = x;
4858         return 0;
4859     }
4860     else if (PyInt_Check(x)) {
4861         long value = PyInt_AS_LONG(x);
4862         long max = PyUnicode_GetMax();
4863         if (value < 0 || value > max) {
4864             PyErr_Format(PyExc_TypeError,
4865                          "character mapping must be in range(0x%lx)", max+1);
4866             Py_DECREF(x);
4867             return -1;
4868         }
4869         *result = x;
4870         return 0;
4871     }
4872     else if (PyUnicode_Check(x)) {
4873         *result = x;
4874         return 0;
4875     }
4876     else {
4877         /* wrong return value */
4878         PyErr_SetString(PyExc_TypeError,
4879                         "character mapping must return integer, None or unicode");
4880         Py_DECREF(x);
4881         return -1;
4882     }
4883 }
4884 /* Ensure that *outobj is at least requiredsize characters long.
4885    If it is shorter it is resized to requiredsize, or to twice its old
4886    size if that is larger, and the write pointer *outp is recomputed so
        that it keeps its offset in the resized object.  Nothing is changed
        if *outobj is already long enough.
        Return 0 on success, -1 on error */
4887 static
4888 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4889                                Py_ssize_t requiredsize)
4890 {
4891     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4892     if (requiredsize > oldsize) {
4893         /* remember old output position */
4894         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4895         /* exponentially overallocate to minimize reallocations */
4896         if (requiredsize < 2 * oldsize)
4897             requiredsize = 2 * oldsize;
4898         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4899             return -1;
             /* the buffer pointer is fetched again after the resize */
4900         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4901     }
4902     return 0;
4903 }
4904 /* Look up the character *curinp in mapping, write its translation at
4905    *outp and advance *outp.  The lookup result is stored in *res: NULL if
4906    there is no mapping (the character is copied unchanged), Py_None if
4907    the mapping is undefined (in which case no character was written), or
4908    the int or unicode object whose value was written.
4909    The caller must decref *res, also when -1 is returned.
        startinp and insize describe the whole input; they are used to
        compute how much output space is still needed when a replacement
        longer than one character forces *outobj to grow.
        Return 0 on success, -1 on error. */
4910 static
4911 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4912                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4913                             PyObject **res)
4914 {
4915     if (charmaptranslate_lookup(*curinp, mapping, res))
4916         return -1;
4917     if (*res==NULL) {
4918         /* not found => default to 1:1 mapping */
4919         *(*outp)++ = *curinp;
4920     }
4921     else if (*res==Py_None)
             /* undefined mapping: nothing is written, the caller handles it */
4922         ;
4923     else if (PyInt_Check(*res)) {
4924         /* no overflow check, because we know that the space is enough */
4925         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4926     }
4927     else if (PyUnicode_Check(*res)) {
4928         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4929         if (repsize==1) {
4930             /* no overflow check, because we know that the space is enough */
4931             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4932         }
4933         else if (repsize!=0) {
4934             /* more than one character */
                 /* required size = output written so far + input not yet
                    consumed (including the current character) + the extra
                    repsize-1 characters of this replacement */
4935             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4936                 (insize - (curinp-startinp)) +
4937                 repsize - 1;
4938             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4939                 return -1;
4940             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4941             *outp += repsize;
4942         }
4943     }
4944     else
4945         return -1;
4946     return 0;
4947 }
4948 
     /* Translate the size characters starting at p through mapping and
        return the result as a new unicode object, or NULL on error.
        A NULL mapping is rejected with PyErr_BadArgument().
        Every character is looked up with charmaptranslate_output().  When
        the mapping yields None the run of such characters is collected and
        handled according to errors: "strict" (also used when errors is
        NULL) raises a UnicodeTranslateError, "replace" writes one '?' per
        character, "ignore" drops the run, "xmlcharrefreplace" writes
        "&#N;" references, and any other name is called through
        unicode_translate_call_errorhandler(). */
4949 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4950                                      Py_ssize_t size,
4951                                      PyObject *mapping,
4952                                      const char *errors)
4953 {
4954     /* output object */
4955     PyObject *res = NULL;
4956     /* pointers to the beginning and end+1 of input */
4957     const Py_UNICODE *startp = p;
4958     const Py_UNICODE *endp = p + size;
4959     /* pointer into the output */
4960     Py_UNICODE *str;
4961     /* current output position */
4962     Py_ssize_t respos = 0;
4963     char *reason = "character maps to <undefined>";
4964     PyObject *errorHandler = NULL;
4965     PyObject *exc = NULL;
4966     /* the following variable is used for caching string comparisons
4967      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4968      * 3=ignore, 4=xmlcharrefreplace */
4969     int known_errorHandler = -1;
4970 
4971     if (mapping == NULL) {
4972         PyErr_BadArgument();
4973         return NULL;
4974     }
4975 
4976     /* allocate enough for a simple 1:1 translation without
4977        replacements, if we need more, we'll resize */
4978     res = PyUnicode_FromUnicode(NULL, size);
4979     if (res == NULL)
4980         goto onError;
4981     if (size == 0)
4982         return res;
4983     str = PyUnicode_AS_UNICODE(res);
4984 
4985     while (p<endp) {
4986         /* try to translate it */
4987         PyObject *x = NULL;
4988         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4989             Py_XDECREF(x);
4990             goto onError;
4991         }
             /* x is released here; below it is only compared against
                Py_None and never dereferenced */
4992         Py_XDECREF(x);
4993         if (x!=Py_None) /* it worked => adjust input pointer */
4994             ++p;
4995         else { /* untranslatable character */
4996             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4997             Py_ssize_t repsize;
4998             Py_ssize_t newpos;
4999             Py_UNICODE *uni2;
5000             /* startpos for collecting untranslatable chars */
5001             const Py_UNICODE *collstart = p;
5002             const Py_UNICODE *collend = p+1;
5003             const Py_UNICODE *coll;
5004 
5005             /* find all untranslatable characters */
5006             while (collend < endp) {
5007                 if (charmaptranslate_lookup(*collend, mapping, &x))
5008                     goto onError;
5009                 Py_XDECREF(x);
5010                 if (x!=Py_None)
5011                     break;
5012                 ++collend;
5013             }
5014             /* cache callback name lookup
5015              * (if not done yet, i.e. it's the first error) */
5016             if (known_errorHandler==-1) {
5017                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5018                     known_errorHandler = 1;
5019                 else if (!strcmp(errors, "replace"))
5020                     known_errorHandler = 2;
5021                 else if (!strcmp(errors, "ignore"))
5022                     known_errorHandler = 3;
5023                 else if (!strcmp(errors, "xmlcharrefreplace"))
5024                     known_errorHandler = 4;
5025                 else
5026                     known_errorHandler = 0;
5027             }
5028             switch (known_errorHandler) {
5029             case 1: /* strict */
5030                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5031                 goto onError;
5032             case 2: /* replace */
5033                 /* No need to check for space, this is a 1:1 replacement */
5034                 for (coll = collstart; coll<collend; ++coll)
5035                     *str++ = '?';
5036                 /* fall through */
5037             case 3: /* ignore */
5038                 p = collend;
5039                 break;
5040             case 4: /* xmlcharrefreplace */
5041                 /* generate replacement (temporarily (mis)uses p) */
5042                 for (p = collstart; p < collend; ++p) {
5043                     char buffer[2+29+1+1];
5044                     char *cp;
5045                     sprintf(buffer, "&#%d;", (int)*p);
                         /* make room for this reference plus the input that
                            follows the collected run */
5046                     if (charmaptranslate_makespace(&res, &str,
5047                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5048                         goto onError;
5049                     for (cp = buffer; *cp; ++cp)
5050                         *str++ = *cp;
5051                 }
5052                 p = collend;
5053                 break;
5054             default:
5055                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5056                                                                  reason, startp, size, &exc,
5057                                                                  collstart-startp, collend-startp, &newpos);
5058                 if (repunicode == NULL)
5059                     goto onError;
5060                 /* generate replacement  */
5061                 repsize = PyUnicode_GET_SIZE(repunicode);
5062                 if (charmaptranslate_makespace(&res, &str,
5063                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5064                     Py_DECREF(repunicode);
5065                     goto onError;
5066                 }
                     /* the replacement is copied as is, without being
                        looked up in the mapping */
5067                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5068                     *str++ = *uni2;
                     /* continue at the position chosen by the handler */
5069                 p = startp + newpos;
5070                 Py_DECREF(repunicode);
5071             }
5072         }
5073     }
5074     /* Resize if we allocated too much */
5075     respos = str-PyUnicode_AS_UNICODE(res);
5076     if (respos<PyUnicode_GET_SIZE(res)) {
5077         if (PyUnicode_Resize(&res, respos) < 0)
5078             goto onError;
5079     }
5080     Py_XDECREF(exc);
5081     Py_XDECREF(errorHandler);
5082     return res;
5083 
5084   onError:
5085     Py_XDECREF(res);
5086     Py_XDECREF(exc);
5087     Py_XDECREF(errorHandler);
5088     return NULL;
5089 }
5090 
     /* Translate str with mapping.  str is first converted with
        PyUnicode_FromObject(); the converted object is released again once
        PyUnicode_TranslateCharmap() has returned.  Returns the result of
        PyUnicode_TranslateCharmap(), or NULL if the conversion failed. */
5091 PyObject *PyUnicode_Translate(PyObject *str,
5092                               PyObject *mapping,
5093                               const char *errors)
5094 {
5095     PyObject *result;
5096 
         /* from here on str refers to the converted object, not to the
            argument that was passed in */
5097     str = PyUnicode_FromObject(str);
5098     if (str == NULL)
5099         goto onError;
5100     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5101                                         PyUnicode_GET_SIZE(str),
5102                                         mapping,
5103                                         errors);
5104     Py_DECREF(str);
5105     return result;
5106 
5107   onError:
5108     Py_XDECREF(str);
5109     return NULL;
5110 }
5111 
5112 /* --- Decimal Encoder ---------------------------------------------------- */
5113 
     /* Convert the length characters starting at s to a 0-terminated byte
        string in output.  Characters for which Py_UNICODE_ISSPACE() is true
        become ' ', characters with a decimal value
        (Py_UNICODE_TODECIMAL() >= 0) become the digits '0'..'9', and all
        other characters in the range 1..255 are copied as single bytes.
        Everything else is collected into runs and handled by the error
        handler named by errors (NULL means "strict").
        Returns 0 on success and -1 on error; a NULL output is rejected
        with PyErr_BadArgument().
        NOTE(review): output is written without any length check and the
        function takes no buffer size.  The "xmlcharrefreplace" handler and
        custom handlers can write more bytes than the number of characters
        they replace, so the caller has to provide enough room - confirm
        against the callers. */
5114 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5115                             Py_ssize_t length,
5116                             char *output,
5117                             const char *errors)
5118 {
5119     Py_UNICODE *p, *end;
5120     PyObject *errorHandler = NULL;
5121     PyObject *exc = NULL;
5122     const char *encoding = "decimal";
5123     const char *reason = "invalid decimal Unicode string";
5124     /* the following variable is used for caching string comparisons
5125      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5126     int known_errorHandler = -1;
5127 
5128     if (output == NULL) {
5129         PyErr_BadArgument();
5130         return -1;
5131     }
5132 
5133     p = s;
5134     end = s + length;
5135     while (p < end) {
5136         register Py_UNICODE ch = *p;
5137         int decimal;
5138         PyObject *repunicode;
5139         Py_ssize_t repsize;
5140         Py_ssize_t newpos;
5141         Py_UNICODE *uni2;
5142         Py_UNICODE *collstart;
5143         Py_UNICODE *collend;
5144 
5145         if (Py_UNICODE_ISSPACE(ch)) {
5146             *output++ = ' ';
5147             ++p;
5148             continue;
5149         }
5150         decimal = Py_UNICODE_TODECIMAL(ch);
5151         if (decimal >= 0) {
5152             *output++ = '0' + decimal;
5153             ++p;
5154             continue;
5155         }
5156         if (0 < ch && ch < 256) {
5157             *output++ = (char)ch;
5158             ++p;
5159             continue;
5160         }
5161         /* All other characters are considered unencodable */
5162         collstart = p;
             /* extend the run up to the next character that one of the
                three cases above would accept */
5163         for (collend = p+1; collend < end; collend++) {
5164             if ((0 < *collend && *collend < 256) ||
5165                 Py_UNICODE_ISSPACE(*collend) ||
5166                 0 <= Py_UNICODE_TODECIMAL(*collend))
5167                 break;
5168         }
5169         /* cache callback name lookup
5170          * (if not done yet, i.e. it's the first error) */
5171         if (known_errorHandler==-1) {
5172             if ((errors==NULL) || (!strcmp(errors, "strict")))
5173                 known_errorHandler = 1;
5174             else if (!strcmp(errors, "replace"))
5175                 known_errorHandler = 2;
5176             else if (!strcmp(errors, "ignore"))
5177                 known_errorHandler = 3;
5178             else if (!strcmp(errors, "xmlcharrefreplace"))
5179                 known_errorHandler = 4;
5180             else
5181                 known_errorHandler = 0;
5182         }
5183         switch (known_errorHandler) {
5184         case 1: /* strict */
5185             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5186             goto onError;
5187         case 2: /* replace */
5188             for (p = collstart; p < collend; ++p)
5189                 *output++ = '?';
5190             /* fall through */
5191         case 3: /* ignore */
5192             p = collend;
5193             break;
5194         case 4: /* xmlcharrefreplace */
5195             /* generate replacement (temporarily (mis)uses p) */
5196             for (p = collstart; p < collend; ++p)
5197                 output += sprintf(output, "&#%d;", (int)*p);
5198             p = collend;
5199             break;
5200         default:
5201             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5202                                                           encoding, reason, s, length, &exc,
5203                                                           collstart-s, collend-s, &newpos);
5204             if (repunicode == NULL)
5205                 goto onError;
5206             /* generate replacement  */
5207             repsize = PyUnicode_GET_SIZE(repunicode);
5208             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                     /* this ch shadows the ch declared at the top of the
                        loop; the replacement is converted by the same rules
                        as the input */
5209                 Py_UNICODE ch = *uni2;
5210                 if (Py_UNICODE_ISSPACE(ch))
5211                     *output++ = ' ';
5212                 else {
5213                     decimal = Py_UNICODE_TODECIMAL(ch);
5214                     if (decimal >= 0)
5215                         *output++ = '0' + decimal;
5216                     else if (0 < ch && ch < 256)
5217                         *output++ = (char)ch;
5218                     else {
                             /* the replacement itself cannot be converted */
5219                         Py_DECREF(repunicode);
5220                         raise_encode_exception(&exc, encoding,
5221                                                s, length, collstart-s, collend-s, reason);
5222                         goto onError;
5223                     }
5224                 }
5225             }
                 /* continue at the position chosen by the handler */
5226             p = s + newpos;
5227             Py_DECREF(repunicode);
5228         }
5229     }
5230     /* 0-terminate the output string */
5231     *output++ = '\0';
5232     Py_XDECREF(exc);
5233     Py_XDECREF(errorHandler);
5234     return 0;
5235 
5236   onError:
5237     Py_XDECREF(exc);
5238     Py_XDECREF(errorHandler);
5239     return -1;
5240 }
5241 
5242 /* --- Helpers ------------------------------------------------------------ */
5243 
5244 #include "stringlib/unicodedefs.h"
5245 #include "stringlib/fastsearch.h"
5246 
5247 #include "stringlib/count.h"
5248 #include "stringlib/find.h"
5249 #include "stringlib/partition.h"
5250 #include "stringlib/split.h"
5251 
/* Helper macro to fix up start/end slice values against a sequence of
   length len.

   - end is clamped to len when it exceeds it; a negative end has len
     added to it and is then clamped to 0 if still negative.
   - a negative start has len added to it and is then clamped to 0 if
     still negative.  start is NOT clamped to len, so after expansion
     (end - start) may be negative.

   The macro assigns to start and end, so both must be modifiable
   lvalues.  It expands to bare if-statements (not a do { } while (0)
   block) and uses its arguments unparenthesized and more than once, so
   pass only simple, side-effect-free expressions and use it as a
   standalone statement. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
5266 
5267 Py_ssize_t PyUnicode_Count(PyObject *str,
5268                            PyObject *substr,
5269                            Py_ssize_t start,
5270                            Py_ssize_t end)
5271 {
5272     Py_ssize_t result;
5273     PyUnicodeObject* str_obj;
5274     PyUnicodeObject* sub_obj;
5275 
5276     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5277     if (!str_obj)
5278         return -1;
5279     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5280     if (!sub_obj) {
5281         Py_DECREF(str_obj);
5282         return -1;
5283     }
5284 
5285     ADJUST_INDICES(start, end, str_obj->length);
5286     result = stringlib_count(
5287         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5288         PY_SSIZE_T_MAX
5289         );
5290 
5291     Py_DECREF(sub_obj);
5292     Py_DECREF(str_obj);
5293 
5294     return result;
5295 }
5296 
5297 Py_ssize_t PyUnicode_Find(PyObject *str,
5298                           PyObject *sub,
5299                           Py_ssize_t start,
5300                           Py_ssize_t end,
5301                           int direction)
5302 {
5303     Py_ssize_t result;
5304 
5305     str = PyUnicode_FromObject(str);
5306     if (!str)
5307         return -2;
5308     sub = PyUnicode_FromObject(sub);
5309     if (!sub) {
5310         Py_DECREF(str);
5311         return -2;
5312     }
5313 
5314     if (direction > 0)
5315         result = stringlib_find_slice(
5316             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5317             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5318             start, end
5319             );
5320     else
5321         result = stringlib_rfind_slice(
5322             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5323             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5324             start, end
5325             );
5326 
5327     Py_DECREF(str);
5328     Py_DECREF(sub);
5329 
5330     return result;
5331 }
5332 
5333 static
5334 int tailmatch(PyUnicodeObject *self,
5335               PyUnicodeObject *substring,
5336               Py_ssize_t start,
5337               Py_ssize_t end,
5338               int direction)
5339 {
5340     if (substring->length == 0)
5341         return 1;
5342 
5343     ADJUST_INDICES(start, end, self->length);
5344     end -= substring->length;
5345     if (end < start)
5346         return 0;
5347 
5348     if (direction > 0) {
5349         if (Py_UNICODE_MATCH(self, end, substring))
5350             return 1;
5351     } else {
5352         if (Py_UNICODE_MATCH(self, start, substring))
5353             return 1;
5354     }
5355 
5356     return 0;
5357 }
5358 
5359 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5360                                PyObject *substr,
5361                                Py_ssize_t start,
5362                                Py_ssize_t end,
5363                                int direction)
5364 {
5365     Py_ssize_t result;
5366 
5367     str = PyUnicode_FromObject(str);
5368     if (str == NULL)
5369         return -1;
5370     substr = PyUnicode_FromObject(substr);
5371     if (substr == NULL) {
5372         Py_DECREF(str);
5373         return -1;
5374     }
5375 
5376     result = tailmatch((PyUnicodeObject *)str,
5377                        (PyUnicodeObject *)substr,
5378                        start, end, direction);
5379     Py_DECREF(str);
5380     Py_DECREF(substr);
5381     return result;
5382 }
5383 
5384 /* Apply fixfct filter to the Unicode object self and return a
5385    reference to the modified object */
5386 
5387 static
5388 PyObject *fixup(PyUnicodeObject *self,
5389                 int (*fixfct)(PyUnicodeObject *s))
5390 {
5391 
5392     PyUnicodeObject *u;
5393 
5394     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5395     if (u == NULL)
5396         return NULL;
5397 
5398     Py_UNICODE_COPY(u->str, self->str, self->length);
5399 
5400     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5401         /* fixfct should return TRUE if it modified the buffer. If
5402            FALSE, return a reference to the original buffer instead
5403            (to save space, not time) */
5404         Py_INCREF(self);
5405         Py_DECREF(u);
5406         return (PyObject*) self;
5407     }
5408     return (PyObject*) u;
5409 }
5410 
5411 static
5412 int fixupper(PyUnicodeObject *self)
5413 {
5414     Py_ssize_t len = self->length;
5415     Py_UNICODE *s = self->str;
5416     int status = 0;
5417 
5418     while (len-- > 0) {
5419         register Py_UNICODE ch;
5420 
5421         ch = Py_UNICODE_TOUPPER(*s);
5422         if (ch != *s) {
5423             status = 1;
5424             *s = ch;
5425         }
5426         s++;
5427     }
5428 
5429     return status;
5430 }
5431 
5432 static
5433 int fixlower(PyUnicodeObject *self)
5434 {
5435     Py_ssize_t len = self->length;
5436     Py_UNICODE *s = self->str;
5437     int status = 0;
5438 
5439     while (len-- > 0) {
5440         register Py_UNICODE ch;
5441 
5442         ch = Py_UNICODE_TOLOWER(*s);
5443         if (ch != *s) {
5444             status = 1;
5445             *s = ch;
5446         }
5447         s++;
5448     }
5449 
5450     return status;
5451 }
5452 
5453 static
5454 int fixswapcase(PyUnicodeObject *self)
5455 {
5456     Py_ssize_t len = self->length;
5457     Py_UNICODE *s = self->str;
5458     int status = 0;
5459 
5460     while (len-- > 0) {
5461         if (Py_UNICODE_ISUPPER(*s)) {
5462             *s = Py_UNICODE_TOLOWER(*s);
5463             status = 1;
5464         } else if (Py_UNICODE_ISLOWER(*s)) {
5465             *s = Py_UNICODE_TOUPPER(*s);
5466             status = 1;
5467         }
5468         s++;
5469     }
5470 
5471     return status;
5472 }
5473 
5474 static
5475 int fixcapitalize(PyUnicodeObject *self)
5476 {
5477     Py_ssize_t len = self->length;
5478     Py_UNICODE *s = self->str;
5479     int status = 0;
5480 
5481     if (len == 0)
5482         return 0;
5483     if (!Py_UNICODE_ISUPPER(*s)) {
5484         *s = Py_UNICODE_TOUPPER(*s);
5485         status = 1;
5486     }
5487     s++;
5488     while (--len > 0) {
5489         if (!Py_UNICODE_ISLOWER(*s)) {
5490             *s = Py_UNICODE_TOLOWER(*s);
5491             status = 1;
5492         }
5493         s++;
5494     }
5495     return status;
5496 }
5497 
5498 static
5499 int fixtitle(PyUnicodeObject *self)
5500 {
5501     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5502     register Py_UNICODE *e;
5503     int previous_is_cased;
5504 
5505     /* Shortcut for single character strings */
5506     if (PyUnicode_GET_SIZE(self) == 1) {
5507         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5508         if (*p != ch) {
5509             *p = ch;
5510             return 1;
5511         }
5512         else
5513             return 0;
5514     }
5515 
5516     e = p + PyUnicode_GET_SIZE(self);
5517     previous_is_cased = 0;
5518     for (; p < e; p++) {
5519         register const Py_UNICODE ch = *p;
5520 
5521         if (previous_is_cased)
5522             *p = Py_UNICODE_TOLOWER(ch);
5523         else
5524             *p = Py_UNICODE_TOTITLE(ch);
5525 
5526         if (Py_UNICODE_ISLOWER(ch) ||
5527             Py_UNICODE_ISUPPER(ch) ||
5528             Py_UNICODE_ISTITLE(ch))
5529             previous_is_cased = 1;
5530         else
5531             previous_is_cased = 0;
5532     }
5533     return 1;
5534 }
5535 
5536 PyObject *
5537 PyUnicode_Join(PyObject *separator, PyObject *seq)
5538 {
5539     PyObject *internal_separator = NULL;
5540     const Py_UNICODE blank = ' ';
5541     const Py_UNICODE *sep = &blank;
5542     Py_ssize_t seplen = 1;
5543     PyUnicodeObject *res = NULL; /* the result */
5544     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5545     Py_ssize_t res_used;         /* # used bytes */
5546     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5547     PyObject *fseq;          /* PySequence_Fast(seq) */
5548     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5549     PyObject *item;
5550     Py_ssize_t i;
5551 
5552     fseq = PySequence_Fast(seq, "");
5553     if (fseq == NULL) {
5554         return NULL;
5555     }
5556 
5557     /* Grrrr.  A codec may be invoked to convert str objects to
5558      * Unicode, and so it's possible to call back into Python code
5559      * during PyUnicode_FromObject(), and so it's possible for a sick
5560      * codec to change the size of fseq (if seq is a list).  Therefore
5561      * we have to keep refetching the size -- can't assume seqlen
5562      * is invariant.
5563      */
5564     seqlen = PySequence_Fast_GET_SIZE(fseq);
5565     /* If empty sequence, return u"". */
5566     if (seqlen == 0) {
5567         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5568         goto Done;
5569     }
5570     /* If singleton sequence with an exact Unicode, return that. */
5571     if (seqlen == 1) {
5572         item = PySequence_Fast_GET_ITEM(fseq, 0);
5573         if (PyUnicode_CheckExact(item)) {
5574             Py_INCREF(item);
5575             res = (PyUnicodeObject *)item;
5576             goto Done;
5577         }
5578     }
5579 
5580     /* At least two items to join, or one that isn't exact Unicode. */
5581     if (seqlen > 1) {
5582         /* Set up sep and seplen -- they're needed. */
5583         if (separator == NULL) {
5584             sep = &blank;
5585             seplen = 1;
5586         }
5587         else {
5588             internal_separator = PyUnicode_FromObject(separator);
5589             if (internal_separator == NULL)
5590                 goto onError;
5591             sep = PyUnicode_AS_UNICODE(internal_separator);
5592             seplen = PyUnicode_GET_SIZE(internal_separator);
5593             /* In case PyUnicode_FromObject() mutated seq. */
5594             seqlen = PySequence_Fast_GET_SIZE(fseq);
5595         }
5596     }
5597 
5598     /* Get space. */
5599     res = _PyUnicode_New(res_alloc);
5600     if (res == NULL)
5601         goto onError;
5602     res_p = PyUnicode_AS_UNICODE(res);
5603     res_used = 0;
5604 
5605     for (i = 0; i < seqlen; ++i) {
5606         Py_ssize_t itemlen;
5607         Py_ssize_t new_res_used;
5608 
5609         item = PySequence_Fast_GET_ITEM(fseq, i);
5610         /* Convert item to Unicode. */
5611         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5612             PyErr_Format(PyExc_TypeError,
5613                          "sequence item %zd: expected string or Unicode,"
5614                          " %.80s found",
5615                          i, Py_TYPE(item)->tp_name);
5616             goto onError;
5617         }
5618         item = PyUnicode_FromObject(item);
5619         if (item == NULL)
5620             goto onError;
5621         /* We own a reference to item from here on. */
5622 
5623         /* In case PyUnicode_FromObject() mutated seq. */
5624         seqlen = PySequence_Fast_GET_SIZE(fseq);
5625 
5626         /* Make sure we have enough space for the separator and the item. */
5627         itemlen = PyUnicode_GET_SIZE(item);
5628         new_res_used = res_used + itemlen;
5629         if (new_res_used < 0)
5630             goto Overflow;
5631         if (i < seqlen - 1) {
5632             new_res_used += seplen;
5633             if (new_res_used < 0)
5634                 goto Overflow;
5635         }
5636         if (new_res_used > res_alloc) {
5637             /* double allocated size until it's big enough */
5638             do {
5639                 res_alloc += res_alloc;
5640                 if (res_alloc <= 0)
5641                     goto Overflow;
5642             } while (new_res_used > res_alloc);
5643             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5644                 Py_DECREF(item);
5645                 goto onError;
5646             }
5647             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5648         }
5649 
5650         /* Copy item, and maybe the separator. */
5651         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5652         res_p += itemlen;
5653         if (i < seqlen - 1) {
5654             Py_UNICODE_COPY(res_p, sep, seplen);
5655             res_p += seplen;
5656         }
5657         Py_DECREF(item);
5658         res_used = new_res_used;
5659     }
5660 
5661     /* Shrink res to match the used area; this probably can't fail,
5662      * but it's cheap to check.
5663      */
5664     if (_PyUnicode_Resize(&res, res_used) < 0)
5665         goto onError;
5666 
5667   Done:
5668     Py_XDECREF(internal_separator);
5669     Py_DECREF(fseq);
5670     return (PyObject *)res;
5671 
5672   Overflow:
5673     PyErr_SetString(PyExc_OverflowError,
5674                     "join() result is too long for a Python string");
5675     Py_DECREF(item);
5676     /* fall through */
5677 
5678   onError:
5679     Py_XDECREF(internal_separator);
5680     Py_DECREF(fseq);
5681     Py_XDECREF(res);
5682     return NULL;
5683 }
5684 
5685 static
5686 PyUnicodeObject *pad(PyUnicodeObject *self,
5687                      Py_ssize_t left,
5688                      Py_ssize_t right,
5689                      Py_UNICODE fill)
5690 {
5691     PyUnicodeObject *u;
5692 
5693     if (left < 0)
5694         left = 0;
5695     if (right < 0)
5696         right = 0;
5697 
5698     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5699         Py_INCREF(self);
5700         return self;
5701     }
5702 
5703     if (left > PY_SSIZE_T_MAX - self->length ||
5704         right > PY_SSIZE_T_MAX - (left + self->length)) {
5705         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5706         return NULL;
5707     }
5708     u = _PyUnicode_New(left + self->length + right);
5709     if (u) {
5710         if (left)
5711             Py_UNICODE_FILL(u->str, fill, left);
5712         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5713         if (right)
5714             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5715     }
5716 
5717     return u;
5718 }
5719 
5720 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5721 {
5722     PyObject *list;
5723 
5724     string = PyUnicode_FromObject(string);
5725     if (string == NULL)
5726         return NULL;
5727 
5728     list = stringlib_splitlines(
5729         (PyObject*) string, PyUnicode_AS_UNICODE(string),
5730         PyUnicode_GET_SIZE(string), keepends);
5731 
5732     Py_DECREF(string);
5733     return list;
5734 }
5735 
5736 static
5737 PyObject *split(PyUnicodeObject *self,
5738                 PyUnicodeObject *substring,
5739                 Py_ssize_t maxcount)
5740 {
5741     if (maxcount < 0)
5742         maxcount = PY_SSIZE_T_MAX;
5743 
5744     if (substring == NULL)
5745         return stringlib_split_whitespace(
5746             (PyObject*) self,  self->str, self->length, maxcount
5747             );
5748 
5749     return stringlib_split(
5750         (PyObject*) self,  self->str, self->length,
5751         substring->str, substring->length,
5752         maxcount
5753         );
5754 }
5755 
5756 static
5757 PyObject *rsplit(PyUnicodeObject *self,
5758                  PyUnicodeObject *substring,
5759                  Py_ssize_t maxcount)
5760 {
5761     if (maxcount < 0)
5762         maxcount = PY_SSIZE_T_MAX;
5763 
5764     if (substring == NULL)
5765         return stringlib_rsplit_whitespace(
5766             (PyObject*) self,  self->str, self->length, maxcount
5767             );
5768 
5769     return stringlib_rsplit(
5770         (PyObject*) self,  self->str, self->length,
5771         substring->str, substring->length,
5772         maxcount
5773         );
5774 }
5775 
5776 static
5777 PyObject *replace(PyUnicodeObject *self,
5778                   PyUnicodeObject *str1,
5779                   PyUnicodeObject *str2,
5780                   Py_ssize_t maxcount)
5781 {
5782     PyUnicodeObject *u;
5783 
5784     if (maxcount < 0)
5785         maxcount = PY_SSIZE_T_MAX;
5786     else if (maxcount == 0 || self->length == 0)
5787         goto nothing;
5788 
5789     if (str1->length == str2->length) {
5790         Py_ssize_t i;
5791         /* same length */
5792         if (str1->length == 0)
5793             goto nothing;
5794         if (str1->length == 1) {
5795             /* replace characters */
5796             Py_UNICODE u1, u2;
5797             if (!findchar(self->str, self->length, str1->str[0]))
5798                 goto nothing;
5799             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5800             if (!u)
5801                 return NULL;
5802             Py_UNICODE_COPY(u->str, self->str, self->length);
5803             u1 = str1->str[0];
5804             u2 = str2->str[0];
5805             for (i = 0; i < u->length; i++)
5806                 if (u->str[i] == u1) {
5807                     if (--maxcount < 0)
5808                         break;
5809                     u->str[i] = u2;
5810                 }
5811         } else {
5812             i = stringlib_find(
5813                 self->str, self->length, str1->str, str1->length, 0
5814                 );
5815             if (i < 0)
5816                 goto nothing;
5817             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5818             if (!u)
5819                 return NULL;
5820             Py_UNICODE_COPY(u->str, self->str, self->length);
5821 
5822             /* change everything in-place, starting with this one */
5823             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5824             i += str1->length;
5825 
5826             while ( --maxcount > 0) {
5827                 i = stringlib_find(self->str+i, self->length-i,
5828                                    str1->str, str1->length,
5829                                    i);
5830                 if (i == -1)
5831                     break;
5832                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5833                 i += str1->length;
5834             }
5835         }
5836     } else {
5837 
5838         Py_ssize_t n, i, j;
5839         Py_ssize_t product, new_size, delta;
5840         Py_UNICODE *p;
5841 
5842         /* replace strings */
5843         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5844                             maxcount);
5845         if (n == 0)
5846             goto nothing;
5847         /* new_size = self->length + n * (str2->length - str1->length)); */
5848         delta = (str2->length - str1->length);
5849         if (delta == 0) {
5850             new_size = self->length;
5851         } else {
5852             product = n * (str2->length - str1->length);
5853             if ((product / (str2->length - str1->length)) != n) {
5854                 PyErr_SetString(PyExc_OverflowError,
5855                                 "replace string is too long");
5856                 return NULL;
5857             }
5858             new_size = self->length + product;
5859             if (new_size < 0) {
5860                 PyErr_SetString(PyExc_OverflowError,
5861                                 "replace string is too long");
5862                 return NULL;
5863             }
5864         }
5865         u = _PyUnicode_New(new_size);
5866         if (!u)
5867             return NULL;
5868         i = 0;
5869         p = u->str;
5870         if (str1->length > 0) {
5871             while (n-- > 0) {
5872                 /* look for next match */
5873                 j = stringlib_find(self->str+i, self->length-i,
5874                                    str1->str, str1->length,
5875                                    i);
5876                 if (j == -1)
5877                     break;
5878                 else if (j > i) {
5879                     /* copy unchanged part [i:j] */
5880                     Py_UNICODE_COPY(p, self->str+i, j-i);
5881                     p += j - i;
5882                 }
5883                 /* copy substitution string */
5884                 if (str2->length > 0) {
5885                     Py_UNICODE_COPY(p, str2->str, str2->length);
5886                     p += str2->length;
5887                 }
5888                 i = j + str1->length;
5889             }
5890             if (i < self->length)
5891                 /* copy tail [i:] */
5892                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5893         } else {
5894             /* interleave */
5895             while (n > 0) {
5896                 Py_UNICODE_COPY(p, str2->str, str2->length);
5897                 p += str2->length;
5898                 if (--n <= 0)
5899                     break;
5900                 *p++ = self->str[i++];
5901             }
5902             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5903         }
5904     }
5905     return (PyObject *) u;
5906 
5907   nothing:
5908     /* nothing to replace; return original string (when possible) */
5909     if (PyUnicode_CheckExact(self)) {
5910         Py_INCREF(self);
5911         return (PyObject *) self;
5912     }
5913     return PyUnicode_FromUnicode(self->str, self->length);
5914 }
5915 
5916 /* --- Unicode Object Methods --------------------------------------------- */
5917 
5918 PyDoc_STRVAR(title__doc__,
5919              "S.title() -> unicode\n\
5920 \n\
5921 Return a titlecased version of S, i.e. words start with title case\n\
5922 characters, all remaining cased characters have lower case.");
5923 
5924 static PyObject*
5925 unicode_title(PyUnicodeObject *self)
5926 {
5927     return fixup(self, fixtitle);
5928 }
5929 
5930 PyDoc_STRVAR(capitalize__doc__,
5931              "S.capitalize() -> unicode\n\
5932 \n\
5933 Return a capitalized version of S, i.e. make the first character\n\
5934 have upper case and the rest lower case.");
5935 
5936 static PyObject*
5937 unicode_capitalize(PyUnicodeObject *self)
5938 {
5939     return fixup(self, fixcapitalize);
5940 }
5941 
5942 #if 0
5943 PyDoc_STRVAR(capwords__doc__,
5944              "S.capwords() -> unicode\n\
5945 \n\
5946 Apply .capitalize() to all words in S and return the result with\n\
5947 normalized whitespace (all whitespace strings are replaced by ' ').");
5948 
/* NOTE: this function sits inside an "#if 0" block and is not compiled.

   Splits self on whitespace, replaces each word in the resulting list
   by its fixup(..., fixcapitalize) version, and joins the words again
   with the default separator of PyUnicode_Join() (separator == NULL).
   Returns the joined object, or NULL if splitting, capitalizing or
   joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Drop the old word before storing the capitalized one in its
           slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: when reached through the goto above, item is NULL,
       so NULL is returned after the list has been released. */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
5978 #endif
5979 
5980 /* Argument converter.  Coerces to a single unicode character */
5981 
5982 static int
5983 convert_uc(PyObject *obj, void *addr)
5984 {
5985     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5986     PyObject *uniobj;
5987     Py_UNICODE *unistr;
5988 
5989     uniobj = PyUnicode_FromObject(obj);
5990     if (uniobj == NULL) {
5991         PyErr_SetString(PyExc_TypeError,
5992                         "The fill character cannot be converted to Unicode");
5993         return 0;
5994     }
5995     if (PyUnicode_GET_SIZE(uniobj) != 1) {
5996         PyErr_SetString(PyExc_TypeError,
5997                         "The fill character must be exactly one character long");
5998         Py_DECREF(uniobj);
5999         return 0;
6000     }
6001     unistr = PyUnicode_AS_UNICODE(uniobj);
6002     *fillcharloc = unistr[0];
6003     Py_DECREF(uniobj);
6004     return 1;
6005 }
6006 
6007 PyDoc_STRVAR(center__doc__,
6008              "S.center(width[, fillchar]) -> unicode\n\
6009 \n\
6010 Return S centered in a Unicode string of length width. Padding is\n\
6011 done using the specified fill character (default is a space)");
6012 
6013 static PyObject *
6014 unicode_center(PyUnicodeObject *self, PyObject *args)
6015 {
6016     Py_ssize_t marg, left;
6017     Py_ssize_t width;
6018     Py_UNICODE fillchar = ' ';
6019 
6020     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6021         return NULL;
6022 
6023     if (self->length >= width && PyUnicode_CheckExact(self)) {
6024         Py_INCREF(self);
6025         return (PyObject*) self;
6026     }
6027 
6028     marg = width - self->length;
6029     left = marg / 2 + (marg & width & 1);
6030 
6031     return (PyObject*) pad(self, left, marg - left, fillchar);
6032 }
6033 
6034 #if 0
6035 
6036 /* This code should go into some future Unicode collation support
6037    module. The basic comparison should compare ordinals on a naive
6038    basis (this is what Java does and thus Jython too). */
6039 
6040 /* speedy UTF-16 code point order comparison */
6041 /* gleaned from: */
6042 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6043 
/* NOTE: this table and function sit in the "#if 0" branch and are not
   compiled; the "#else" branch provides the unicode_compare() actually
   used.

   Adjustment table indexed by (code unit >> 11).  Entry 27 (units
   0xD800-0xDFFF) adds 0x2000; entries 28-31 (units 0xE000-0xFFFF)
   subtract 0x800; every other entry is 0.  Indexing with c >> 11 only
   stays within the 32 entries for 16-bit values -- presumably this code
   was written for a 16-bit Py_UNICODE; confirm before re-enabling. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare str1 and str2 element by element after remapping each
   element through utf16Fixup.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only values above it are looked up
           in the table (all entries below index 27 are 0 anyway). */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix is equal: the shorter string sorts first. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6083 
6084 #else
6085 
6086 static int
6087 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6088 {
6089     register Py_ssize_t len1, len2;
6090 
6091     Py_UNICODE *s1 = str1->str;
6092     Py_UNICODE *s2 = str2->str;
6093 
6094     len1 = str1->length;
6095     len2 = str2->length;
6096 
6097     while (len1 > 0 && len2 > 0) {
6098         Py_UNICODE c1, c2;
6099 
6100         c1 = *s1++;
6101         c2 = *s2++;
6102 
6103         if (c1 != c2)
6104             return (c1 < c2) ? -1 : 1;
6105 
6106         len1--; len2--;
6107     }
6108 
6109     return (len1 < len2) ? -1 : (len1 != len2);
6110 }
6111 
6112 #endif
6113 
6114 int PyUnicode_Compare(PyObject *left,
6115                       PyObject *right)
6116 {
6117     PyUnicodeObject *u = NULL, *v = NULL;
6118     int result;
6119 
6120     /* Coerce the two arguments */
6121     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6122     if (u == NULL)
6123         goto onError;
6124     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6125     if (v == NULL)
6126         goto onError;
6127 
6128     /* Shortcut for empty or interned objects */
6129     if (v == u) {
6130         Py_DECREF(u);
6131         Py_DECREF(v);
6132         return 0;
6133     }
6134 
6135     result = unicode_compare(u, v);
6136 
6137     Py_DECREF(u);
6138     Py_DECREF(v);
6139     return result;
6140 
6141   onError:
6142     Py_XDECREF(u);
6143     Py_XDECREF(v);
6144     return -1;
6145 }
6146 
6147 PyObject *PyUnicode_RichCompare(PyObject *left,
6148                                 PyObject *right,
6149                                 int op)
6150 {
6151     int result;
6152 
6153     result = PyUnicode_Compare(left, right);
6154     if (result == -1 && PyErr_Occurred())
6155         goto onError;
6156 
6157     /* Convert the return value to a Boolean */
6158     switch (op) {
6159     case Py_EQ:
6160         result = (result == 0);
6161         break;
6162     case Py_NE:
6163         result = (result != 0);
6164         break;
6165     case Py_LE:
6166         result = (result <= 0);
6167         break;
6168     case Py_GE:
6169         result = (result >= 0);
6170         break;
6171     case Py_LT:
6172         result = (result == -1);
6173         break;
6174     case Py_GT:
6175         result = (result == 1);
6176         break;
6177     }
6178     return PyBool_FromLong(result);
6179 
6180   onError:
6181 
6182     /* Standard case
6183 
6184        Type errors mean that PyUnicode_FromObject() could not convert
6185        one of the arguments (usually the right hand side) to Unicode,
6186        ie. we can't handle the comparison request. However, it is
6187        possible that the other object knows a comparison method, which
6188        is why we return Py_NotImplemented to give the other object a
6189        chance.
6190 
6191     */
6192     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6193         PyErr_Clear();
6194         Py_INCREF(Py_NotImplemented);
6195         return Py_NotImplemented;
6196     }
6197     if (op != Py_EQ && op != Py_NE)
6198         return NULL;
6199 
6200     /* Equality comparison.
6201 
6202        This is a special case: we silence any PyExc_UnicodeDecodeError
6203        and instead turn it into a PyErr_UnicodeWarning.
6204 
6205     */
6206     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6207         return NULL;
6208     PyErr_Clear();
6209     if (PyErr_Warn(PyExc_UnicodeWarning,
6210                    (op == Py_EQ) ?
6211                    "Unicode equal comparison "
6212                    "failed to convert both arguments to Unicode - "
6213                    "interpreting them as being unequal" :
6214                    "Unicode unequal comparison "
6215                    "failed to convert both arguments to Unicode - "
6216                    "interpreting them as being unequal"
6217             ) < 0)
6218         return NULL;
6219     result = (op == Py_NE);
6220     return PyBool_FromLong(result);
6221 }
6222 
6223 int PyUnicode_Contains(PyObject *container,
6224                        PyObject *element)
6225 {
6226     PyObject *str, *sub;
6227     int result;
6228 
6229     /* Coerce the two arguments */
6230     sub = PyUnicode_FromObject(element);
6231     if (!sub) {
6232         return -1;
6233     }
6234 
6235     str = PyUnicode_FromObject(container);
6236     if (!str) {
6237         Py_DECREF(sub);
6238         return -1;
6239     }
6240 
6241     result = stringlib_contains_obj(str, sub);
6242 
6243     Py_DECREF(str);
6244     Py_DECREF(sub);
6245 
6246     return result;
6247 }
6248 
6249 /* Concat to string or Unicode object giving a new Unicode object. */
6250 
6251 PyObject *PyUnicode_Concat(PyObject *left,
6252                            PyObject *right)
6253 {
6254     PyUnicodeObject *u = NULL, *v = NULL, *w;
6255 
6256     /* Coerce the two arguments */
6257     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6258     if (u == NULL)
6259         goto onError;
6260     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6261     if (v == NULL)
6262         goto onError;
6263 
6264     /* Shortcuts */
6265     if (v == unicode_empty) {
6266         Py_DECREF(v);
6267         return (PyObject *)u;
6268     }
6269     if (u == unicode_empty) {
6270         Py_DECREF(u);
6271         return (PyObject *)v;
6272     }
6273 
6274     /* Concat the two Unicode strings */
6275     w = _PyUnicode_New(u->length + v->length);
6276     if (w == NULL)
6277         goto onError;
6278     Py_UNICODE_COPY(w->str, u->str, u->length);
6279     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6280 
6281     Py_DECREF(u);
6282     Py_DECREF(v);
6283     return (PyObject *)w;
6284 
6285   onError:
6286     Py_XDECREF(u);
6287     Py_XDECREF(v);
6288     return NULL;
6289 }
6290 
6291 PyDoc_STRVAR(count__doc__,
6292              "S.count(sub[, start[, end]]) -> int\n\
6293 \n\
6294 Return the number of non-overlapping occurrences of substring sub in\n\
6295 Unicode string S[start:end].  Optional arguments start and end are\n\
6296 interpreted as in slice notation.");
6297 
6298 static PyObject *
6299 unicode_count(PyUnicodeObject *self, PyObject *args)
6300 {
6301     PyUnicodeObject *substring;
6302     Py_ssize_t start = 0;
6303     Py_ssize_t end = PY_SSIZE_T_MAX;
6304     PyObject *result;
6305 
6306     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6307                                             &start, &end))
6308         return NULL;
6309 
6310     ADJUST_INDICES(start, end, self->length);
6311     result = PyInt_FromSsize_t(
6312         stringlib_count(self->str + start, end - start,
6313                         substring->str, substring->length,
6314                         PY_SSIZE_T_MAX)
6315         );
6316 
6317     Py_DECREF(substring);
6318 
6319     return result;
6320 }
6321 
6322 PyDoc_STRVAR(encode__doc__,
6323              "S.encode([encoding[,errors]]) -> string or unicode\n\
6324 \n\
6325 Encodes S using the codec registered for encoding. encoding defaults\n\
6326 to the default encoding. errors may be given to set a different error\n\
6327 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6328 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6329 'xmlcharrefreplace' as well as any other name registered with\n\
6330 codecs.register_error that can handle UnicodeEncodeErrors.");
6331 
6332 static PyObject *
6333 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6334 {
6335     static char *kwlist[] = {"encoding", "errors", 0};
6336     char *encoding = NULL;
6337     char *errors = NULL;
6338     PyObject *v;
6339 
6340     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6341                                      kwlist, &encoding, &errors))
6342         return NULL;
6343     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6344     if (v == NULL)
6345         goto onError;
6346     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6347         PyErr_Format(PyExc_TypeError,
6348                      "encoder did not return a string/unicode object "
6349                      "(type=%.400s)",
6350                      Py_TYPE(v)->tp_name);
6351         Py_DECREF(v);
6352         return NULL;
6353     }
6354     return v;
6355 
6356   onError:
6357     return NULL;
6358 }
6359 
6360 PyDoc_STRVAR(decode__doc__,
6361              "S.decode([encoding[,errors]]) -> string or unicode\n\
6362 \n\
6363 Decodes S using the codec registered for encoding. encoding defaults\n\
6364 to the default encoding. errors may be given to set a different error\n\
6365 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6366 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6367 as well as any other name registered with codecs.register_error that is\n\
6368 able to handle UnicodeDecodeErrors.");
6369 
6370 static PyObject *
6371 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6372 {
6373     static char *kwlist[] = {"encoding", "errors", 0};
6374     char *encoding = NULL;
6375     char *errors = NULL;
6376     PyObject *v;
6377 
6378     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6379                                      kwlist, &encoding, &errors))
6380         return NULL;
6381     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6382     if (v == NULL)
6383         goto onError;
6384     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6385         PyErr_Format(PyExc_TypeError,
6386                      "decoder did not return a string/unicode object "
6387                      "(type=%.400s)",
6388                      Py_TYPE(v)->tp_name);
6389         Py_DECREF(v);
6390         return NULL;
6391     }
6392     return v;
6393 
6394   onError:
6395     return NULL;
6396 }
6397 
6398 PyDoc_STRVAR(expandtabs__doc__,
6399              "S.expandtabs([tabsize]) -> unicode\n\
6400 \n\
6401 Return a copy of S where all tab characters are expanded using spaces.\n\
6402 If tabsize is not given, a tab size of 8 characters is assumed.");
6403 
6404 static PyObject*
6405 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6406 {
6407     Py_UNICODE *e;
6408     Py_UNICODE *p;
6409     Py_UNICODE *q;
6410     Py_UNICODE *qe;
6411     Py_ssize_t i, j, incr;
6412     PyUnicodeObject *u;
6413     int tabsize = 8;
6414 
6415     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6416         return NULL;
6417 
6418     /* First pass: determine size of output string */
6419     i = 0; /* chars up to and including most recent \n or \r */
6420     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6421     e = self->str + self->length; /* end of input */
6422     for (p = self->str; p < e; p++)
6423         if (*p == '\t') {
6424             if (tabsize > 0) {
6425                 incr = tabsize - (j % tabsize); /* cannot overflow */
6426                 if (j > PY_SSIZE_T_MAX - incr)
6427                     goto overflow1;
6428                 j += incr;
6429             }
6430         }
6431         else {
6432             if (j > PY_SSIZE_T_MAX - 1)
6433                 goto overflow1;
6434             j++;
6435             if (*p == '\n' || *p == '\r') {
6436                 if (i > PY_SSIZE_T_MAX - j)
6437                     goto overflow1;
6438                 i += j;
6439                 j = 0;
6440             }
6441         }
6442 
6443     if (i > PY_SSIZE_T_MAX - j)
6444         goto overflow1;
6445 
6446     /* Second pass: create output string and fill it */
6447     u = _PyUnicode_New(i + j);
6448     if (!u)
6449         return NULL;
6450 
6451     j = 0; /* same as in first pass */
6452     q = u->str; /* next output char */
6453     qe = u->str + u->length; /* end of output */
6454 
6455     for (p = self->str; p < e; p++)
6456         if (*p == '\t') {
6457             if (tabsize > 0) {
6458                 i = tabsize - (j % tabsize);
6459                 j += i;
6460                 while (i--) {
6461                     if (q >= qe)
6462                         goto overflow2;
6463                     *q++ = ' ';
6464                 }
6465             }
6466         }
6467         else {
6468             if (q >= qe)
6469                 goto overflow2;
6470             *q++ = *p;
6471             j++;
6472             if (*p == '\n' || *p == '\r')
6473                 j = 0;
6474         }
6475 
6476     return (PyObject*) u;
6477 
6478   overflow2:
6479     Py_DECREF(u);
6480   overflow1:
6481     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6482     return NULL;
6483 }
6484 
6485 PyDoc_STRVAR(find__doc__,
6486              "S.find(sub [,start [,end]]) -> int\n\
6487 \n\
6488 Return the lowest index in S where substring sub is found,\n\
6489 such that sub is contained within S[start:end].  Optional\n\
6490 arguments start and end are interpreted as in slice notation.\n\
6491 \n\
6492 Return -1 on failure.");
6493 
6494 static PyObject *
6495 unicode_find(PyUnicodeObject *self, PyObject *args)
6496 {
6497     PyUnicodeObject *substring;
6498     Py_ssize_t start;
6499     Py_ssize_t end;
6500     Py_ssize_t result;
6501 
6502     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6503                                             &start, &end))
6504         return NULL;
6505 
6506     result = stringlib_find_slice(
6507         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6508         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6509         start, end
6510         );
6511 
6512     Py_DECREF(substring);
6513 
6514     return PyInt_FromSsize_t(result);
6515 }
6516 
6517 static PyObject *
6518 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6519 {
6520     if (index < 0 || index >= self->length) {
6521         PyErr_SetString(PyExc_IndexError, "string index out of range");
6522         return NULL;
6523     }
6524 
6525     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6526 }
6527 
6528 static long
6529 unicode_hash(PyUnicodeObject *self)
6530 {
6531     /* Since Unicode objects compare equal to their ASCII string
6532        counterparts, they should use the individual character values
6533        as basis for their hash value.  This is needed to assure that
6534        strings and Unicode objects behave in the same way as
6535        dictionary keys. */
6536 
6537     register Py_ssize_t len;
6538     register Py_UNICODE *p;
6539     register long x;
6540 
6541 #ifdef Py_DEBUG
6542     assert(_Py_HashSecret_Initialized);
6543 #endif
6544     if (self->hash != -1)
6545         return self->hash;
6546     len = PyUnicode_GET_SIZE(self);
6547     /*
6548       We make the hash of the empty string be 0, rather than using
6549       (prefix ^ suffix), since this slightly obfuscates the hash secret
6550     */
6551     if (len == 0) {
6552         self->hash = 0;
6553         return 0;
6554     }
6555     p = PyUnicode_AS_UNICODE(self);
6556     x = _Py_HashSecret.prefix;
6557     x ^= *p << 7;
6558     while (--len >= 0)
6559         x = (1000003*x) ^ *p++;
6560     x ^= PyUnicode_GET_SIZE(self);
6561     x ^= _Py_HashSecret.suffix;
6562     if (x == -1)
6563         x = -2;
6564     self->hash = x;
6565     return x;
6566 }
6567 
6568 PyDoc_STRVAR(index__doc__,
6569              "S.index(sub [,start [,end]]) -> int\n\
6570 \n\
6571 Like S.find() but raise ValueError when the substring is not found.");
6572 
6573 static PyObject *
6574 unicode_index(PyUnicodeObject *self, PyObject *args)
6575 {
6576     Py_ssize_t result;
6577     PyUnicodeObject *substring;
6578     Py_ssize_t start;
6579     Py_ssize_t end;
6580 
6581     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6582                                             &start, &end))
6583         return NULL;
6584 
6585     result = stringlib_find_slice(
6586         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6587         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6588         start, end
6589         );
6590 
6591     Py_DECREF(substring);
6592 
6593     if (result < 0) {
6594         PyErr_SetString(PyExc_ValueError, "substring not found");
6595         return NULL;
6596     }
6597 
6598     return PyInt_FromSsize_t(result);
6599 }
6600 
6601 PyDoc_STRVAR(islower__doc__,
6602              "S.islower() -> bool\n\
6603 \n\
6604 Return True if all cased characters in S are lowercase and there is\n\
6605 at least one cased character in S, False otherwise.");
6606 
6607 static PyObject*
6608 unicode_islower(PyUnicodeObject *self)
6609 {
6610     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6611     register const Py_UNICODE *e;
6612     int cased;
6613 
6614     /* Shortcut for single character strings */
6615     if (PyUnicode_GET_SIZE(self) == 1)
6616         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6617 
6618     /* Special case for empty strings */
6619     if (PyUnicode_GET_SIZE(self) == 0)
6620         return PyBool_FromLong(0);
6621 
6622     e = p + PyUnicode_GET_SIZE(self);
6623     cased = 0;
6624     for (; p < e; p++) {
6625         register const Py_UNICODE ch = *p;
6626 
6627         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6628             return PyBool_FromLong(0);
6629         else if (!cased && Py_UNICODE_ISLOWER(ch))
6630             cased = 1;
6631     }
6632     return PyBool_FromLong(cased);
6633 }
6634 
6635 PyDoc_STRVAR(isupper__doc__,
6636              "S.isupper() -> bool\n\
6637 \n\
6638 Return True if all cased characters in S are uppercase and there is\n\
6639 at least one cased character in S, False otherwise.");
6640 
6641 static PyObject*
6642 unicode_isupper(PyUnicodeObject *self)
6643 {
6644     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6645     register const Py_UNICODE *e;
6646     int cased;
6647 
6648     /* Shortcut for single character strings */
6649     if (PyUnicode_GET_SIZE(self) == 1)
6650         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6651 
6652     /* Special case for empty strings */
6653     if (PyUnicode_GET_SIZE(self) == 0)
6654         return PyBool_FromLong(0);
6655 
6656     e = p + PyUnicode_GET_SIZE(self);
6657     cased = 0;
6658     for (; p < e; p++) {
6659         register const Py_UNICODE ch = *p;
6660 
6661         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6662             return PyBool_FromLong(0);
6663         else if (!cased && Py_UNICODE_ISUPPER(ch))
6664             cased = 1;
6665     }
6666     return PyBool_FromLong(cased);
6667 }
6668 
6669 PyDoc_STRVAR(istitle__doc__,
6670              "S.istitle() -> bool\n\
6671 \n\
6672 Return True if S is a titlecased string and there is at least one\n\
6673 character in S, i.e. upper- and titlecase characters may only\n\
6674 follow uncased characters and lowercase characters only cased ones.\n\
6675 Return False otherwise.");
6676 
6677 static PyObject*
6678 unicode_istitle(PyUnicodeObject *self)
6679 {
6680     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6681     register const Py_UNICODE *e;
6682     int cased, previous_is_cased;
6683 
6684     /* Shortcut for single character strings */
6685     if (PyUnicode_GET_SIZE(self) == 1)
6686         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6687                                (Py_UNICODE_ISUPPER(*p) != 0));
6688 
6689     /* Special case for empty strings */
6690     if (PyUnicode_GET_SIZE(self) == 0)
6691         return PyBool_FromLong(0);
6692 
6693     e = p + PyUnicode_GET_SIZE(self);
6694     cased = 0;
6695     previous_is_cased = 0;
6696     for (; p < e; p++) {
6697         register const Py_UNICODE ch = *p;
6698 
6699         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6700             if (previous_is_cased)
6701                 return PyBool_FromLong(0);
6702             previous_is_cased = 1;
6703             cased = 1;
6704         }
6705         else if (Py_UNICODE_ISLOWER(ch)) {
6706             if (!previous_is_cased)
6707                 return PyBool_FromLong(0);
6708             previous_is_cased = 1;
6709             cased = 1;
6710         }
6711         else
6712             previous_is_cased = 0;
6713     }
6714     return PyBool_FromLong(cased);
6715 }
6716 
6717 PyDoc_STRVAR(isspace__doc__,
6718              "S.isspace() -> bool\n\
6719 \n\
6720 Return True if all characters in S are whitespace\n\
6721 and there is at least one character in S, False otherwise.");
6722 
6723 static PyObject*
6724 unicode_isspace(PyUnicodeObject *self)
6725 {
6726     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6727     register const Py_UNICODE *e;
6728 
6729     /* Shortcut for single character strings */
6730     if (PyUnicode_GET_SIZE(self) == 1 &&
6731         Py_UNICODE_ISSPACE(*p))
6732         return PyBool_FromLong(1);
6733 
6734     /* Special case for empty strings */
6735     if (PyUnicode_GET_SIZE(self) == 0)
6736         return PyBool_FromLong(0);
6737 
6738     e = p + PyUnicode_GET_SIZE(self);
6739     for (; p < e; p++) {
6740         if (!Py_UNICODE_ISSPACE(*p))
6741             return PyBool_FromLong(0);
6742     }
6743     return PyBool_FromLong(1);
6744 }
6745 
6746 PyDoc_STRVAR(isalpha__doc__,
6747              "S.isalpha() -> bool\n\
6748 \n\
6749 Return True if all characters in S are alphabetic\n\
6750 and there is at least one character in S, False otherwise.");
6751 
6752 static PyObject*
6753 unicode_isalpha(PyUnicodeObject *self)
6754 {
6755     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6756     register const Py_UNICODE *e;
6757 
6758     /* Shortcut for single character strings */
6759     if (PyUnicode_GET_SIZE(self) == 1 &&
6760         Py_UNICODE_ISALPHA(*p))
6761         return PyBool_FromLong(1);
6762 
6763     /* Special case for empty strings */
6764     if (PyUnicode_GET_SIZE(self) == 0)
6765         return PyBool_FromLong(0);
6766 
6767     e = p + PyUnicode_GET_SIZE(self);
6768     for (; p < e; p++) {
6769         if (!Py_UNICODE_ISALPHA(*p))
6770             return PyBool_FromLong(0);
6771     }
6772     return PyBool_FromLong(1);
6773 }
6774 
6775 PyDoc_STRVAR(isalnum__doc__,
6776              "S.isalnum() -> bool\n\
6777 \n\
6778 Return True if all characters in S are alphanumeric\n\
6779 and there is at least one character in S, False otherwise.");
6780 
6781 static PyObject*
6782 unicode_isalnum(PyUnicodeObject *self)
6783 {
6784     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785     register const Py_UNICODE *e;
6786 
6787     /* Shortcut for single character strings */
6788     if (PyUnicode_GET_SIZE(self) == 1 &&
6789         Py_UNICODE_ISALNUM(*p))
6790         return PyBool_FromLong(1);
6791 
6792     /* Special case for empty strings */
6793     if (PyUnicode_GET_SIZE(self) == 0)
6794         return PyBool_FromLong(0);
6795 
6796     e = p + PyUnicode_GET_SIZE(self);
6797     for (; p < e; p++) {
6798         if (!Py_UNICODE_ISALNUM(*p))
6799             return PyBool_FromLong(0);
6800     }
6801     return PyBool_FromLong(1);
6802 }
6803 
6804 PyDoc_STRVAR(isdecimal__doc__,
6805              "S.isdecimal() -> bool\n\
6806 \n\
6807 Return True if there are only decimal characters in S,\n\
6808 False otherwise.");
6809 
6810 static PyObject*
6811 unicode_isdecimal(PyUnicodeObject *self)
6812 {
6813     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6814     register const Py_UNICODE *e;
6815 
6816     /* Shortcut for single character strings */
6817     if (PyUnicode_GET_SIZE(self) == 1 &&
6818         Py_UNICODE_ISDECIMAL(*p))
6819         return PyBool_FromLong(1);
6820 
6821     /* Special case for empty strings */
6822     if (PyUnicode_GET_SIZE(self) == 0)
6823         return PyBool_FromLong(0);
6824 
6825     e = p + PyUnicode_GET_SIZE(self);
6826     for (; p < e; p++) {
6827         if (!Py_UNICODE_ISDECIMAL(*p))
6828             return PyBool_FromLong(0);
6829     }
6830     return PyBool_FromLong(1);
6831 }
6832 
6833 PyDoc_STRVAR(isdigit__doc__,
6834              "S.isdigit() -> bool\n\
6835 \n\
6836 Return True if all characters in S are digits\n\
6837 and there is at least one character in S, False otherwise.");
6838 
6839 static PyObject*
6840 unicode_isdigit(PyUnicodeObject *self)
6841 {
6842     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6843     register const Py_UNICODE *e;
6844 
6845     /* Shortcut for single character strings */
6846     if (PyUnicode_GET_SIZE(self) == 1 &&
6847         Py_UNICODE_ISDIGIT(*p))
6848         return PyBool_FromLong(1);
6849 
6850     /* Special case for empty strings */
6851     if (PyUnicode_GET_SIZE(self) == 0)
6852         return PyBool_FromLong(0);
6853 
6854     e = p + PyUnicode_GET_SIZE(self);
6855     for (; p < e; p++) {
6856         if (!Py_UNICODE_ISDIGIT(*p))
6857             return PyBool_FromLong(0);
6858     }
6859     return PyBool_FromLong(1);
6860 }
6861 
6862 PyDoc_STRVAR(isnumeric__doc__,
6863              "S.isnumeric() -> bool\n\
6864 \n\
6865 Return True if there are only numeric characters in S,\n\
6866 False otherwise.");
6867 
6868 static PyObject*
6869 unicode_isnumeric(PyUnicodeObject *self)
6870 {
6871     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872     register const Py_UNICODE *e;
6873 
6874     /* Shortcut for single character strings */
6875     if (PyUnicode_GET_SIZE(self) == 1 &&
6876         Py_UNICODE_ISNUMERIC(*p))
6877         return PyBool_FromLong(1);
6878 
6879     /* Special case for empty strings */
6880     if (PyUnicode_GET_SIZE(self) == 0)
6881         return PyBool_FromLong(0);
6882 
6883     e = p + PyUnicode_GET_SIZE(self);
6884     for (; p < e; p++) {
6885         if (!Py_UNICODE_ISNUMERIC(*p))
6886             return PyBool_FromLong(0);
6887     }
6888     return PyBool_FromLong(1);
6889 }
6890 
6891 PyDoc_STRVAR(join__doc__,
6892              "S.join(iterable) -> unicode\n\
6893 \n\
6894 Return a string which is the concatenation of the strings in the\n\
6895 iterable.  The separator between elements is S.");
6896 
6897 static PyObject*
6898 unicode_join(PyObject *self, PyObject *data)
6899 {
6900     return PyUnicode_Join(self, data);
6901 }
6902 
6903 static Py_ssize_t
6904 unicode_length(PyUnicodeObject *self)
6905 {
6906     return self->length;
6907 }
6908 
6909 PyDoc_STRVAR(ljust__doc__,
6910              "S.ljust(width[, fillchar]) -> int\n\
6911 \n\
6912 Return S left-justified in a Unicode string of length width. Padding is\n\
6913 done using the specified fill character (default is a space).");
6914 
6915 static PyObject *
6916 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6917 {
6918     Py_ssize_t width;
6919     Py_UNICODE fillchar = ' ';
6920 
6921     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6922         return NULL;
6923 
6924     if (self->length >= width && PyUnicode_CheckExact(self)) {
6925         Py_INCREF(self);
6926         return (PyObject*) self;
6927     }
6928 
6929     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6930 }
6931 
6932 PyDoc_STRVAR(lower__doc__,
6933              "S.lower() -> unicode\n\
6934 \n\
6935 Return a copy of the string S converted to lowercase.");
6936 
6937 static PyObject*
6938 unicode_lower(PyUnicodeObject *self)
6939 {
6940     return fixup(self, fixlower);
6941 }
6942 
/* Strip modes passed as `striptype` to _PyUnicode_XStrip(), do_strip()
   and do_argstrip().  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Format strings handed to PyArg_ParseTuple() in do_argstrip(), one per
   strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6951 
6952 /* externally visible for str.strip(unicode) */
6953 PyObject *
6954 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6955 {
6956     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6957     Py_ssize_t len = PyUnicode_GET_SIZE(self);
6958     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6959     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6960     Py_ssize_t i, j;
6961 
6962     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6963 
6964     i = 0;
6965     if (striptype != RIGHTSTRIP) {
6966         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6967             i++;
6968         }
6969     }
6970 
6971     j = len;
6972     if (striptype != LEFTSTRIP) {
6973         do {
6974             j--;
6975         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6976         j++;
6977     }
6978 
6979     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6980         Py_INCREF(self);
6981         return (PyObject*)self;
6982     }
6983     else
6984         return PyUnicode_FromUnicode(s+i, j-i);
6985 }
6986 
6987 
6988 static PyObject *
6989 do_strip(PyUnicodeObject *self, int striptype)
6990 {
6991     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6992     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6993 
6994     i = 0;
6995     if (striptype != RIGHTSTRIP) {
6996         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6997             i++;
6998         }
6999     }
7000 
7001     j = len;
7002     if (striptype != LEFTSTRIP) {
7003         do {
7004             j--;
7005         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7006         j++;
7007     }
7008 
7009     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7010         Py_INCREF(self);
7011         return (PyObject*)self;
7012     }
7013     else
7014         return PyUnicode_FromUnicode(s+i, j-i);
7015 }
7016 
7017 
7018 static PyObject *
7019 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7020 {
7021     PyObject *sep = NULL;
7022 
7023     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7024         return NULL;
7025 
7026     if (sep != NULL && sep != Py_None) {
7027         if (PyUnicode_Check(sep))
7028             return _PyUnicode_XStrip(self, striptype, sep);
7029         else if (PyString_Check(sep)) {
7030             PyObject *res;
7031             sep = PyUnicode_FromObject(sep);
7032             if (sep==NULL)
7033                 return NULL;
7034             res = _PyUnicode_XStrip(self, striptype, sep);
7035             Py_DECREF(sep);
7036             return res;
7037         }
7038         else {
7039             PyErr_Format(PyExc_TypeError,
7040                          "%s arg must be None, unicode or str",
7041                          STRIPNAME(striptype));
7042             return NULL;
7043         }
7044     }
7045 
7046     return do_strip(self, striptype);
7047 }
7048 
7049 
7050 PyDoc_STRVAR(strip__doc__,
7051              "S.strip([chars]) -> unicode\n\
7052 \n\
7053 Return a copy of the string S with leading and trailing\n\
7054 whitespace removed.\n\
7055 If chars is given and not None, remove characters in chars instead.\n\
7056 If chars is a str, it will be converted to unicode before stripping");
7057 
7058 static PyObject *
7059 unicode_strip(PyUnicodeObject *self, PyObject *args)
7060 {
7061     if (PyTuple_GET_SIZE(args) == 0)
7062         return do_strip(self, BOTHSTRIP); /* Common case */
7063     else
7064         return do_argstrip(self, BOTHSTRIP, args);
7065 }
7066 
7067 
7068 PyDoc_STRVAR(lstrip__doc__,
7069              "S.lstrip([chars]) -> unicode\n\
7070 \n\
7071 Return a copy of the string S with leading whitespace removed.\n\
7072 If chars is given and not None, remove characters in chars instead.\n\
7073 If chars is a str, it will be converted to unicode before stripping");
7074 
7075 static PyObject *
7076 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7077 {
7078     if (PyTuple_GET_SIZE(args) == 0)
7079         return do_strip(self, LEFTSTRIP); /* Common case */
7080     else
7081         return do_argstrip(self, LEFTSTRIP, args);
7082 }
7083 
7084 
7085 PyDoc_STRVAR(rstrip__doc__,
7086              "S.rstrip([chars]) -> unicode\n\
7087 \n\
7088 Return a copy of the string S with trailing whitespace removed.\n\
7089 If chars is given and not None, remove characters in chars instead.\n\
7090 If chars is a str, it will be converted to unicode before stripping");
7091 
7092 static PyObject *
7093 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7094 {
7095     if (PyTuple_GET_SIZE(args) == 0)
7096         return do_strip(self, RIGHTSTRIP); /* Common case */
7097     else
7098         return do_argstrip(self, RIGHTSTRIP, args);
7099 }
7100 
7101 
7102 static PyObject*
7103 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7104 {
7105     PyUnicodeObject *u;
7106     Py_UNICODE *p;
7107     Py_ssize_t nchars;
7108     size_t nbytes;
7109 
7110     if (len < 0)
7111         len = 0;
7112 
7113     if (len == 1 && PyUnicode_CheckExact(str)) {
7114         /* no repeat, return original string */
7115         Py_INCREF(str);
7116         return (PyObject*) str;
7117     }
7118 
7119     /* ensure # of chars needed doesn't overflow int and # of bytes
7120      * needed doesn't overflow size_t
7121      */
7122     nchars = len * str->length;
7123     if (len && nchars / len != str->length) {
7124         PyErr_SetString(PyExc_OverflowError,
7125                         "repeated string is too long");
7126         return NULL;
7127     }
7128     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7129     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7130         PyErr_SetString(PyExc_OverflowError,
7131                         "repeated string is too long");
7132         return NULL;
7133     }
7134     u = _PyUnicode_New(nchars);
7135     if (!u)
7136         return NULL;
7137 
7138     p = u->str;
7139 
7140     if (str->length == 1 && len > 0) {
7141         Py_UNICODE_FILL(p, str->str[0], len);
7142     } else {
7143         Py_ssize_t done = 0; /* number of characters copied this far */
7144         if (done < nchars) {
7145             Py_UNICODE_COPY(p, str->str, str->length);
7146             done = str->length;
7147         }
7148         while (done < nchars) {
7149             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7150             Py_UNICODE_COPY(p+done, p, n);
7151             done += n;
7152         }
7153     }
7154 
7155     return (PyObject*) u;
7156 }
7157 
7158 PyObject *PyUnicode_Replace(PyObject *obj,
7159                             PyObject *subobj,
7160                             PyObject *replobj,
7161                             Py_ssize_t maxcount)
7162 {
7163     PyObject *self;
7164     PyObject *str1;
7165     PyObject *str2;
7166     PyObject *result;
7167 
7168     self = PyUnicode_FromObject(obj);
7169     if (self == NULL)
7170         return NULL;
7171     str1 = PyUnicode_FromObject(subobj);
7172     if (str1 == NULL) {
7173         Py_DECREF(self);
7174         return NULL;
7175     }
7176     str2 = PyUnicode_FromObject(replobj);
7177     if (str2 == NULL) {
7178         Py_DECREF(self);
7179         Py_DECREF(str1);
7180         return NULL;
7181     }
7182     result = replace((PyUnicodeObject *)self,
7183                      (PyUnicodeObject *)str1,
7184                      (PyUnicodeObject *)str2,
7185                      maxcount);
7186     Py_DECREF(self);
7187     Py_DECREF(str1);
7188     Py_DECREF(str2);
7189     return result;
7190 }
7191 
7192 PyDoc_STRVAR(replace__doc__,
7193              "S.replace(old, new[, count]) -> unicode\n\
7194 \n\
7195 Return a copy of S with all occurrences of substring\n\
7196 old replaced by new.  If the optional argument count is\n\
7197 given, only the first count occurrences are replaced.");
7198 
7199 static PyObject*
7200 unicode_replace(PyUnicodeObject *self, PyObject *args)
7201 {
7202     PyUnicodeObject *str1;
7203     PyUnicodeObject *str2;
7204     Py_ssize_t maxcount = -1;
7205     PyObject *result;
7206 
7207     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7208         return NULL;
7209     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7210     if (str1 == NULL)
7211         return NULL;
7212     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7213     if (str2 == NULL) {
7214         Py_DECREF(str1);
7215         return NULL;
7216     }
7217 
7218     result = replace(self, str1, str2, maxcount);
7219 
7220     Py_DECREF(str1);
7221     Py_DECREF(str2);
7222     return result;
7223 }
7224 
7225 static
7226 PyObject *unicode_repr(PyObject *unicode)
7227 {
7228     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7229                                 PyUnicode_GET_SIZE(unicode),
7230                                 1);
7231 }
7232 
7233 PyDoc_STRVAR(rfind__doc__,
7234              "S.rfind(sub [,start [,end]]) -> int\n\
7235 \n\
7236 Return the highest index in S where substring sub is found,\n\
7237 such that sub is contained within S[start:end].  Optional\n\
7238 arguments start and end are interpreted as in slice notation.\n\
7239 \n\
7240 Return -1 on failure.");
7241 
7242 static PyObject *
7243 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7244 {
7245     PyUnicodeObject *substring;
7246     Py_ssize_t start;
7247     Py_ssize_t end;
7248     Py_ssize_t result;
7249 
7250     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7251                                             &start, &end))
7252         return NULL;
7253 
7254     result = stringlib_rfind_slice(
7255         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7256         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7257         start, end
7258         );
7259 
7260     Py_DECREF(substring);
7261 
7262     return PyInt_FromSsize_t(result);
7263 }
7264 
7265 PyDoc_STRVAR(rindex__doc__,
7266              "S.rindex(sub [,start [,end]]) -> int\n\
7267 \n\
7268 Like S.rfind() but raise ValueError when the substring is not found.");
7269 
7270 static PyObject *
7271 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7272 {
7273     PyUnicodeObject *substring;
7274     Py_ssize_t start;
7275     Py_ssize_t end;
7276     Py_ssize_t result;
7277 
7278     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7279                                             &start, &end))
7280         return NULL;
7281 
7282     result = stringlib_rfind_slice(
7283         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7284         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7285         start, end
7286         );
7287 
7288     Py_DECREF(substring);
7289 
7290     if (result < 0) {
7291         PyErr_SetString(PyExc_ValueError, "substring not found");
7292         return NULL;
7293     }
7294     return PyInt_FromSsize_t(result);
7295 }
7296 
7297 PyDoc_STRVAR(rjust__doc__,
7298              "S.rjust(width[, fillchar]) -> unicode\n\
7299 \n\
7300 Return S right-justified in a Unicode string of length width. Padding is\n\
7301 done using the specified fill character (default is a space).");
7302 
7303 static PyObject *
7304 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7305 {
7306     Py_ssize_t width;
7307     Py_UNICODE fillchar = ' ';
7308 
7309     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7310         return NULL;
7311 
7312     if (self->length >= width && PyUnicode_CheckExact(self)) {
7313         Py_INCREF(self);
7314         return (PyObject*) self;
7315     }
7316 
7317     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7318 }
7319 
7320 static PyObject*
7321 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7322 {
7323     /* standard clamping */
7324     if (start < 0)
7325         start = 0;
7326     if (end < 0)
7327         end = 0;
7328     if (end > self->length)
7329         end = self->length;
7330     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7331         /* full slice, return original string */
7332         Py_INCREF(self);
7333         return (PyObject*) self;
7334     }
7335     if (start > end)
7336         start = end;
7337     /* copy slice */
7338     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7339                                              end - start);
7340 }
7341 
7342 PyObject *PyUnicode_Split(PyObject *s,
7343                           PyObject *sep,
7344                           Py_ssize_t maxsplit)
7345 {
7346     PyObject *result;
7347 
7348     s = PyUnicode_FromObject(s);
7349     if (s == NULL)
7350         return NULL;
7351     if (sep != NULL) {
7352         sep = PyUnicode_FromObject(sep);
7353         if (sep == NULL) {
7354             Py_DECREF(s);
7355             return NULL;
7356         }
7357     }
7358 
7359     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7360 
7361     Py_DECREF(s);
7362     Py_XDECREF(sep);
7363     return result;
7364 }
7365 
7366 PyDoc_STRVAR(split__doc__,
7367              "S.split([sep [,maxsplit]]) -> list of strings\n\
7368 \n\
7369 Return a list of the words in S, using sep as the\n\
7370 delimiter string.  If maxsplit is given, at most maxsplit\n\
7371 splits are done. If sep is not specified or is None, any\n\
7372 whitespace string is a separator and empty strings are\n\
7373 removed from the result.");
7374 
7375 static PyObject*
7376 unicode_split(PyUnicodeObject *self, PyObject *args)
7377 {
7378     PyObject *substring = Py_None;
7379     Py_ssize_t maxcount = -1;
7380 
7381     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7382         return NULL;
7383 
7384     if (substring == Py_None)
7385         return split(self, NULL, maxcount);
7386     else if (PyUnicode_Check(substring))
7387         return split(self, (PyUnicodeObject *)substring, maxcount);
7388     else
7389         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7390 }
7391 
7392 PyObject *
7393 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7394 {
7395     PyObject* str_obj;
7396     PyObject* sep_obj;
7397     PyObject* out;
7398 
7399     str_obj = PyUnicode_FromObject(str_in);
7400     if (!str_obj)
7401         return NULL;
7402     sep_obj = PyUnicode_FromObject(sep_in);
7403     if (!sep_obj) {
7404         Py_DECREF(str_obj);
7405         return NULL;
7406     }
7407 
7408     out = stringlib_partition(
7409         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7410         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7411         );
7412 
7413     Py_DECREF(sep_obj);
7414     Py_DECREF(str_obj);
7415 
7416     return out;
7417 }
7418 
7419 
7420 PyObject *
7421 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7422 {
7423     PyObject* str_obj;
7424     PyObject* sep_obj;
7425     PyObject* out;
7426 
7427     str_obj = PyUnicode_FromObject(str_in);
7428     if (!str_obj)
7429         return NULL;
7430     sep_obj = PyUnicode_FromObject(sep_in);
7431     if (!sep_obj) {
7432         Py_DECREF(str_obj);
7433         return NULL;
7434     }
7435 
7436     out = stringlib_rpartition(
7437         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7438         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7439         );
7440 
7441     Py_DECREF(sep_obj);
7442     Py_DECREF(str_obj);
7443 
7444     return out;
7445 }
7446 
7447 PyDoc_STRVAR(partition__doc__,
7448              "S.partition(sep) -> (head, sep, tail)\n\
7449 \n\
7450 Search for the separator sep in S, and return the part before it,\n\
7451 the separator itself, and the part after it.  If the separator is not\n\
7452 found, return S and two empty strings.");
7453 
7454 static PyObject*
7455 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7456 {
7457     return PyUnicode_Partition((PyObject *)self, separator);
7458 }
7459 
7460 PyDoc_STRVAR(rpartition__doc__,
7461              "S.rpartition(sep) -> (head, sep, tail)\n\
7462 \n\
7463 Search for the separator sep in S, starting at the end of S, and return\n\
7464 the part before it, the separator itself, and the part after it.  If the\n\
7465 separator is not found, return two empty strings and S.");
7466 
7467 static PyObject*
7468 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7469 {
7470     return PyUnicode_RPartition((PyObject *)self, separator);
7471 }
7472 
7473 PyObject *PyUnicode_RSplit(PyObject *s,
7474                            PyObject *sep,
7475                            Py_ssize_t maxsplit)
7476 {
7477     PyObject *result;
7478 
7479     s = PyUnicode_FromObject(s);
7480     if (s == NULL)
7481         return NULL;
7482     if (sep != NULL) {
7483         sep = PyUnicode_FromObject(sep);
7484         if (sep == NULL) {
7485             Py_DECREF(s);
7486             return NULL;
7487         }
7488     }
7489 
7490     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7491 
7492     Py_DECREF(s);
7493     Py_XDECREF(sep);
7494     return result;
7495 }
7496 
7497 PyDoc_STRVAR(rsplit__doc__,
7498              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7499 \n\
7500 Return a list of the words in S, using sep as the\n\
7501 delimiter string, starting at the end of the string and\n\
7502 working to the front.  If maxsplit is given, at most maxsplit\n\
7503 splits are done. If sep is not specified, any whitespace string\n\
7504 is a separator.");
7505 
7506 static PyObject*
7507 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7508 {
7509     PyObject *substring = Py_None;
7510     Py_ssize_t maxcount = -1;
7511 
7512     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7513         return NULL;
7514 
7515     if (substring == Py_None)
7516         return rsplit(self, NULL, maxcount);
7517     else if (PyUnicode_Check(substring))
7518         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7519     else
7520         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7521 }
7522 
7523 PyDoc_STRVAR(splitlines__doc__,
7524              "S.splitlines([keepends]) -> list of strings\n\
7525 \n\
7526 Return a list of the lines in S, breaking at line boundaries.\n\
7527 Line breaks are not included in the resulting list unless keepends\n\
7528 is given and true.");
7529 
7530 static PyObject*
7531 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7532 {
7533     int keepends = 0;
7534 
7535     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7536         return NULL;
7537 
7538     return PyUnicode_Splitlines((PyObject *)self, keepends);
7539 }
7540 
7541 static
7542 PyObject *unicode_str(PyUnicodeObject *self)
7543 {
7544     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7545 }
7546 
7547 PyDoc_STRVAR(swapcase__doc__,
7548              "S.swapcase() -> unicode\n\
7549 \n\
7550 Return a copy of S with uppercase characters converted to lowercase\n\
7551 and vice versa.");
7552 
7553 static PyObject*
7554 unicode_swapcase(PyUnicodeObject *self)
7555 {
7556     return fixup(self, fixswapcase);
7557 }
7558 
7559 PyDoc_STRVAR(translate__doc__,
7560              "S.translate(table) -> unicode\n\
7561 \n\
7562 Return a copy of the string S, where all characters have been mapped\n\
7563 through the given translation table, which must be a mapping of\n\
7564 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7565 Unmapped characters are left untouched. Characters mapped to None\n\
7566 are deleted.");
7567 
7568 static PyObject*
7569 unicode_translate(PyUnicodeObject *self, PyObject *table)
7570 {
7571     return PyUnicode_TranslateCharmap(self->str,
7572                                       self->length,
7573                                       table,
7574                                       "ignore");
7575 }
7576 
7577 PyDoc_STRVAR(upper__doc__,
7578              "S.upper() -> unicode\n\
7579 \n\
7580 Return a copy of S converted to uppercase.");
7581 
7582 static PyObject*
7583 unicode_upper(PyUnicodeObject *self)
7584 {
7585     return fixup(self, fixupper);
7586 }
7587 
7588 PyDoc_STRVAR(zfill__doc__,
7589              "S.zfill(width) -> unicode\n\
7590 \n\
7591 Pad a numeric string S with zeros on the left, to fill a field\n\
7592 of the specified width. The string S is never truncated.");
7593 
7594 static PyObject *
7595 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7596 {
7597     Py_ssize_t fill;
7598     PyUnicodeObject *u;
7599 
7600     Py_ssize_t width;
7601     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7602         return NULL;
7603 
7604     if (self->length >= width) {
7605         if (PyUnicode_CheckExact(self)) {
7606             Py_INCREF(self);
7607             return (PyObject*) self;
7608         }
7609         else
7610             return PyUnicode_FromUnicode(
7611                 PyUnicode_AS_UNICODE(self),
7612                 PyUnicode_GET_SIZE(self)
7613                 );
7614     }
7615 
7616     fill = width - self->length;
7617 
7618     u = pad(self, fill, 0, '0');
7619 
7620     if (u == NULL)
7621         return NULL;
7622 
7623     if (u->str[fill] == '+' || u->str[fill] == '-') {
7624         /* move sign to beginning of string */
7625         u->str[0] = u->str[fill];
7626         u->str[fill] = '0';
7627     }
7628 
7629     return (PyObject*) u;
7630 }
7631 
/* Compiled out by the #if 0 guard.  When enabled, this helper returns the
   file-level counter `numfree` as a Python int. */
7632 #if 0
7633 static PyObject*
7634 free_listsize(PyUnicodeObject *self)
7635 {
7636     return PyInt_FromLong(numfree);
7637 }
7638 #endif
7639 
7640 PyDoc_STRVAR(startswith__doc__,
7641              "S.startswith(prefix[, start[, end]]) -> bool\n\
7642 \n\
7643 Return True if S starts with the specified prefix, False otherwise.\n\
7644 With optional start, test S beginning at that position.\n\
7645 With optional end, stop comparing S at that position.\n\
7646 prefix can also be a tuple of strings to try.");
7647 
7648 static PyObject *
7649 unicode_startswith(PyUnicodeObject *self,
7650                    PyObject *args)
7651 {
7652     PyObject *subobj;
7653     PyUnicodeObject *substring;
7654     Py_ssize_t start = 0;
7655     Py_ssize_t end = PY_SSIZE_T_MAX;
7656     int result;
7657 
7658     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7659         return NULL;
7660     if (PyTuple_Check(subobj)) {
7661         Py_ssize_t i;
7662         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7663             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7664                 PyTuple_GET_ITEM(subobj, i));
7665             if (substring == NULL)
7666                 return NULL;
7667             result = tailmatch(self, substring, start, end, -1);
7668             Py_DECREF(substring);
7669             if (result) {
7670                 Py_RETURN_TRUE;
7671             }
7672         }
7673         /* nothing matched */
7674         Py_RETURN_FALSE;
7675     }
7676     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7677     if (substring == NULL) {
7678         if (PyErr_ExceptionMatches(PyExc_TypeError))
7679             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7680                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7681         return NULL;
7682     }
7683     result = tailmatch(self, substring, start, end, -1);
7684     Py_DECREF(substring);
7685     return PyBool_FromLong(result);
7686 }
7687 
7688 
7689 PyDoc_STRVAR(endswith__doc__,
7690              "S.endswith(suffix[, start[, end]]) -> bool\n\
7691 \n\
7692 Return True if S ends with the specified suffix, False otherwise.\n\
7693 With optional start, test S beginning at that position.\n\
7694 With optional end, stop comparing S at that position.\n\
7695 suffix can also be a tuple of strings to try.");
7696 
7697 static PyObject *
7698 unicode_endswith(PyUnicodeObject *self,
7699                  PyObject *args)
7700 {
7701     PyObject *subobj;
7702     PyUnicodeObject *substring;
7703     Py_ssize_t start = 0;
7704     Py_ssize_t end = PY_SSIZE_T_MAX;
7705     int result;
7706 
7707     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7708         return NULL;
7709     if (PyTuple_Check(subobj)) {
7710         Py_ssize_t i;
7711         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7712             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7713                 PyTuple_GET_ITEM(subobj, i));
7714             if (substring == NULL)
7715                 return NULL;
7716             result = tailmatch(self, substring, start, end, +1);
7717             Py_DECREF(substring);
7718             if (result) {
7719                 Py_RETURN_TRUE;
7720             }
7721         }
7722         Py_RETURN_FALSE;
7723     }
7724     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7725     if (substring == NULL) {
7726         if (PyErr_ExceptionMatches(PyExc_TypeError))
7727             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7728                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7729         return NULL;
7730     }
7731     result = tailmatch(self, substring, start, end, +1);
7732     Py_DECREF(substring);
7733     return PyBool_FromLong(result);
7734 }
7735 
7736 
7737 /* Implements do_string_format, which is unicode because of stringlib */
7738 #include "stringlib/string_format.h"
7739 
7740 PyDoc_STRVAR(format__doc__,
7741              "S.format(*args, **kwargs) -> unicode\n\
7742 \n\
7743 Return a formatted version of S, using substitutions from args and kwargs.\n\
7744 The substitutions are identified by braces ('{' and '}').");
7745 
7746 static PyObject *
7747 unicode__format__(PyObject *self, PyObject *args)
7748 {
7749     PyObject *format_spec;
7750     PyObject *result = NULL;
7751     PyObject *tmp = NULL;
7752 
7753     /* If 2.x, convert format_spec to the same type as value */
7754     /* This is to allow things like u''.format('') */
7755     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7756         goto done;
7757     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7758         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7759                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7760         goto done;
7761     }
7762     tmp = PyObject_Unicode(format_spec);
7763     if (tmp == NULL)
7764         goto done;
7765     format_spec = tmp;
7766 
7767     result = _PyUnicode_FormatAdvanced(self,
7768                                        PyUnicode_AS_UNICODE(format_spec),
7769                                        PyUnicode_GET_SIZE(format_spec));
7770   done:
7771     Py_XDECREF(tmp);
7772     return result;
7773 }
7774 
7775 PyDoc_STRVAR(p_format__doc__,
7776              "S.__format__(format_spec) -> unicode\n\
7777 \n\
7778 Return a formatted version of S as described by format_spec.");
7779 
7780 static PyObject *
7781 unicode__sizeof__(PyUnicodeObject *v)
7782 {
7783     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7784                              sizeof(Py_UNICODE) * (v->length + 1));
7785 }
7786 
7787 PyDoc_STRVAR(sizeof__doc__,
7788              "S.__sizeof__() -> size of S in memory, in bytes\n\
7789 \n\
7790 ");
7791 
7792 static PyObject *
7793 unicode_getnewargs(PyUnicodeObject *v)
7794 {
7795     return Py_BuildValue("(u#)", v->str, v->length);
7796 }
7797 
7798 
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, the calling-convention flags and, where
   present, the docstring; entries with only three initializers leave the
   docstring field zero-initialized.  The table ends with a {NULL, NULL}
   sentinel.
   NOTE(review): presumably installed as the type's tp_methods -- the type
   object is not part of this section, confirm there. */
7799 static PyMethodDef unicode_methods[] = {
7800 
7801     /* Order is according to common usage: often used methods should
7802        appear first, since lookup is done sequentially. */
7803 
7804     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7805     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7806     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7807     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7808     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7809     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7810     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7811     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7812     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7813     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7814     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7815     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7816     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7817     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7818     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7819     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7820     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7821 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7822     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7823     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7824     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7825     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7826     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7827     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7828     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7829     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7830     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7831     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7832     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7833     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7834     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7835     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7836     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7837     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7838     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7839     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7840     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7841     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7842     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7843     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
     /* "format" is implemented by do_string_format; "__format__" uses
        p_format__doc__, not format__doc__. */
7844     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7845     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
     /* The two _formatter_* helpers are registered without a docstring. */
7846     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7847     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7848     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7849 #if 0
7850     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7851 #endif
7852 
7853 #if 0
7854     /* This one is just used for debugging the implementation. */
7855     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7856 #endif
7857 
7858     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
     /* sentinel: marks the end of the table */
7859     {NULL, NULL}
7860 };
7861 
7862 static PyObject *
7863 unicode_mod(PyObject *v, PyObject *w)
7864 {
7865     if (!PyUnicode_Check(v)) {
7866         Py_INCREF(Py_NotImplemented);
7867         return Py_NotImplemented;
7868     }
7869     return PyUnicode_Format(v, w);
7870 }
7871 
/* Number protocol table.  Only nb_remainder is filled in (with
   unicode_mod); the four slots listed before it are explicitly 0 and all
   slots after it are left zero-initialized by the shortened initializer. */
7872 static PyNumberMethods unicode_as_number = {
7873     0,              /*nb_add*/
7874     0,              /*nb_subtract*/
7875     0,              /*nb_multiply*/
7876     0,              /*nb_divide*/
7877     unicode_mod,            /*nb_remainder*/
7878 };
7879 
/* Sequence protocol table.  Length, concatenation, repetition, item
   access, slicing and containment are provided; the two assignment slots
   (sq_ass_item, sq_ass_slice) are 0, so no item or slice assignment handler
   is installed. */
7880 static PySequenceMethods unicode_as_sequence = {
7881     (lenfunc) unicode_length,       /* sq_length */
7882     PyUnicode_Concat,           /* sq_concat */
7883     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
7884     (ssizeargfunc) unicode_getitem,     /* sq_item */
7885     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7886     0,                  /* sq_ass_item */
7887     0,                  /* sq_ass_slice */
7888     PyUnicode_Contains,         /* sq_contains */
7889 };
7890 
7891 static PyObject*
7892 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7893 {
7894     if (PyIndex_Check(item)) {
7895         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7896         if (i == -1 && PyErr_Occurred())
7897             return NULL;
7898         if (i < 0)
7899             i += PyUnicode_GET_SIZE(self);
7900         return unicode_getitem(self, i);
7901     } else if (PySlice_Check(item)) {
7902         Py_ssize_t start, stop, step, slicelength, cur, i;
7903         Py_UNICODE* source_buf;
7904         Py_UNICODE* result_buf;
7905         PyObject* result;
7906 
7907         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7908                                  &start, &stop, &step, &slicelength) < 0) {
7909             return NULL;
7910         }
7911 
7912         if (slicelength <= 0) {
7913             return PyUnicode_FromUnicode(NULL, 0);
7914         } else if (start == 0 && step == 1 && slicelength == self->length &&
7915                    PyUnicode_CheckExact(self)) {
7916             Py_INCREF(self);
7917             return (PyObject *)self;
7918         } else if (step == 1) {
7919             return PyUnicode_FromUnicode(self->str + start, slicelength);
7920         } else {
7921             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7922             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7923                                                        sizeof(Py_UNICODE));
7924 
7925             if (result_buf == NULL)
7926                 return PyErr_NoMemory();
7927 
7928             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7929                 result_buf[i] = source_buf[cur];
7930             }
7931 
7932             result = PyUnicode_FromUnicode(result_buf, slicelength);
7933             PyObject_FREE(result_buf);
7934             return result;
7935         }
7936     } else {
7937         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7938         return NULL;
7939     }
7940 }
7941 
/* Mapping protocol table: length and subscript (index or slice) are
   provided; mp_ass_subscript is 0, so no subscript-assignment handler is
   installed. */
7942 static PyMappingMethods unicode_as_mapping = {
7943     (lenfunc)unicode_length,        /* mp_length */
7944     (binaryfunc)unicode_subscript,  /* mp_subscript */
7945     (objobjargproc)0,           /* mp_ass_subscript */
7946 };
7947 
7948 static Py_ssize_t
7949 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7950                           Py_ssize_t index,
7951                           const void **ptr)
7952 {
7953     if (index != 0) {
7954         PyErr_SetString(PyExc_SystemError,
7955                         "accessing non-existent unicode segment");
7956         return -1;
7957     }
7958     *ptr = (void *) self->str;
7959     return PyUnicode_GET_DATA_SIZE(self);
7960 }
7961 
7962 static Py_ssize_t
7963 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7964                            const void **ptr)
7965 {
7966     PyErr_SetString(PyExc_TypeError,
7967                     "cannot use unicode as modifiable buffer");
7968     return -1;
7969 }
7970 
7971 static int
7972 unicode_buffer_getsegcount(PyUnicodeObject *self,
7973                            Py_ssize_t *lenp)
7974 {
7975     if (lenp)
7976         *lenp = PyUnicode_GET_DATA_SIZE(self);
7977     return 1;
7978 }
7979 
7980 static Py_ssize_t
7981 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7982                           Py_ssize_t index,
7983                           const void **ptr)
7984 {
7985     PyObject *str;
7986 
7987     if (index != 0) {
7988         PyErr_SetString(PyExc_SystemError,
7989                         "accessing non-existent unicode segment");
7990         return -1;
7991     }
7992     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7993     if (str == NULL)
7994         return -1;
7995     *ptr = (void *) PyString_AS_STRING(str);
7996     return PyString_GET_SIZE(str);
7997 }
7998 
7999 /* Helpers for PyUnicode_Format() */
8000 
8001 static PyObject *
8002 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8003 {
8004     Py_ssize_t argidx = *p_argidx;
8005     if (argidx < arglen) {
8006         (*p_argidx)++;
8007         if (arglen < 0)
8008             return args;
8009         else
8010             return PyTuple_GetItem(args, argidx);
8011     }
8012     PyErr_SetString(PyExc_TypeError,
8013                     "not enough arguments for format string");
8014     return NULL;
8015 }
8016 
/* Bit flags passed in the `flags` argument of the format helpers below.
   F_ALT is the one tested in this part of the file: formatfloat() maps it
   to Py_DTSF_ALT and formatint() emits '#' (or its own 0x/0X prefix) when
   it is set.
   NOTE(review): the other four presumably mirror the printf-style '-',
   '+', ' ' and '0' flags -- confirm where PyUnicode_Format() sets them. */
8017 #define F_LJUST (1<<0)
8018 #define F_SIGN  (1<<1)
8019 #define F_BLANK (1<<2)
8020 #define F_ALT   (1<<3)
8021 #define F_ZERO  (1<<4)
8022 
8023 static Py_ssize_t
8024 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8025 {
8026     register Py_ssize_t i;
8027     Py_ssize_t len = strlen(charbuffer);
8028     for (i = len - 1; i >= 0; i--)
8029         buffer[i] = (Py_UNICODE) charbuffer[i];
8030 
8031     return len;
8032 }
8033 
8034 static int
8035 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8036 {
8037     Py_ssize_t result;
8038 
8039     PyOS_snprintf((char *)buffer, len, format, x);
8040     result = strtounicode(buffer, (char *)buffer);
8041     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8042 }
8043 
8044 /* XXX To save some code duplication, formatfloat/long/int could have been
8045    shared with stringobject.c, converting from 8-bit to Unicode after the
8046    formatting is done. */
8047 
8048 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8049 
8050 static PyObject *
8051 formatfloat(PyObject *v, int flags, int prec, int type)
8052 {
8053     char *p;
8054     PyObject *result;
8055     double x;
8056 
8057     x = PyFloat_AsDouble(v);
8058     if (x == -1.0 && PyErr_Occurred())
8059         return NULL;
8060 
8061     if (prec < 0)
8062         prec = 6;
8063 
8064     p = PyOS_double_to_string(x, type, prec,
8065                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8066     if (p == NULL)
8067         return NULL;
8068     result = PyUnicode_FromStringAndSize(p, strlen(p));
8069     PyMem_Free(p);
8070     return result;
8071 }
8072 
8073 static PyObject*
8074 formatlong(PyObject *val, int flags, int prec, int type)
8075 {
8076     char *buf;
8077     int i, len;
8078     PyObject *str; /* temporary string object. */
8079     PyUnicodeObject *result;
8080 
8081     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8082     if (!str)
8083         return NULL;
8084     result = _PyUnicode_New(len);
8085     if (!result) {
8086         Py_DECREF(str);
8087         return NULL;
8088     }
8089     for (i = 0; i < len; i++)
8090         result->str[i] = buf[i];
8091     result->str[len] = 0;
8092     Py_DECREF(str);
8093     return (PyObject*)result;
8094 }
8095 
8096 static int
8097 formatint(Py_UNICODE *buf,
8098           size_t buflen,
8099           int flags,
8100           int prec,
8101           int type,
8102           PyObject *v)
8103 {
8104     /* fmt = '%#.' + `prec` + 'l' + `type`
8105      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8106      *                     + 1 + 1
8107      *                   = 24
8108      */
8109     char fmt[64]; /* plenty big enough! */
8110     char *sign;
8111     long x;
8112 
8113     x = PyInt_AsLong(v);
8114     if (x == -1 && PyErr_Occurred())
8115         return -1;
8116     if (x < 0 && type == 'u') {
8117         type = 'd';
8118     }
8119     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8120         sign = "-";
8121     else
8122         sign = "";
8123     if (prec < 0)
8124         prec = 1;
8125 
8126     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8127      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8128      */
8129     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8130         PyErr_SetString(PyExc_OverflowError,
8131                         "formatted integer is too long (precision too large?)");
8132         return -1;
8133     }
8134 
8135     if ((flags & F_ALT) &&
8136         (type == 'x' || type == 'X')) {
8137         /* When converting under %#x or %#X, there are a number
8138          * of issues that cause pain:
8139          * - when 0 is being converted, the C standard leaves off
8140          *   the '0x' or '0X', which is inconsistent with other
8141          *   %#x/%#X conversions and inconsistent with Python's
8142          *   hex() function
8143          * - there are platforms that violate the standard and
8144          *   convert 0 with the '0x' or '0X'
8145          *   (Metrowerks, Compaq Tru64)
8146          * - there are platforms that give '0x' when converting
8147          *   under %#X, but convert 0 in accordance with the
8148          *   standard (OS/2 EMX)
8149          *
8150          * We can achieve the desired consistency by inserting our
8151          * own '0x' or '0X' prefix, and substituting %x/%X in place
8152          * of %#x/%#X.
8153          *
8154          * Note that this is the same approach as used in
8155          * formatint() in stringobject.c
8156          */
8157         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8158                       sign, type, prec, type);
8159     }
8160     else {
8161         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8162                       sign, (flags&F_ALT) ? "#" : "",
8163                       prec, type);
8164     }
8165     if (sign[0])
8166         return longtounicode(buf, buflen, fmt, -x);
8167     else
8168         return longtounicode(buf, buflen, fmt, x);
8169 }
8170 
8171 static int
8172 formatchar(Py_UNICODE *buf,
8173            size_t buflen,
8174            PyObject *v)
8175 {
8176     PyObject *unistr;
8177     char *str;
8178     /* presume that the buffer is at least 2 characters long */
8179     if (PyUnicode_Check(v)) {
8180         if (PyUnicode_GET_SIZE(v) != 1)
8181             goto onError;
8182         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8183     }
8184 
8185     else if (PyString_Check(v)) {
8186         if (PyString_GET_SIZE(v) != 1)
8187             goto onError;
8188         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8189            with a UnicodeDecodeError if 'char' is not decodable with the
8190            default encoding (usually ASCII, but it might be something else) */
8191         str = PyString_AS_STRING(v);
8192         if ((unsigned char)str[0] > 0x7F) {
8193             /* the char is not ASCII; try to decode the string using the
8194                default encoding and return -1 to let the UnicodeDecodeError
8195                be raised if the string can't be decoded */
8196             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8197             if (unistr == NULL)
8198                 return -1;
8199             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8200             Py_DECREF(unistr);
8201         }
8202         else
8203             buf[0] = (Py_UNICODE)str[0];
8204     }
8205 
8206     else {
8207         /* Integer input truncated to a character */
8208         long x;
8209         x = PyInt_AsLong(v);
8210         if (x == -1 && PyErr_Occurred())
8211             goto onError;
8212 #ifdef Py_UNICODE_WIDE
8213         if (x < 0 || x > 0x10ffff) {
8214             PyErr_SetString(PyExc_OverflowError,
8215                             "%c arg not in range(0x110000) "
8216                             "(wide Python build)");
8217             return -1;
8218         }
8219 #else
8220         if (x < 0 || x > 0xffff) {
8221             PyErr_SetString(PyExc_OverflowError,
8222                             "%c arg not in range(0x10000) "
8223                             "(narrow Python build)");
8224             return -1;
8225         }
8226 #endif
8227         buf[0] = (Py_UNICODE) x;
8228     }
8229     buf[1] = '\0';
8230     return 1;
8231 
8232   onError:
8233     PyErr_SetString(PyExc_TypeError,
8234                     "%c requires int or char");
8235     return -1;
8236 }
8237 
8238 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8239 
8240    FORMATBUFLEN is the length of the buffer in which the ints &
8241    chars are formatted. XXX This is a magic number. Each formatting
8242    routine does bounds checking to ensure no overflow, but a better
8243    solution may be to malloc a buffer of appropriate size for each
8244    format. For now, the current solution is sufficient.
8245 */
8246 #define FORMATBUFLEN (size_t)120
8247 
8248 PyObject *PyUnicode_Format(PyObject *format,
8249                            PyObject *args)
8250 {
8251     Py_UNICODE *fmt, *res;
8252     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8253     int args_owned = 0;
8254     PyUnicodeObject *result = NULL;
8255     PyObject *dict = NULL;
8256     PyObject *uformat;
8257 
8258     if (format == NULL || args == NULL) {
8259         PyErr_BadInternalCall();
8260         return NULL;
8261     }
8262     uformat = PyUnicode_FromObject(format);
8263     if (uformat == NULL)
8264         return NULL;
8265     fmt = PyUnicode_AS_UNICODE(uformat);
8266     fmtcnt = PyUnicode_GET_SIZE(uformat);
8267 
8268     reslen = rescnt = fmtcnt + 100;
8269     result = _PyUnicode_New(reslen);
8270     if (result == NULL)
8271         goto onError;
8272     res = PyUnicode_AS_UNICODE(result);
8273 
8274     if (PyTuple_Check(args)) {
8275         arglen = PyTuple_Size(args);
8276         argidx = 0;
8277     }
8278     else {
8279         arglen = -1;
8280         argidx = -2;
8281     }
8282     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8283         !PyObject_TypeCheck(args, &PyBaseString_Type))
8284         dict = args;
8285 
8286     while (--fmtcnt >= 0) {
8287         if (*fmt != '%') {
8288             if (--rescnt < 0) {
8289                 rescnt = fmtcnt + 100;
8290                 reslen += rescnt;
8291                 if (_PyUnicode_Resize(&result, reslen) < 0)
8292                     goto onError;
8293                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8294                 --rescnt;
8295             }
8296             *res++ = *fmt++;
8297         }
8298         else {
8299             /* Got a format specifier */
8300             int flags = 0;
8301             Py_ssize_t width = -1;
8302             int prec = -1;
8303             Py_UNICODE c = '\0';
8304             Py_UNICODE fill;
8305             int isnumok;
8306             PyObject *v = NULL;
8307             PyObject *temp = NULL;
8308             Py_UNICODE *pbuf;
8309             Py_UNICODE sign;
8310             Py_ssize_t len;
8311             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8312 
8313             fmt++;
8314             if (*fmt == '(') {
8315                 Py_UNICODE *keystart;
8316                 Py_ssize_t keylen;
8317                 PyObject *key;
8318                 int pcount = 1;
8319 
8320                 if (dict == NULL) {
8321                     PyErr_SetString(PyExc_TypeError,
8322                                     "format requires a mapping");
8323                     goto onError;
8324                 }
8325                 ++fmt;
8326                 --fmtcnt;
8327                 keystart = fmt;
8328                 /* Skip over balanced parentheses */
8329                 while (pcount > 0 && --fmtcnt >= 0) {
8330                     if (*fmt == ')')
8331                         --pcount;
8332                     else if (*fmt == '(')
8333                         ++pcount;
8334                     fmt++;
8335                 }
8336                 keylen = fmt - keystart - 1;
8337                 if (fmtcnt < 0 || pcount > 0) {
8338                     PyErr_SetString(PyExc_ValueError,
8339                                     "incomplete format key");
8340                     goto onError;
8341                 }
8342 #if 0
8343                 /* keys are converted to strings using UTF-8 and
8344                    then looked up since Python uses strings to hold
8345                    variables names etc. in its namespaces and we
8346                    wouldn't want to break common idioms. */
8347                 key = PyUnicode_EncodeUTF8(keystart,
8348                                            keylen,
8349                                            NULL);
8350 #else
8351                 key = PyUnicode_FromUnicode(keystart, keylen);
8352 #endif
8353                 if (key == NULL)
8354                     goto onError;
8355                 if (args_owned) {
8356                     Py_DECREF(args);
8357                     args_owned = 0;
8358                 }
8359                 args = PyObject_GetItem(dict, key);
8360                 Py_DECREF(key);
8361                 if (args == NULL) {
8362                     goto onError;
8363                 }
8364                 args_owned = 1;
8365                 arglen = -1;
8366                 argidx = -2;
8367             }
8368             while (--fmtcnt >= 0) {
8369                 switch (c = *fmt++) {
8370                 case '-': flags |= F_LJUST; continue;
8371                 case '+': flags |= F_SIGN; continue;
8372                 case ' ': flags |= F_BLANK; continue;
8373                 case '#': flags |= F_ALT; continue;
8374                 case '0': flags |= F_ZERO; continue;
8375                 }
8376                 break;
8377             }
8378             if (c == '*') {
8379                 v = getnextarg(args, arglen, &argidx);
8380                 if (v == NULL)
8381                     goto onError;
8382                 if (!PyInt_Check(v)) {
8383                     PyErr_SetString(PyExc_TypeError,
8384                                     "* wants int");
8385                     goto onError;
8386                 }
8387                 width = PyInt_AsLong(v);
8388                 if (width < 0) {
8389                     flags |= F_LJUST;
8390                     width = -width;
8391                 }
8392                 if (--fmtcnt >= 0)
8393                     c = *fmt++;
8394             }
8395             else if (c >= '0' && c <= '9') {
8396                 width = c - '0';
8397                 while (--fmtcnt >= 0) {
8398                     c = *fmt++;
8399                     if (c < '0' || c > '9')
8400                         break;
8401                     if ((width*10) / 10 != width) {
8402                         PyErr_SetString(PyExc_ValueError,
8403                                         "width too big");
8404                         goto onError;
8405                     }
8406                     width = width*10 + (c - '0');
8407                 }
8408             }
8409             if (c == '.') {
8410                 prec = 0;
8411                 if (--fmtcnt >= 0)
8412                     c = *fmt++;
8413                 if (c == '*') {
8414                     v = getnextarg(args, arglen, &argidx);
8415                     if (v == NULL)
8416                         goto onError;
8417                     if (!PyInt_Check(v)) {
8418                         PyErr_SetString(PyExc_TypeError,
8419                                         "* wants int");
8420                         goto onError;
8421                     }
8422                     prec = PyInt_AsLong(v);
8423                     if (prec < 0)
8424                         prec = 0;
8425                     if (--fmtcnt >= 0)
8426                         c = *fmt++;
8427                 }
8428                 else if (c >= '0' && c <= '9') {
8429                     prec = c - '0';
8430                     while (--fmtcnt >= 0) {
8431                         c = *fmt++;
8432                         if (c < '0' || c > '9')
8433                             break;
8434                         if ((prec*10) / 10 != prec) {
8435                             PyErr_SetString(PyExc_ValueError,
8436                                             "prec too big");
8437                             goto onError;
8438                         }
8439                         prec = prec*10 + (c - '0');
8440                     }
8441                 }
8442             } /* prec */
8443             if (fmtcnt >= 0) {
8444                 if (c == 'h' || c == 'l' || c == 'L') {
8445                     if (--fmtcnt >= 0)
8446                         c = *fmt++;
8447                 }
8448             }
8449             if (fmtcnt < 0) {
8450                 PyErr_SetString(PyExc_ValueError,
8451                                 "incomplete format");
8452                 goto onError;
8453             }
8454             if (c != '%') {
8455                 v = getnextarg(args, arglen, &argidx);
8456                 if (v == NULL)
8457                     goto onError;
8458             }
8459             sign = 0;
8460             fill = ' ';
8461             switch (c) {
8462 
8463             case '%':
8464                 pbuf = formatbuf;
8465                 /* presume that buffer length is at least 1 */
8466                 pbuf[0] = '%';
8467                 len = 1;
8468                 break;
8469 
8470             case 's':
8471             case 'r':
8472                 if (PyUnicode_CheckExact(v) && c == 's') {
8473                     temp = v;
8474                     Py_INCREF(temp);
8475                 }
8476                 else {
8477                     PyObject *unicode;
8478                     if (c == 's')
8479                         temp = PyObject_Unicode(v);
8480                     else
8481                         temp = PyObject_Repr(v);
8482                     if (temp == NULL)
8483                         goto onError;
8484                     if (PyUnicode_Check(temp))
8485                         /* nothing to do */;
8486                     else if (PyString_Check(temp)) {
8487                         /* convert to string to Unicode */
8488                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8489                                                    PyString_GET_SIZE(temp),
8490                                                    NULL,
8491                                                    "strict");
8492                         Py_DECREF(temp);
8493                         temp = unicode;
8494                         if (temp == NULL)
8495                             goto onError;
8496                     }
8497                     else {
8498                         Py_DECREF(temp);
8499                         PyErr_SetString(PyExc_TypeError,
8500                                         "%s argument has non-string str()");
8501                         goto onError;
8502                     }
8503                 }
8504                 pbuf = PyUnicode_AS_UNICODE(temp);
8505                 len = PyUnicode_GET_SIZE(temp);
8506                 if (prec >= 0 && len > prec)
8507                     len = prec;
8508                 break;
8509 
8510             case 'i':
8511             case 'd':
8512             case 'u':
8513             case 'o':
8514             case 'x':
8515             case 'X':
8516                 if (c == 'i')
8517                     c = 'd';
8518                 isnumok = 0;
8519                 if (PyNumber_Check(v)) {
8520                     PyObject *iobj=NULL;
8521 
8522                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8523                         iobj = v;
8524                         Py_INCREF(iobj);
8525                     }
8526                     else {
8527                         iobj = PyNumber_Int(v);
8528                         if (iobj==NULL) iobj = PyNumber_Long(v);
8529                     }
8530                     if (iobj!=NULL) {
8531                         if (PyInt_Check(iobj)) {
8532                             isnumok = 1;
8533                             pbuf = formatbuf;
8534                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8535                                             flags, prec, c, iobj);
8536                             Py_DECREF(iobj);
8537                             if (len < 0)
8538                                 goto onError;
8539                             sign = 1;
8540                         }
8541                         else if (PyLong_Check(iobj)) {
8542                             isnumok = 1;
8543                             temp = formatlong(iobj, flags, prec, c);
8544                             Py_DECREF(iobj);
8545                             if (!temp)
8546                                 goto onError;
8547                             pbuf = PyUnicode_AS_UNICODE(temp);
8548                             len = PyUnicode_GET_SIZE(temp);
8549                             sign = 1;
8550                         }
8551                         else {
8552                             Py_DECREF(iobj);
8553                         }
8554                     }
8555                 }
8556                 if (!isnumok) {
8557                     PyErr_Format(PyExc_TypeError,
8558                                  "%%%c format: a number is required, "
8559                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8560                     goto onError;
8561                 }
8562                 if (flags & F_ZERO)
8563                     fill = '0';
8564                 break;
8565 
8566             case 'e':
8567             case 'E':
8568             case 'f':
8569             case 'F':
8570             case 'g':
8571             case 'G':
8572                 temp = formatfloat(v, flags, prec, c);
8573                 if (temp == NULL)
8574                     goto onError;
8575                 pbuf = PyUnicode_AS_UNICODE(temp);
8576                 len = PyUnicode_GET_SIZE(temp);
8577                 sign = 1;
8578                 if (flags & F_ZERO)
8579                     fill = '0';
8580                 break;
8581 
8582             case 'c':
8583                 pbuf = formatbuf;
8584                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8585                 if (len < 0)
8586                     goto onError;
8587                 break;
8588 
8589             default:
8590                 PyErr_Format(PyExc_ValueError,
8591                              "unsupported format character '%c' (0x%x) "
8592                              "at index %zd",
8593                              (31<=c && c<=126) ? (char)c : '?',
8594                              (int)c,
8595                              (Py_ssize_t)(fmt - 1 -
8596                                           PyUnicode_AS_UNICODE(uformat)));
8597                 goto onError;
8598             }
8599             if (sign) {
8600                 if (*pbuf == '-' || *pbuf == '+') {
8601                     sign = *pbuf++;
8602                     len--;
8603                 }
8604                 else if (flags & F_SIGN)
8605                     sign = '+';
8606                 else if (flags & F_BLANK)
8607                     sign = ' ';
8608                 else
8609                     sign = 0;
8610             }
8611             if (width < len)
8612                 width = len;
8613             if (rescnt - (sign != 0) < width) {
8614                 reslen -= rescnt;
8615                 rescnt = width + fmtcnt + 100;
8616                 reslen += rescnt;
8617                 if (reslen < 0) {
8618                     Py_XDECREF(temp);
8619                     PyErr_NoMemory();
8620                     goto onError;
8621                 }
8622                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8623                     Py_XDECREF(temp);
8624                     goto onError;
8625                 }
8626                 res = PyUnicode_AS_UNICODE(result)
8627                     + reslen - rescnt;
8628             }
8629             if (sign) {
8630                 if (fill != ' ')
8631                     *res++ = sign;
8632                 rescnt--;
8633                 if (width > len)
8634                     width--;
8635             }
8636             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8637                 assert(pbuf[0] == '0');
8638                 assert(pbuf[1] == c);
8639                 if (fill != ' ') {
8640                     *res++ = *pbuf++;
8641                     *res++ = *pbuf++;
8642                 }
8643                 rescnt -= 2;
8644                 width -= 2;
8645                 if (width < 0)
8646                     width = 0;
8647                 len -= 2;
8648             }
8649             if (width > len && !(flags & F_LJUST)) {
8650                 do {
8651                     --rescnt;
8652                     *res++ = fill;
8653                 } while (--width > len);
8654             }
8655             if (fill == ' ') {
8656                 if (sign)
8657                     *res++ = sign;
8658                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8659                     assert(pbuf[0] == '0');
8660                     assert(pbuf[1] == c);
8661                     *res++ = *pbuf++;
8662                     *res++ = *pbuf++;
8663                 }
8664             }
8665             Py_UNICODE_COPY(res, pbuf, len);
8666             res += len;
8667             rescnt -= len;
8668             while (--width >= len) {
8669                 --rescnt;
8670                 *res++ = ' ';
8671             }
8672             if (dict && (argidx < arglen) && c != '%') {
8673                 PyErr_SetString(PyExc_TypeError,
8674                                 "not all arguments converted during string formatting");
8675                 Py_XDECREF(temp);
8676                 goto onError;
8677             }
8678             Py_XDECREF(temp);
8679         } /* '%' */
8680     } /* until end */
8681     if (argidx < arglen && !dict) {
8682         PyErr_SetString(PyExc_TypeError,
8683                         "not all arguments converted during string formatting");
8684         goto onError;
8685     }
8686 
8687     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8688         goto onError;
8689     if (args_owned) {
8690         Py_DECREF(args);
8691     }
8692     Py_DECREF(uformat);
8693     return (PyObject *)result;
8694 
8695   onError:
8696     Py_XDECREF(result);
8697     Py_DECREF(uformat);
8698     if (args_owned) {
8699         Py_DECREF(args);
8700     }
8701     return NULL;
8702 }
8703 
/* Buffer interface slots of the unicode type; referenced from the
   tp_as_buffer entry of PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8710 
8711 static PyObject *
8712 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8713 
8714 static PyObject *
8715 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8716 {
8717     PyObject *x = NULL;
8718     static char *kwlist[] = {"string", "encoding", "errors", 0};
8719     char *encoding = NULL;
8720     char *errors = NULL;
8721 
8722     if (type != &PyUnicode_Type)
8723         return unicode_subtype_new(type, args, kwds);
8724     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8725                                      kwlist, &x, &encoding, &errors))
8726         return NULL;
8727     if (x == NULL)
8728         return (PyObject *)_PyUnicode_New(0);
8729     if (encoding == NULL && errors == NULL)
8730         return PyObject_Unicode(x);
8731     else
8732         return PyUnicode_FromEncodedObject(x, encoding, errors);
8733 }
8734 
8735 static PyObject *
8736 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8737 {
8738     PyUnicodeObject *tmp, *pnew;
8739     Py_ssize_t n;
8740 
8741     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8742     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8743     if (tmp == NULL)
8744         return NULL;
8745     assert(PyUnicode_Check(tmp));
8746     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8747     if (pnew == NULL) {
8748         Py_DECREF(tmp);
8749         return NULL;
8750     }
8751     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8752     if (pnew->str == NULL) {
8753         _Py_ForgetReference((PyObject *)pnew);
8754         PyObject_Del(pnew);
8755         Py_DECREF(tmp);
8756         return PyErr_NoMemory();
8757     }
8758     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8759     pnew->length = n;
8760     pnew->hash = tmp->hash;
8761     Py_DECREF(tmp);
8762     return (PyObject *)pnew;
8763 }
8764 
8765 PyDoc_STRVAR(unicode_doc,
8766              "unicode(string [, encoding[, errors]]) -> object\n\
8767 \n\
8768 Create a new Unicode object from the given encoded string.\n\
8769 encoding defaults to the current default string encoding.\n\
8770 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8771 
/* Type object of the built-in "unicode" type.  The entries below are
   positional; each is labelled with the PyTypeObject slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8815 
8816 /* Initialize the Unicode implementation */
8817 
8818 void _PyUnicode_Init(void)
8819 {
8820     int i;
8821 
8822     /* XXX - move this array to unicodectype.c ? */
8823     Py_UNICODE linebreak[] = {
8824         0x000A, /* LINE FEED */
8825         0x000D, /* CARRIAGE RETURN */
8826         0x001C, /* FILE SEPARATOR */
8827         0x001D, /* GROUP SEPARATOR */
8828         0x001E, /* RECORD SEPARATOR */
8829         0x0085, /* NEXT LINE */
8830         0x2028, /* LINE SEPARATOR */
8831         0x2029, /* PARAGRAPH SEPARATOR */
8832     };
8833 
8834     /* Init the implementation */
8835     free_list = NULL;
8836     numfree = 0;
8837     unicode_empty = _PyUnicode_New(0);
8838     if (!unicode_empty)
8839         return;
8840 
8841     strcpy(unicode_default_encoding, "ascii");
8842     for (i = 0; i < 256; i++)
8843         unicode_latin1[i] = NULL;
8844     if (PyType_Ready(&PyUnicode_Type) < 0)
8845         Py_FatalError("Can't initialize 'unicode'");
8846 
8847     /* initialize the linebreak bloom filter */
8848     bloom_linebreak = make_bloom_mask(
8849         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8850         );
8851 
8852     PyType_Ready(&EncodingMapType);
8853 }
8854 
8855 /* Finalize the Unicode implementation */
8856 
8857 int
8858 PyUnicode_ClearFreeList(void)
8859 {
8860     int freelist_size = numfree;
8861     PyUnicodeObject *u;
8862 
8863     for (u = free_list; u != NULL;) {
8864         PyUnicodeObject *v = u;
8865         u = *(PyUnicodeObject **)u;
8866         if (v->str)
8867             PyObject_DEL(v->str);
8868         Py_XDECREF(v->defenc);
8869         PyObject_Del(v);
8870         numfree--;
8871     }
8872     free_list = NULL;
8873     assert(numfree == 0);
8874     return freelist_size;
8875 }
8876 
8877 void
8878 _PyUnicode_Fini(void)
8879 {
8880     int i;
8881 
8882     Py_XDECREF(unicode_empty);
8883     unicode_empty = NULL;
8884 
8885     for (i = 0; i < 256; i++) {
8886         if (unicode_latin1[i]) {
8887             Py_DECREF(unicode_latin1[i]);
8888             unicode_latin1[i] = NULL;
8889         }
8890     }
8891     (void)PyUnicode_ClearFreeList();
8892 }
8893 
8894 void _PyUnicode_DebugMallocStats(FILE *out)
8895 {
8896     _PyDebugAllocatorStats(out, "free PyUnicodeObject", numfree,
8897                            sizeof(PyUnicodeObject));
8898 }
8899 
8900 #ifdef __cplusplus
8901 }
8902 #endif
Python-2.7.3/Objects/unicodeobject.c