Python-2.7.3/Objects/unicodeobject.c

Location Tool Test ID Function Issue
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17 clang-analyzer Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:919:17 clang-analyzer Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25 clang-analyzer Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:957:25 clang-analyzer Dereference of null pointer (loaded from variable 'callresult')
/builddir/build/BUILD/Python-2.7.3/Objects/unicodeobject.c:2094:9 clang-analyzer Value stored to 'nallocated' is never read
   1 /*
   2 
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6 
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9 
  10 Copyright (c) Corporation for National Research Initiatives.
  11 
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14 
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17 
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21 
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30 
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39 
  40 */
  41 
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44 
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47 
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51 
  52 /* Limit for the Unicode object free list */
  53 
  54 #define PyUnicode_MAXFREELIST       1024
  55 
  56 /* Limit for the Unicode object free list stay alive optimization.
  57 
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61 
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65 
  66    Setting the limit to 0 effectively turns the feature off.
  67 
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70 
  71 */
  72 
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74 
  75 /* Endianness switches; defaults to little endian */
  76 
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82 
  83 /* --- Globals ------------------------------------------------------------
  84 
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87 
  88 */
  89 
  90 
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94 
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98 
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101 
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105 
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108 
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111 
 112 */
 113 static char unicode_default_encoding[100];
 114 
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * CHARACTER TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * LINE TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135 
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145 
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000B, * LINE TABULATION */
 151 /*         0x000C, * FORM FEED */
 152 /*         0x000D, * CARRIAGE RETURN */
 153     0, 0, 1, 1, 1, 1, 0, 0,
 154     0, 0, 0, 0, 0, 0, 0, 0,
 155 /*         0x001C, * FILE SEPARATOR */
 156 /*         0x001D, * GROUP SEPARATOR */
 157 /*         0x001E, * RECORD SEPARATOR */
 158     0, 0, 0, 0, 1, 1, 1, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161     0, 0, 0, 0, 0, 0, 0, 0,
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163 
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0,
 170     0, 0, 0, 0, 0, 0, 0, 0,
 171     0, 0, 0, 0, 0, 0, 0, 0
 172 };
 173 
 174 
 175 Py_UNICODE
 176 PyUnicode_GetMax(void)
 177 {
 178 #ifdef Py_UNICODE_WIDE
 179     return 0x10FFFF;
 180 #else
 181     /* This is actually an illegal character, so it should
 182        not be passed to unichr. */
 183     return 0xFFFF;
 184 #endif
 185 }
 186 
 187 /* --- Bloom Filters ----------------------------------------------------- */
 188 
  189 /* Stuff to implement simple "bloom filters" for Unicode characters.
  190    To keep things simple, we use a single bitmask, using the low-order
  191    bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
 192 
 193 /* the linebreak mask is set up by Unicode_Init below */
 194 
 195 #if LONG_BIT >= 128
 196 #define BLOOM_WIDTH 128
 197 #elif LONG_BIT >= 64
 198 #define BLOOM_WIDTH 64
 199 #elif LONG_BIT >= 32
 200 #define BLOOM_WIDTH 32
 201 #else
 202 #error "LONG_BIT is smaller than 32"
 203 #endif
 204 
 205 #define BLOOM_MASK unsigned long
 206 
 207 static BLOOM_MASK bloom_linebreak;
 208 
 209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 211 
 212 #define BLOOM_LINEBREAK(ch)                                             \
 213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 215 
 216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 217 {
 218     /* calculate simple bloom-style bitmask for a given unicode string */
 219 
 220     BLOOM_MASK mask;
 221     Py_ssize_t i;
 222 
 223     mask = 0;
 224     for (i = 0; i < len; i++)
 225         BLOOM_ADD(mask, ptr[i]);
 226 
 227     return mask;
 228 }
 229 
 230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 231 {
 232     Py_ssize_t i;
 233 
 234     for (i = 0; i < setlen; i++)
 235         if (set[i] == chr)
 236             return 1;
 237 
 238     return 0;
 239 }
 240 
 241 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 242     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 243 
 244 /* --- Unicode Object ----------------------------------------------------- */
 245 
 246 static
 247 int unicode_resize(register PyUnicodeObject *unicode,
 248                    Py_ssize_t length)
 249 {
 250     void *oldstr;
 251 
 252     /* Shortcut if there's nothing much to do. */
 253     if (unicode->length == length)
 254         goto reset;
 255 
 256     /* Resizing shared object (unicode_empty or single character
 257        objects) in-place is not allowed. Use PyUnicode_Resize()
 258        instead ! */
 259 
 260     if (unicode == unicode_empty ||
 261         (unicode->length == 1 &&
 262          unicode->str[0] < 256U &&
 263          unicode_latin1[unicode->str[0]] == unicode)) {
 264         PyErr_SetString(PyExc_SystemError,
 265                         "can't resize shared unicode objects");
 266         return -1;
 267     }
 268 
 269     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 270        The overallocation is also used by fastsearch, which assumes that it's
 271        safe to look at str[length] (without making any assumptions about what
 272        it contains). */
 273 
 274     oldstr = unicode->str;
 275     unicode->str = PyObject_REALLOC(unicode->str,
 276                                     sizeof(Py_UNICODE) * (length + 1));
 277     if (!unicode->str) {
 278         unicode->str = (Py_UNICODE *)oldstr;
 279         PyErr_NoMemory();
 280         return -1;
 281     }
 282     unicode->str[length] = 0;
 283     unicode->length = length;
 284 
 285   reset:
 286     /* Reset the object caches */
 287     if (unicode->defenc) {
 288         Py_CLEAR(unicode->defenc);
 289     }
 290     unicode->hash = -1;
 291 
 292     return 0;
 293 }
 294 
  295 /* We allocate one more Py_UNICODE element to make sure the string is
  296    Ux0000 terminated; some code relies on that.
 297 
 298    XXX This allocator could further be enhanced by assuring that the
 299    free list never reduces its size below 1.
 300 
 301 */
 302 
 303 static
 304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 305 {
 306     register PyUnicodeObject *unicode;
 307 
 308     /* Optimization for empty strings */
 309     if (length == 0 && unicode_empty != NULL) {
 310         Py_INCREF(unicode_empty);
 311         return unicode_empty;
 312     }
 313 
 314     /* Ensure we won't overflow the size. */
 315     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 316         return (PyUnicodeObject *)PyErr_NoMemory();
 317     }
 318 
 319     /* Unicode freelist & memory allocation */
 320     if (free_list) {
 321         unicode = free_list;
 322         free_list = *(PyUnicodeObject **)unicode;
 323         numfree--;
 324         if (unicode->str) {
 325             /* Keep-Alive optimization: we only upsize the buffer,
 326                never downsize it. */
 327             if ((unicode->length < length) &&
 328                 unicode_resize(unicode, length) < 0) {
 329                 PyObject_DEL(unicode->str);
 330                 unicode->str = NULL;
 331             }
 332         }
 333         else {
 334             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 335             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 336         }
 337         PyObject_INIT(unicode, &PyUnicode_Type);
 338     }
 339     else {
 340         size_t new_size;
 341         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 342         if (unicode == NULL)
 343             return NULL;
 344         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 345         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 346     }
 347 
 348     if (!unicode->str) {
 349         PyErr_NoMemory();
 350         goto onError;
 351     }
 352     /* Initialize the first element to guard against cases where
 353      * the caller fails before initializing str -- unicode_resize()
 354      * reads str[0], and the Keep-Alive optimization can keep memory
 355      * allocated for str alive across a call to unicode_dealloc(unicode).
 356      * We don't want unicode_resize to read uninitialized memory in
 357      * that case.
 358      */
 359     unicode->str[0] = 0;
 360     unicode->str[length] = 0;
 361     unicode->length = length;
 362     unicode->hash = -1;
 363     unicode->defenc = NULL;
 364     return unicode;
 365 
 366   onError:
 367     /* XXX UNREF/NEWREF interface should be more symmetrical */
 368     _Py_DEC_REFTOTAL;
 369     _Py_ForgetReference((PyObject *)unicode);
 370     PyObject_Del(unicode);
 371     return NULL;
 372 }
 373 
 374 static
 375 void unicode_dealloc(register PyUnicodeObject *unicode)
 376 {
 377     if (PyUnicode_CheckExact(unicode) &&
 378         numfree < PyUnicode_MAXFREELIST) {
 379         /* Keep-Alive optimization */
 380         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 381             PyObject_DEL(unicode->str);
 382             unicode->str = NULL;
 383             unicode->length = 0;
 384         }
 385         if (unicode->defenc) {
 386             Py_CLEAR(unicode->defenc);
 387         }
 388         /* Add to free list */
 389         *(PyUnicodeObject **)unicode = free_list;
 390         free_list = unicode;
 391         numfree++;
 392     }
 393     else {
 394         PyObject_DEL(unicode->str);
 395         Py_XDECREF(unicode->defenc);
 396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 397     }
 398 }
 399 
 400 static
 401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 402 {
 403     register PyUnicodeObject *v;
 404 
 405     /* Argument checks */
 406     if (unicode == NULL) {
 407         PyErr_BadInternalCall();
 408         return -1;
 409     }
 410     v = *unicode;
 411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 412         PyErr_BadInternalCall();
 413         return -1;
 414     }
 415 
 416     /* Resizing unicode_empty and single character objects is not
 417        possible since these are being shared. We simply return a fresh
 418        copy with the same Unicode content. */
 419     if (v->length != length &&
 420         (v == unicode_empty || v->length == 1)) {
 421         PyUnicodeObject *w = _PyUnicode_New(length);
 422         if (w == NULL)
 423             return -1;
 424         Py_UNICODE_COPY(w->str, v->str,
 425                         length < v->length ? length : v->length);
 426         Py_DECREF(*unicode);
 427         *unicode = w;
 428         return 0;
 429     }
 430 
 431     /* Note that we don't have to modify *unicode for unshared Unicode
 432        objects, since we can modify them in-place. */
 433     return unicode_resize(v, length);
 434 }
 435 
 436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 437 {
 438     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 439 }
 440 
 441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 442                                 Py_ssize_t size)
 443 {
 444     PyUnicodeObject *unicode;
 445 
 446     /* If the Unicode data is known at construction time, we can apply
 447        some optimizations which share commonly used objects. */
 448     if (u != NULL) {
 449 
 450         /* Optimization for empty strings */
 451         if (size == 0 && unicode_empty != NULL) {
 452             Py_INCREF(unicode_empty);
 453             return (PyObject *)unicode_empty;
 454         }
 455 
 456         /* Single character Unicode objects in the Latin-1 range are
 457            shared when using this constructor */
 458         if (size == 1 && *u < 256) {
 459             unicode = unicode_latin1[*u];
 460             if (!unicode) {
 461                 unicode = _PyUnicode_New(1);
 462                 if (!unicode)
 463                     return NULL;
 464                 unicode->str[0] = *u;
 465                 unicode_latin1[*u] = unicode;
 466             }
 467             Py_INCREF(unicode);
 468             return (PyObject *)unicode;
 469         }
 470     }
 471 
 472     unicode = _PyUnicode_New(size);
 473     if (!unicode)
 474         return NULL;
 475 
 476     /* Copy the Unicode data into the new object */
 477     if (u != NULL)
 478         Py_UNICODE_COPY(unicode->str, u, size);
 479 
 480     return (PyObject *)unicode;
 481 }
 482 
 483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 484 {
 485     PyUnicodeObject *unicode;
 486 
 487     if (size < 0) {
 488         PyErr_SetString(PyExc_SystemError,
 489                         "Negative size passed to PyUnicode_FromStringAndSize");
 490         return NULL;
 491     }
 492 
 493     /* If the Unicode data is known at construction time, we can apply
 494        some optimizations which share commonly used objects.
 495        Also, this means the input must be UTF-8, so fall back to the
 496        UTF-8 decoder at the end. */
 497     if (u != NULL) {
 498 
 499         /* Optimization for empty strings */
 500         if (size == 0 && unicode_empty != NULL) {
 501             Py_INCREF(unicode_empty);
 502             return (PyObject *)unicode_empty;
 503         }
 504 
 505         /* Single characters are shared when using this constructor.
 506            Restrict to ASCII, since the input must be UTF-8. */
 507         if (size == 1 && Py_CHARMASK(*u) < 128) {
 508             unicode = unicode_latin1[Py_CHARMASK(*u)];
 509             if (!unicode) {
 510                 unicode = _PyUnicode_New(1);
 511                 if (!unicode)
 512                     return NULL;
 513                 unicode->str[0] = Py_CHARMASK(*u);
 514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 515             }
 516             Py_INCREF(unicode);
 517             return (PyObject *)unicode;
 518         }
 519 
 520         return PyUnicode_DecodeUTF8(u, size, NULL);
 521     }
 522 
 523     unicode = _PyUnicode_New(size);
 524     if (!unicode)
 525         return NULL;
 526 
 527     return (PyObject *)unicode;
 528 }
 529 
 530 PyObject *PyUnicode_FromString(const char *u)
 531 {
 532     size_t size = strlen(u);
 533     if (size > PY_SSIZE_T_MAX) {
 534         PyErr_SetString(PyExc_OverflowError, "input too long");
 535         return NULL;
 536     }
 537 
 538     return PyUnicode_FromStringAndSize(u, size);
 539 }
 540 
 541 #ifdef HAVE_WCHAR_H
 542 
 543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 544 # define CONVERT_WCHAR_TO_SURROGATES
 545 #endif
 546 
 547 #ifdef CONVERT_WCHAR_TO_SURROGATES
 548 
 549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 550    to convert from UTF32 to UTF16. */
 551 
 552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 553                                  Py_ssize_t size)
 554 {
 555     PyUnicodeObject *unicode;
 556     register Py_ssize_t i;
 557     Py_ssize_t alloc;
 558     const wchar_t *orig_w;
 559 
 560     if (w == NULL) {
 561         PyErr_BadInternalCall();
 562         return NULL;
 563     }
 564 
 565     alloc = size;
 566     orig_w = w;
 567     for (i = size; i > 0; i--) {
 568         if (*w > 0xFFFF)
 569             alloc++;
 570         w++;
 571     }
 572     w = orig_w;
 573     unicode = _PyUnicode_New(alloc);
 574     if (!unicode)
 575         return NULL;
 576 
 577     /* Copy the wchar_t data into the new object */
 578     {
 579         register Py_UNICODE *u;
 580         u = PyUnicode_AS_UNICODE(unicode);
 581         for (i = size; i > 0; i--) {
 582             if (*w > 0xFFFF) {
 583                 wchar_t ordinal = *w++;
 584                 ordinal -= 0x10000;
 585                 *u++ = 0xD800 | (ordinal >> 10);
 586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 587             }
 588             else
 589                 *u++ = *w++;
 590         }
 591     }
 592     return (PyObject *)unicode;
 593 }
 594 
 595 #else
 596 
 597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 598                                  Py_ssize_t size)
 599 {
 600     PyUnicodeObject *unicode;
 601 
 602     if (w == NULL) {
 603         PyErr_BadInternalCall();
 604         return NULL;
 605     }
 606 
 607     unicode = _PyUnicode_New(size);
 608     if (!unicode)
 609         return NULL;
 610 
 611     /* Copy the wchar_t data into the new object */
 612 #ifdef HAVE_USABLE_WCHAR_T
 613     memcpy(unicode->str, w, size * sizeof(wchar_t));
 614 #else
 615     {
 616         register Py_UNICODE *u;
 617         register Py_ssize_t i;
 618         u = PyUnicode_AS_UNICODE(unicode);
 619         for (i = size; i > 0; i--)
 620             *u++ = *w++;
 621     }
 622 #endif
 623 
 624     return (PyObject *)unicode;
 625 }
 626 
 627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 628 
 629 #undef CONVERT_WCHAR_TO_SURROGATES
 630 
 631 static void
 632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 633 {
 634     *fmt++ = '%';
 635     if (width) {
 636         if (zeropad)
 637             *fmt++ = '0';
 638         fmt += sprintf(fmt, "%d", width);
 639     }
 640     if (precision)
 641         fmt += sprintf(fmt, ".%d", precision);
 642     if (longflag)
 643         *fmt++ = 'l';
 644     else if (size_tflag) {
 645         char *f = PY_FORMAT_SIZE_T;
 646         while (*f)
 647             *fmt++ = *f++;
 648     }
 649     *fmt++ = c;
 650     *fmt = '\0';
 651 }
 652 
 653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 654 
 655 PyObject *
 656 PyUnicode_FromFormatV(const char *format, va_list vargs)
 657 {
 658     va_list count;
 659     Py_ssize_t callcount = 0;
 660     PyObject **callresults = NULL;
 661     PyObject **callresult = NULL;
 662     Py_ssize_t n = 0;
 663     int width = 0;
 664     int precision = 0;
 665     int zeropad;
 666     const char* f;
 667     Py_UNICODE *s;
 668     PyObject *string;
 669     /* used by sprintf */
 670     char buffer[21];
 671     /* use abuffer instead of buffer, if we need more space
 672      * (which can happen if there's a format specifier with width). */
 673     char *abuffer = NULL;
 674     char *realbuffer;
 675     Py_ssize_t abuffersize = 0;
 676     char fmt[60]; /* should be enough for %0width.precisionld */
 677     const char *copy;
 678 
 679 #ifdef VA_LIST_IS_ARRAY
 680     Py_MEMCPY(count, vargs, sizeof(va_list));
 681 #else
 682 #ifdef  __va_copy
 683     __va_copy(count, vargs);
 684 #else
 685     count = vargs;
 686 #endif
 687 #endif
 688      /* step 1: count the number of %S/%R/%s format specifications
 689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 690       * objects once during step 3 and put the result in an array) */
 691     for (f = format; *f; f++) {
 692          if (*f == '%') {
 693              if (*(f+1)=='%')
 694                  continue;
 695              if (*(f+1)=='S' || *(f+1)=='R')
 696                  ++callcount;
 697              while (isdigit((unsigned)*f))
 698                  width = (width*10) + *f++ - '0';
 699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 700                  ;
 701              if (*f == 's')
 702                  ++callcount;
 703          }
 704     }
 705     /* step 2: allocate memory for the results of
 706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 707     if (callcount) {
 708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 709         if (!callresults) {
 710             PyErr_NoMemory();
 711             return NULL;
 712         }
 713         callresult = callresults;
 714     }
 715     /* step 3: figure out how large a buffer we need */
 716     for (f = format; *f; f++) {
 717         if (*f == '%') {
 718             const char* p = f;
 719             width = 0;
 720             while (isdigit((unsigned)*f))
 721                 width = (width*10) + *f++ - '0';
 722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 723                 ;
 724 
 725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 726              * they don't affect the amount of space we reserve.
 727              */
 728             if ((*f == 'l' || *f == 'z') &&
 729                 (f[1] == 'd' || f[1] == 'u'))
 730                 ++f;
 731 
 732             switch (*f) {
 733             case 'c':
 734                 (void)va_arg(count, int);
 735                 /* fall through... */
 736             case '%':
 737                 n++;
 738                 break;
 739             case 'd': case 'u': case 'i': case 'x':
 740                 (void) va_arg(count, int);
 741                 /* 20 bytes is enough to hold a 64-bit
 742                    integer.  Decimal takes the most space.
 743                    This isn't enough for octal.
 744                    If a width is specified we need more
 745                    (which we allocate later). */
 746                 if (width < 20)
 747                     width = 20;
 748                 n += width;
 749                 if (abuffersize < width)
 750                     abuffersize = width;
 751                 break;
 752             case 's':
 753             {
 754                 /* UTF-8 */
 755                 const char *s = va_arg(count, const char*);
 756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 757                 if (!str)
 758                     goto fail;
 759                 n += PyUnicode_GET_SIZE(str);
 760                 /* Remember the str and switch to the next slot */
 761                 *callresult++ = str;
 762                 break;
 763             }
 764             case 'U':
 765             {
 766                 PyObject *obj = va_arg(count, PyObject *);
 767                 assert(obj && PyUnicode_Check(obj));
 768                 n += PyUnicode_GET_SIZE(obj);
 769                 break;
 770             }
 771             case 'V':
 772             {
 773                 PyObject *obj = va_arg(count, PyObject *);
 774                 const char *str = va_arg(count, const char *);
 775                 assert(obj || str);
 776                 assert(!obj || PyUnicode_Check(obj));
 777                 if (obj)
 778                     n += PyUnicode_GET_SIZE(obj);
 779                 else
 780                     n += strlen(str);
 781                 break;
 782             }
 783             case 'S':
 784             {
 785                 PyObject *obj = va_arg(count, PyObject *);
 786                 PyObject *str;
 787                 assert(obj);
 788                 str = PyObject_Str(obj);
 789                 if (!str)
 790                     goto fail;
 791                 n += PyUnicode_GET_SIZE(str);
 792                 /* Remember the str and switch to the next slot */
 793                 *callresult++ = str;
 794                 break;
 795             }
 796             case 'R':
 797             {
 798                 PyObject *obj = va_arg(count, PyObject *);
 799                 PyObject *repr;
 800                 assert(obj);
 801                 repr = PyObject_Repr(obj);
 802                 if (!repr)
 803                     goto fail;
 804                 n += PyUnicode_GET_SIZE(repr);
 805                 /* Remember the repr and switch to the next slot */
 806                 *callresult++ = repr;
 807                 break;
 808             }
 809             case 'p':
 810                 (void) va_arg(count, int);
 811                 /* maximum 64-bit pointer representation:
 812                  * 0xffffffffffffffff
 813                  * so 19 characters is enough.
 814                  * XXX I count 18 -- what's the extra for?
 815                  */
 816                 n += 19;
 817                 break;
 818             default:
 819                 /* if we stumble upon an unknown
 820                    formatting code, copy the rest of
 821                    the format string to the output
 822                    string. (we cannot just skip the
 823                    code, since there's no way to know
 824                    what's in the argument list) */
 825                 n += strlen(p);
 826                 goto expand;
 827             }
 828         } else
 829             n++;
 830     }
 831   expand:
 832     if (abuffersize > 20) {
 833         abuffer = PyObject_Malloc(abuffersize);
 834         if (!abuffer) {
 835             PyErr_NoMemory();
 836             goto fail;
 837         }
 838         realbuffer = abuffer;
 839     }
 840     else
 841         realbuffer = buffer;
 842     /* step 4: fill the buffer */
 843     /* Since we've analyzed how much space we need for the worst case,
 844        we don't have to resize the string.
 845        There can be no errors beyond this point. */
 846     string = PyUnicode_FromUnicode(NULL, n);
 847     if (!string)
 848         goto fail;
 849 
 850     s = PyUnicode_AS_UNICODE(string);
 851     callresult = callresults;
 852 
 853     for (f = format; *f; f++) {
 854         if (*f == '%') {
 855             const char* p = f++;
 856             int longflag = 0;
 857             int size_tflag = 0;
 858             zeropad = (*f == '0');
 859             /* parse the width.precision part */
 860             width = 0;
 861             while (isdigit((unsigned)*f))
 862                 width = (width*10) + *f++ - '0';
 863             precision = 0;
 864             if (*f == '.') {
 865                 f++;
 866                 while (isdigit((unsigned)*f))
 867                     precision = (precision*10) + *f++ - '0';
 868             }
 869             /* handle the long flag, but only for %ld and %lu.
 870                others can be added when necessary. */
 871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 872                 longflag = 1;
 873                 ++f;
 874             }
 875             /* handle the size_t flag. */
 876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 877                 size_tflag = 1;
 878                 ++f;
 879             }
 880 
 881             switch (*f) {
 882             case 'c':
 883                 *s++ = va_arg(vargs, int);
 884                 break;
 885             case 'd':
 886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 887                 if (longflag)
 888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 889                 else if (size_tflag)
 890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 891                 else
 892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 893                 appendstring(realbuffer);
 894                 break;
 895             case 'u':
 896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 897                 if (longflag)
 898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 899                 else if (size_tflag)
 900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 901                 else
 902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 903                 appendstring(realbuffer);
 904                 break;
 905             case 'i':
 906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 908                 appendstring(realbuffer);
 909                 break;
 910             case 'x':
 911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 913                 appendstring(realbuffer);
 914                 break;
 915             case 's':
 916             {
 917                 /* unused, since we already have the result */
 918                 (void) va_arg(vargs, char *);
 919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
Dereference of null pointer (loaded from variable 'callresult')
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

Dereference of null pointer (loaded from variable 'callresult')
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

920 PyUnicode_GET_SIZE(*callresult)); 921 s += PyUnicode_GET_SIZE(*callresult); 922 /* We're done with the unicode()/repr() => forget it */ 923 Py_DECREF(*callresult); 924 /* switch to next unicode()/repr() result */ 925 ++callresult; 926 break; 927 } 928 case 'U': 929 { 930 PyObject *obj = va_arg(vargs, PyObject *); 931 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 933 s += size; 934 break; 935 } 936 case 'V': 937 { 938 PyObject *obj = va_arg(vargs, PyObject *); 939 const char *str = va_arg(vargs, const char *); 940 if (obj) { 941 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 943 s += size; 944 } else { 945 appendstring(str); 946 } 947 break; 948 } 949 case 'S': 950 case 'R': 951 { 952 Py_UNICODE *ucopy; 953 Py_ssize_t usize; 954 Py_ssize_t upos; 955 /* unused, since we already have the result */ 956 (void) va_arg(vargs, PyObject *); 957 ucopy = PyUnicode_AS_UNICODE(*callresult);
Dereference of null pointer (loaded from variable 'callresult')
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

Dereference of null pointer (loaded from variable 'callresult')
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

958 usize = PyUnicode_GET_SIZE(*callresult); 959 for (upos = 0; upos<usize;) 960 *s++ = ucopy[upos++]; 961 /* We're done with the unicode()/repr() => forget it */ 962 Py_DECREF(*callresult); 963 /* switch to next unicode()/repr() result */ 964 ++callresult; 965 break; 966 } 967 case 'p': 968 sprintf(buffer, "%p", va_arg(vargs, void*)); 969 /* %p is ill-defined: ensure leading 0x. */ 970 if (buffer[1] == 'X') 971 buffer[1] = 'x'; 972 else if (buffer[1] != 'x') { 973 memmove(buffer+2, buffer, strlen(buffer)+1); 974 buffer[0] = '0'; 975 buffer[1] = 'x'; 976 } 977 appendstring(buffer); 978 break; 979 case '%': 980 *s++ = '%'; 981 break; 982 default: 983 appendstring(p); 984 goto end; 985 } 986 } else 987 *s++ = *f; 988 } 989 990 end: 991 if (callresults) 992 PyObject_Free(callresults); 993 if (abuffer) 994 PyObject_Free(abuffer); 995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 996 return string; 997 fail: 998 if (callresults) { 999 PyObject **callresult2 = callresults; 1000 while (callresult2 < callresult) { 1001 Py_DECREF(*callresult2); 1002 ++callresult2; 1003 } 1004 PyObject_Free(callresults); 1005 } 1006 if (abuffer) 1007 PyObject_Free(abuffer); 1008 return NULL; 1009 } 1010 1011 #undef appendstring 1012 1013 PyObject * 1014 PyUnicode_FromFormat(const char *format, ...) 
1015 { 1016 PyObject* ret; 1017 va_list vargs; 1018 1019 #ifdef HAVE_STDARG_PROTOTYPES 1020 va_start(vargs, format); 1021 #else 1022 va_start(vargs); 1023 #endif 1024 ret = PyUnicode_FromFormatV(format, vargs); 1025 va_end(vargs); 1026 return ret; 1027 } 1028 1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 1030 wchar_t *w, 1031 Py_ssize_t size) 1032 { 1033 if (unicode == NULL) { 1034 PyErr_BadInternalCall(); 1035 return -1; 1036 } 1037 1038 /* If possible, try to copy the 0-termination as well */ 1039 if (size > PyUnicode_GET_SIZE(unicode)) 1040 size = PyUnicode_GET_SIZE(unicode) + 1; 1041 1042 #ifdef HAVE_USABLE_WCHAR_T 1043 memcpy(w, unicode->str, size * sizeof(wchar_t)); 1044 #else 1045 { 1046 register Py_UNICODE *u; 1047 register Py_ssize_t i; 1048 u = PyUnicode_AS_UNICODE(unicode); 1049 for (i = size; i > 0; i--) 1050 *w++ = *u++; 1051 } 1052 #endif 1053 1054 if (size > PyUnicode_GET_SIZE(unicode)) 1055 return PyUnicode_GET_SIZE(unicode); 1056 else 1057 return size; 1058 } 1059 1060 #endif 1061 1062 PyObject *PyUnicode_FromOrdinal(int ordinal) 1063 { 1064 Py_UNICODE s[1]; 1065 1066 #ifdef Py_UNICODE_WIDE 1067 if (ordinal < 0 || ordinal > 0x10ffff) { 1068 PyErr_SetString(PyExc_ValueError, 1069 "unichr() arg not in range(0x110000) " 1070 "(wide Python build)"); 1071 return NULL; 1072 } 1073 #else 1074 if (ordinal < 0 || ordinal > 0xffff) { 1075 PyErr_SetString(PyExc_ValueError, 1076 "unichr() arg not in range(0x10000) " 1077 "(narrow Python build)"); 1078 return NULL; 1079 } 1080 #endif 1081 1082 s[0] = (Py_UNICODE)ordinal; 1083 return PyUnicode_FromUnicode(s, 1); 1084 } 1085 1086 PyObject *PyUnicode_FromObject(register PyObject *obj) 1087 { 1088 /* XXX Perhaps we should make this API an alias of 1089 PyObject_Unicode() instead ?! 
*/ 1090 if (PyUnicode_CheckExact(obj)) { 1091 Py_INCREF(obj); 1092 return obj; 1093 } 1094 if (PyUnicode_Check(obj)) { 1095 /* For a Unicode subtype that's not a Unicode object, 1096 return a true Unicode object with the same data. */ 1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 1098 PyUnicode_GET_SIZE(obj)); 1099 } 1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 1101 } 1102 1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 1104 const char *encoding, 1105 const char *errors) 1106 { 1107 const char *s = NULL; 1108 Py_ssize_t len; 1109 PyObject *v; 1110 1111 if (obj == NULL) { 1112 PyErr_BadInternalCall(); 1113 return NULL; 1114 } 1115 1116 #if 0 1117 /* For b/w compatibility we also accept Unicode objects provided 1118 that no encodings is given and then redirect to 1119 PyObject_Unicode() which then applies the additional logic for 1120 Unicode subclasses. 1121 1122 NOTE: This API should really only be used for object which 1123 represent *encoded* Unicode ! 1124 1125 */ 1126 if (PyUnicode_Check(obj)) { 1127 if (encoding) { 1128 PyErr_SetString(PyExc_TypeError, 1129 "decoding Unicode is not supported"); 1130 return NULL; 1131 } 1132 return PyObject_Unicode(obj); 1133 } 1134 #else 1135 if (PyUnicode_Check(obj)) { 1136 PyErr_SetString(PyExc_TypeError, 1137 "decoding Unicode is not supported"); 1138 return NULL; 1139 } 1140 #endif 1141 1142 /* Coerce object */ 1143 if (PyString_Check(obj)) { 1144 s = PyString_AS_STRING(obj); 1145 len = PyString_GET_SIZE(obj); 1146 } 1147 else if (PyByteArray_Check(obj)) { 1148 /* Python 2.x specific */ 1149 PyErr_Format(PyExc_TypeError, 1150 "decoding bytearray is not supported"); 1151 return NULL; 1152 } 1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1154 /* Overwrite the error message with something more useful in 1155 case of a TypeError. 
*/ 1156 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1157 PyErr_Format(PyExc_TypeError, 1158 "coercing to Unicode: need string or buffer, " 1159 "%.80s found", 1160 Py_TYPE(obj)->tp_name); 1161 goto onError; 1162 } 1163 1164 /* Convert to Unicode */ 1165 if (len == 0) { 1166 Py_INCREF(unicode_empty); 1167 v = (PyObject *)unicode_empty; 1168 } 1169 else 1170 v = PyUnicode_Decode(s, len, encoding, errors); 1171 1172 return v; 1173 1174 onError: 1175 return NULL; 1176 } 1177 1178 PyObject *PyUnicode_Decode(const char *s, 1179 Py_ssize_t size, 1180 const char *encoding, 1181 const char *errors) 1182 { 1183 PyObject *buffer = NULL, *unicode; 1184 1185 if (encoding == NULL) 1186 encoding = PyUnicode_GetDefaultEncoding(); 1187 1188 /* Shortcuts for common default encodings */ 1189 if (strcmp(encoding, "utf-8") == 0) 1190 return PyUnicode_DecodeUTF8(s, size, errors); 1191 else if (strcmp(encoding, "latin-1") == 0) 1192 return PyUnicode_DecodeLatin1(s, size, errors); 1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1194 else if (strcmp(encoding, "mbcs") == 0) 1195 return PyUnicode_DecodeMBCS(s, size, errors); 1196 #endif 1197 else if (strcmp(encoding, "ascii") == 0) 1198 return PyUnicode_DecodeASCII(s, size, errors); 1199 1200 /* Decode via the codec registry */ 1201 buffer = PyBuffer_FromMemory((void *)s, size); 1202 if (buffer == NULL) 1203 goto onError; 1204 unicode = PyCodec_Decode(buffer, encoding, errors); 1205 if (unicode == NULL) 1206 goto onError; 1207 if (!PyUnicode_Check(unicode)) { 1208 PyErr_Format(PyExc_TypeError, 1209 "decoder did not return an unicode object (type=%.400s)", 1210 Py_TYPE(unicode)->tp_name); 1211 Py_DECREF(unicode); 1212 goto onError; 1213 } 1214 Py_DECREF(buffer); 1215 return unicode; 1216 1217 onError: 1218 Py_XDECREF(buffer); 1219 return NULL; 1220 } 1221 1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1223 const char *encoding, 1224 const char *errors) 1225 { 1226 PyObject *v; 1227 1228 if 
(!PyUnicode_Check(unicode)) { 1229 PyErr_BadArgument(); 1230 goto onError; 1231 } 1232 1233 if (encoding == NULL) 1234 encoding = PyUnicode_GetDefaultEncoding(); 1235 1236 /* Decode via the codec registry */ 1237 v = PyCodec_Decode(unicode, encoding, errors); 1238 if (v == NULL) 1239 goto onError; 1240 return v; 1241 1242 onError: 1243 return NULL; 1244 } 1245 1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1247 Py_ssize_t size, 1248 const char *encoding, 1249 const char *errors) 1250 { 1251 PyObject *v, *unicode; 1252 1253 unicode = PyUnicode_FromUnicode(s, size); 1254 if (unicode == NULL) 1255 return NULL; 1256 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1257 Py_DECREF(unicode); 1258 return v; 1259 } 1260 1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1262 const char *encoding, 1263 const char *errors) 1264 { 1265 PyObject *v; 1266 1267 if (!PyUnicode_Check(unicode)) { 1268 PyErr_BadArgument(); 1269 goto onError; 1270 } 1271 1272 if (encoding == NULL) 1273 encoding = PyUnicode_GetDefaultEncoding(); 1274 1275 /* Encode via the codec registry */ 1276 v = PyCodec_Encode(unicode, encoding, errors); 1277 if (v == NULL) 1278 goto onError; 1279 return v; 1280 1281 onError: 1282 return NULL; 1283 } 1284 1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1286 const char *encoding, 1287 const char *errors) 1288 { 1289 PyObject *v; 1290 1291 if (!PyUnicode_Check(unicode)) { 1292 PyErr_BadArgument(); 1293 goto onError; 1294 } 1295 1296 if (encoding == NULL) 1297 encoding = PyUnicode_GetDefaultEncoding(); 1298 1299 /* Shortcuts for common default encodings */ 1300 if (errors == NULL) { 1301 if (strcmp(encoding, "utf-8") == 0) 1302 return PyUnicode_AsUTF8String(unicode); 1303 else if (strcmp(encoding, "latin-1") == 0) 1304 return PyUnicode_AsLatin1String(unicode); 1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1306 else if (strcmp(encoding, "mbcs") == 0) 1307 return PyUnicode_AsMBCSString(unicode); 1308 #endif 1309 else if 
(strcmp(encoding, "ascii") == 0) 1310 return PyUnicode_AsASCIIString(unicode); 1311 } 1312 1313 /* Encode via the codec registry */ 1314 v = PyCodec_Encode(unicode, encoding, errors); 1315 if (v == NULL) 1316 goto onError; 1317 if (!PyString_Check(v)) { 1318 PyErr_Format(PyExc_TypeError, 1319 "encoder did not return a string object (type=%.400s)", 1320 Py_TYPE(v)->tp_name); 1321 Py_DECREF(v); 1322 goto onError; 1323 } 1324 return v; 1325 1326 onError: 1327 return NULL; 1328 } 1329 1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1331 const char *errors) 1332 { 1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1334 1335 if (v) 1336 return v; 1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 1338 if (v && errors == NULL) 1339 ((PyUnicodeObject *)unicode)->defenc = v; 1340 return v; 1341 } 1342 1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1344 { 1345 if (!PyUnicode_Check(unicode)) { 1346 PyErr_BadArgument(); 1347 goto onError; 1348 } 1349 return PyUnicode_AS_UNICODE(unicode); 1350 1351 onError: 1352 return NULL; 1353 } 1354 1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode) 1356 { 1357 if (!PyUnicode_Check(unicode)) { 1358 PyErr_BadArgument(); 1359 goto onError; 1360 } 1361 return PyUnicode_GET_SIZE(unicode); 1362 1363 onError: 1364 return -1; 1365 } 1366 1367 const char *PyUnicode_GetDefaultEncoding(void) 1368 { 1369 return unicode_default_encoding; 1370 } 1371 1372 int PyUnicode_SetDefaultEncoding(const char *encoding) 1373 { 1374 PyObject *v; 1375 1376 /* Make sure the encoding is valid. As side effect, this also 1377 loads the encoding into the codec registry cache. 
*/ 1378 v = _PyCodec_Lookup(encoding); 1379 if (v == NULL) 1380 goto onError; 1381 Py_DECREF(v); 1382 strncpy(unicode_default_encoding, 1383 encoding, 1384 sizeof(unicode_default_encoding)); 1385 return 0; 1386 1387 onError: 1388 return -1; 1389 } 1390 1391 /* error handling callback helper: 1392 build arguments, call the callback and check the arguments, 1393 if no exception occurred, copy the replacement to the output 1394 and adjust various state variables. 1395 return 0 on success, -1 on error 1396 */ 1397 1398 static 1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1400 const char *encoding, const char *reason, 1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, 1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1404 { 1405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1406 1407 PyObject *restuple = NULL; 1408 PyObject *repunicode = NULL; 1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1410 Py_ssize_t requiredsize; 1411 Py_ssize_t newpos; 1412 Py_UNICODE *repptr; 1413 Py_ssize_t repsize; 1414 int res = -1; 1415 1416 if (*errorHandler == NULL) { 1417 *errorHandler = PyCodec_LookupError(errors); 1418 if (*errorHandler == NULL) 1419 goto onError; 1420 } 1421 1422 if (*exceptionObject == NULL) { 1423 *exceptionObject = PyUnicodeDecodeError_Create( 1424 encoding, input, insize, *startinpos, *endinpos, reason); 1425 if (*exceptionObject == NULL) 1426 goto onError; 1427 } 1428 else { 1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1430 goto onError; 1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1432 goto onError; 1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1434 goto onError; 1435 } 1436 1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1438 if (restuple == NULL) 1439 goto 
onError; 1440 if (!PyTuple_Check(restuple)) { 1441 PyErr_SetString(PyExc_TypeError, &argparse[4]); 1442 goto onError; 1443 } 1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1445 goto onError; 1446 if (newpos<0) 1447 newpos = insize+newpos; 1448 if (newpos<0 || newpos>insize) { 1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1450 goto onError; 1451 } 1452 1453 /* need more space? (at least enough for what we 1454 have+the replacement+the rest of the string (starting 1455 at the new input position), so we won't have to check space 1456 when there are no errors in the rest of the string) */ 1457 repptr = PyUnicode_AS_UNICODE(repunicode); 1458 repsize = PyUnicode_GET_SIZE(repunicode); 1459 requiredsize = *outpos + repsize + insize-newpos; 1460 if (requiredsize > outsize) { 1461 if (requiredsize<2*outsize) 1462 requiredsize = 2*outsize; 1463 if (_PyUnicode_Resize(output, requiredsize) < 0) 1464 goto onError; 1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1466 } 1467 *endinpos = newpos; 1468 *inptr = input + newpos; 1469 Py_UNICODE_COPY(*outptr, repptr, repsize); 1470 *outptr += repsize; 1471 *outpos += repsize; 1472 /* we made it! */ 1473 res = 0; 1474 1475 onError: 1476 Py_XDECREF(restuple); 1477 return res; 1478 } 1479 1480 /* --- UTF-7 Codec -------------------------------------------------------- */ 1481 1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 1483 1484 /* Three simple macros defining base-64. */ 1485 1486 /* Is c a base-64 character? */ 1487 1488 #define IS_BASE64(c) \ 1489 (isalnum(c) || (c) == '+' || (c) == '/') 1490 1491 /* given that c is a base-64 character, what is its base-64 value? */ 1492 1493 #define FROM_BASE64(c) \ 1494 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1495 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1496 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1497 (c) == '+' ? 
62 : 63) 1498 1499 /* What is the base-64 character of the bottom 6 bits of n? */ 1500 1501 #define TO_BASE64(n) \ 1502 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1503 1504 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 1505 * decoded as itself. We are permissive on decoding; the only ASCII 1506 * byte not decoding to itself is the + which begins a base64 1507 * string. */ 1508 1509 #define DECODE_DIRECT(c) \ 1510 ((c) <= 127 && (c) != '+') 1511 1512 /* The UTF-7 encoder treats ASCII characters differently according to 1513 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 1514 * the above). See RFC2152. This array identifies these different 1515 * sets: 1516 * 0 : "Set D" 1517 * alphanumeric and '(),-./:? 1518 * 1 : "Set O" 1519 * !"#$%&*;<=>@[]^_`{|} 1520 * 2 : "whitespace" 1521 * ht nl cr sp 1522 * 3 : special (must be base64 encoded) 1523 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 1524 */ 1525 1526 static 1527 char utf7_category[128] = { 1528 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 1529 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 1530 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 1531 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1532 /* sp ! " # $ % & ' ( ) * + , - . / */ 1533 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 1534 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 1535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1536 /* @ A B C D E F G H I J K L M N O */ 1537 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1538 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1540 /* ` a b c d e f g h i j k l m n o */ 1541 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1542 /* p q r s t u v w x y z { | } ~ del */ 1543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 1544 }; 1545 1546 /* ENCODE_DIRECT: this character should be encoded as itself. 
The 1547 * answer depends on whether we are encoding set O as itself, and also 1548 * on whether we are encoding whitespace as itself. RFC2152 makes it 1549 * clear that the answers to these questions vary between 1550 * applications, so this code needs to be flexible. */ 1551 1552 #define ENCODE_DIRECT(c, directO, directWS) \ 1553 ((c) < 128 && (c) > 0 && \ 1554 ((utf7_category[(c)] == 0) || \ 1555 (directWS && (utf7_category[(c)] == 2)) || \ 1556 (directO && (utf7_category[(c)] == 1)))) 1557 1558 PyObject *PyUnicode_DecodeUTF7(const char *s, 1559 Py_ssize_t size, 1560 const char *errors) 1561 { 1562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1563 } 1564 1565 /* The decoder. The only state we preserve is our read position, 1566 * i.e. how many characters we have consumed. So if we end in the 1567 * middle of a shift sequence we have to back off the read position 1568 * and the output to the beginning of the sequence, otherwise we lose 1569 * all the shift state (seen bits, number of bits seen, high 1570 * surrogate). 
*/ 1571 1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, 1573 Py_ssize_t size, 1574 const char *errors, 1575 Py_ssize_t *consumed) 1576 { 1577 const char *starts = s; 1578 Py_ssize_t startinpos; 1579 Py_ssize_t endinpos; 1580 Py_ssize_t outpos; 1581 const char *e; 1582 PyUnicodeObject *unicode; 1583 Py_UNICODE *p; 1584 const char *errmsg = ""; 1585 int inShift = 0; 1586 Py_UNICODE *shiftOutStart; 1587 unsigned int base64bits = 0; 1588 unsigned long base64buffer = 0; 1589 Py_UNICODE surrogate = 0; 1590 PyObject *errorHandler = NULL; 1591 PyObject *exc = NULL; 1592 1593 unicode = _PyUnicode_New(size); 1594 if (!unicode) 1595 return NULL; 1596 if (size == 0) { 1597 if (consumed) 1598 *consumed = 0; 1599 return (PyObject *)unicode; 1600 } 1601 1602 p = unicode->str; 1603 shiftOutStart = p; 1604 e = s + size; 1605 1606 while (s < e) { 1607 Py_UNICODE ch = (unsigned char) *s; 1608 1609 if (inShift) { /* in a base-64 section */ 1610 if (IS_BASE64(ch)) { /* consume a base-64 character */ 1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 1612 base64bits += 6; 1613 s++; 1614 if (base64bits >= 16) { 1615 /* we have enough bits for a UTF-16 value */ 1616 Py_UNICODE outCh = (Py_UNICODE) 1617 (base64buffer >> (base64bits-16)); 1618 base64bits -= 16; 1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 1620 if (surrogate) { 1621 /* expecting a second surrogate */ 1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 1623 #ifdef Py_UNICODE_WIDE 1624 *p++ = (((surrogate & 0x3FF)<<10) 1625 | (outCh & 0x3FF)) + 0x10000; 1626 #else 1627 *p++ = surrogate; 1628 *p++ = outCh; 1629 #endif 1630 surrogate = 0; 1631 continue; 1632 } 1633 else { 1634 *p++ = surrogate; 1635 surrogate = 0; 1636 } 1637 } 1638 if (outCh >= 0xD800 && outCh <= 0xDBFF) { 1639 /* first surrogate */ 1640 surrogate = outCh; 1641 } 1642 else { 1643 *p++ = outCh; 1644 } 1645 } 1646 } 1647 else { /* now leaving a base-64 section */ 1648 inShift = 0; 1649 s++; 1650 if (surrogate) { 1651 *p++ = 
surrogate; 1652 surrogate = 0; 1653 } 1654 if (base64bits > 0) { /* left-over bits */ 1655 if (base64bits >= 6) { 1656 /* We've seen at least one base-64 character */ 1657 errmsg = "partial character in shift sequence"; 1658 goto utf7Error; 1659 } 1660 else { 1661 /* Some bits remain; they should be zero */ 1662 if (base64buffer != 0) { 1663 errmsg = "non-zero padding bits in shift sequence"; 1664 goto utf7Error; 1665 } 1666 } 1667 } 1668 if (ch != '-') { 1669 /* '-' is absorbed; other terminating 1670 characters are preserved */ 1671 *p++ = ch; 1672 } 1673 } 1674 } 1675 else if ( ch == '+' ) { 1676 startinpos = s-starts; 1677 s++; /* consume '+' */ 1678 if (s < e && *s == '-') { /* '+-' encodes '+' */ 1679 s++; 1680 *p++ = '+'; 1681 } 1682 else { /* begin base64-encoded section */ 1683 inShift = 1; 1684 shiftOutStart = p; 1685 base64bits = 0; 1686 } 1687 } 1688 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 1689 *p++ = ch; 1690 s++; 1691 } 1692 else { 1693 startinpos = s-starts; 1694 s++; 1695 errmsg = "unexpected special character"; 1696 goto utf7Error; 1697 } 1698 continue; 1699 utf7Error: 1700 outpos = p-PyUnicode_AS_UNICODE(unicode); 1701 endinpos = s-starts; 1702 if (unicode_decode_call_errorhandler( 1703 errors, &errorHandler, 1704 "utf7", errmsg, 1705 starts, size, &startinpos, &endinpos, &exc, &s, 1706 &unicode, &outpos, &p)) 1707 goto onError; 1708 } 1709 1710 /* end of string */ 1711 1712 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 1713 /* if we're in an inconsistent state, that's an error */ 1714 if (surrogate || 1715 (base64bits >= 6) || 1716 (base64bits > 0 && base64buffer != 0)) { 1717 outpos = p-PyUnicode_AS_UNICODE(unicode); 1718 endinpos = size; 1719 if (unicode_decode_call_errorhandler( 1720 errors, &errorHandler, 1721 "utf7", "unterminated shift sequence", 1722 starts, size, &startinpos, &endinpos, &exc, &s, 1723 &unicode, &outpos, &p)) 1724 goto onError; 1725 } 1726 } 1727 1728 /* return state */ 1729 
if (consumed) { 1730 if (inShift) { 1731 p = shiftOutStart; /* back off output */ 1732 *consumed = startinpos; 1733 } 1734 else { 1735 *consumed = s-starts; 1736 } 1737 } 1738 1739 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1740 goto onError; 1741 1742 Py_XDECREF(errorHandler); 1743 Py_XDECREF(exc); 1744 return (PyObject *)unicode; 1745 1746 onError: 1747 Py_XDECREF(errorHandler); 1748 Py_XDECREF(exc); 1749 Py_DECREF(unicode); 1750 return NULL; 1751 } 1752 1753 1754 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1755 Py_ssize_t size, 1756 int base64SetO, 1757 int base64WhiteSpace, 1758 const char *errors) 1759 { 1760 PyObject *v; 1761 /* It might be possible to tighten this worst case */ 1762 Py_ssize_t allocated = 8 * size; 1763 int inShift = 0; 1764 Py_ssize_t i = 0; 1765 unsigned int base64bits = 0; 1766 unsigned long base64buffer = 0; 1767 char * out; 1768 char * start; 1769 1770 if (allocated / 8 != size) 1771 return PyErr_NoMemory(); 1772 1773 if (size == 0) 1774 return PyString_FromStringAndSize(NULL, 0); 1775 1776 v = PyString_FromStringAndSize(NULL, allocated); 1777 if (v == NULL) 1778 return NULL; 1779 1780 start = out = PyString_AS_STRING(v); 1781 for (;i < size; ++i) { 1782 Py_UNICODE ch = s[i]; 1783 1784 if (inShift) { 1785 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1786 /* shifting out */ 1787 if (base64bits) { /* output remaining bits */ 1788 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 1789 base64buffer = 0; 1790 base64bits = 0; 1791 } 1792 inShift = 0; 1793 /* Characters not in the BASE64 set implicitly unshift the sequence 1794 so no '-' is required, except if the character is itself a '-' */ 1795 if (IS_BASE64(ch) || ch == '-') { 1796 *out++ = '-'; 1797 } 1798 *out++ = (char) ch; 1799 } 1800 else { 1801 goto encode_char; 1802 } 1803 } 1804 else { /* not in a shift sequence */ 1805 if (ch == '+') { 1806 *out++ = '+'; 1807 *out++ = '-'; 1808 } 1809 else if (ENCODE_DIRECT(ch, !base64SetO, 
!base64WhiteSpace)) { 1810 *out++ = (char) ch; 1811 } 1812 else { 1813 *out++ = '+'; 1814 inShift = 1; 1815 goto encode_char; 1816 } 1817 } 1818 continue; 1819 encode_char: 1820 #ifdef Py_UNICODE_WIDE 1821 if (ch >= 0x10000) { 1822 /* code first surrogate */ 1823 base64bits += 16; 1824 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 1825 while (base64bits >= 6) { 1826 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1827 base64bits -= 6; 1828 } 1829 /* prepare second surrogate */ 1830 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 1831 } 1832 #endif 1833 base64bits += 16; 1834 base64buffer = (base64buffer << 16) | ch; 1835 while (base64bits >= 6) { 1836 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1837 base64bits -= 6; 1838 } 1839 } 1840 if (base64bits) 1841 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 1842 if (inShift) 1843 *out++ = '-'; 1844 1845 if (_PyString_Resize(&v, out - start)) 1846 return NULL; 1847 return v; 1848 } 1849 1850 #undef IS_BASE64 1851 #undef FROM_BASE64 1852 #undef TO_BASE64 1853 #undef DECODE_DIRECT 1854 #undef ENCODE_DIRECT 1855 1856 /* --- UTF-8 Codec -------------------------------------------------------- */ 1857 1858 static 1859 char utf8_code_length[256] = { 1860 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 1861 illegal prefix. 
See RFC 3629 for details */ 1862 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 1863 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1864 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1865 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1866 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1867 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1868 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1869 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 1870 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 1871 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1872 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1873 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 1874 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 1875 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 1876 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 1877 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 1878 }; 1879 1880 PyObject *PyUnicode_DecodeUTF8(const char *s, 1881 Py_ssize_t size, 1882 const char *errors) 1883 { 1884 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1885 } 1886 1887 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, 1888 Py_ssize_t size, 1889 const char *errors, 1890 Py_ssize_t *consumed) 1891 { 1892 const char *starts = s; 1893 int n; 1894 int k; 1895 Py_ssize_t startinpos; 1896 Py_ssize_t endinpos; 1897 Py_ssize_t outpos; 1898 const char *e; 1899 PyUnicodeObject *unicode; 1900 Py_UNICODE *p; 1901 const char *errmsg = ""; 1902 PyObject *errorHandler = NULL; 1903 PyObject *exc = NULL; 1904 1905 /* Note: size will always be longer than the resulting Unicode 1906 character count */ 1907 unicode = _PyUnicode_New(size); 1908 if (!unicode) 1909 return NULL; 1910 if (size == 0) { 1911 if (consumed) 1912 *consumed = 0; 1913 return (PyObject *)unicode; 1914 } 1915 1916 /* Unpack UTF-8 encoded data */ 1917 p = unicode->str; 1918 e = s + size; 1919 1920 while (s < e) { 
1921 Py_UCS4 ch = (unsigned char)*s; 1922 1923 if (ch < 0x80) { 1924 *p++ = (Py_UNICODE)ch; 1925 s++; 1926 continue; 1927 } 1928 1929 n = utf8_code_length[ch]; 1930 1931 if (s + n > e) { 1932 if (consumed) 1933 break; 1934 else { 1935 errmsg = "unexpected end of data"; 1936 startinpos = s-starts; 1937 endinpos = startinpos+1; 1938 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 1939 endinpos++; 1940 goto utf8Error; 1941 } 1942 } 1943 1944 switch (n) { 1945 1946 case 0: 1947 errmsg = "invalid start byte"; 1948 startinpos = s-starts; 1949 endinpos = startinpos+1; 1950 goto utf8Error; 1951 1952 case 1: 1953 errmsg = "internal error"; 1954 startinpos = s-starts; 1955 endinpos = startinpos+1; 1956 goto utf8Error; 1957 1958 case 2: 1959 if ((s[1] & 0xc0) != 0x80) { 1960 errmsg = "invalid continuation byte"; 1961 startinpos = s-starts; 1962 endinpos = startinpos + 1; 1963 goto utf8Error; 1964 } 1965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1966 assert ((ch > 0x007F) && (ch <= 0x07FF)); 1967 *p++ = (Py_UNICODE)ch; 1968 break; 1969 1970 case 3: 1971 /* XXX: surrogates shouldn't be valid UTF-8! 1972 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 1973 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 1974 Uncomment the 2 lines below to make them invalid, 1975 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ 1976 if ((s[1] & 0xc0) != 0x80 || 1977 (s[2] & 0xc0) != 0x80 || 1978 ((unsigned char)s[0] == 0xE0 && 1979 (unsigned char)s[1] < 0xA0)/* || 1980 ((unsigned char)s[0] == 0xED && 1981 (unsigned char)s[1] > 0x9F)*/) { 1982 errmsg = "invalid continuation byte"; 1983 startinpos = s-starts; 1984 endinpos = startinpos + 1; 1985 1986 /* if s[1] first two bits are 1 and 0, then the invalid 1987 continuation byte is s[2], so increment endinpos by 1, 1988 if not, s[1] is invalid and endinpos doesn't need to 1989 be incremented. 
*/ 1990 if ((s[1] & 0xC0) == 0x80) 1991 endinpos++; 1992 goto utf8Error; 1993 } 1994 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1995 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 1996 *p++ = (Py_UNICODE)ch; 1997 break; 1998 1999 case 4: 2000 if ((s[1] & 0xc0) != 0x80 || 2001 (s[2] & 0xc0) != 0x80 || 2002 (s[3] & 0xc0) != 0x80 || 2003 ((unsigned char)s[0] == 0xF0 && 2004 (unsigned char)s[1] < 0x90) || 2005 ((unsigned char)s[0] == 0xF4 && 2006 (unsigned char)s[1] > 0x8F)) { 2007 errmsg = "invalid continuation byte"; 2008 startinpos = s-starts; 2009 endinpos = startinpos + 1; 2010 if ((s[1] & 0xC0) == 0x80) { 2011 endinpos++; 2012 if ((s[2] & 0xC0) == 0x80) 2013 endinpos++; 2014 } 2015 goto utf8Error; 2016 } 2017 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 2018 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2019 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2020 2021 #ifdef Py_UNICODE_WIDE 2022 *p++ = (Py_UNICODE)ch; 2023 #else 2024 /* compute and append the two surrogates: */ 2025 2026 /* translate from 10000..10FFFF to 0..FFFF */ 2027 ch -= 0x10000; 2028 2029 /* high surrogate = top 10 bits added to D800 */ 2030 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 2031 2032 /* low surrogate = bottom 10 bits added to DC00 */ 2033 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 2034 #endif 2035 break; 2036 } 2037 s += n; 2038 continue; 2039 2040 utf8Error: 2041 outpos = p-PyUnicode_AS_UNICODE(unicode); 2042 if (unicode_decode_call_errorhandler( 2043 errors, &errorHandler, 2044 "utf8", errmsg, 2045 starts, size, &startinpos, &endinpos, &exc, &s, 2046 &unicode, &outpos, &p)) 2047 goto onError; 2048 } 2049 if (consumed) 2050 *consumed = s-starts; 2051 2052 /* Adjust length */ 2053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2054 goto onError; 2055 2056 Py_XDECREF(errorHandler); 2057 Py_XDECREF(exc); 2058 return (PyObject *)unicode; 2059 2060 onError: 2061 Py_XDECREF(errorHandler); 2062 Py_XDECREF(exc); 2063 Py_DECREF(unicode); 2064 return NULL; 2065 } 2066 
2067 /* Allocation strategy: if the string is short, convert into a stack buffer 2068 and allocate exactly as much space needed at the end. Else allocate the 2069 maximum possible needed (4 result bytes per Unicode character), and return 2070 the excess memory at the end. 2071 */ 2072 PyObject * 2073 PyUnicode_EncodeUTF8(const Py_UNICODE *s, 2074 Py_ssize_t size, 2075 const char *errors) 2076 { 2077 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 2078 2079 Py_ssize_t i; /* index into s of next input byte */ 2080 PyObject *v; /* result string object */ 2081 char *p; /* next free byte in output buffer */ 2082 Py_ssize_t nallocated; /* number of result bytes allocated */ 2083 Py_ssize_t nneeded; /* number of result bytes needed */ 2084 char stackbuf[MAX_SHORT_UNICHARS * 4]; 2085 2086 assert(s != NULL); 2087 assert(size >= 0); 2088 2089 if (size <= MAX_SHORT_UNICHARS) { 2090 /* Write into the stack buffer; nallocated can't overflow. 2091 * At the end, we'll allocate exactly as much heap space as it 2092 * turns out we need. 2093 */ 2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Value stored to 'nallocated' is never read
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

2095 v = NULL; /* will allocate after we're done */ 2096 p = stackbuf; 2097 } 2098 else { 2099 /* Overallocate on the heap, and give the excess back at the end. */ 2100 nallocated = size * 4; 2101 if (nallocated / 4 != size) /* overflow! */ 2102 return PyErr_NoMemory(); 2103 v = PyString_FromStringAndSize(NULL, nallocated); 2104 if (v == NULL) 2105 return NULL; 2106 p = PyString_AS_STRING(v); 2107 } 2108 2109 for (i = 0; i < size;) { 2110 Py_UCS4 ch = s[i++]; 2111 2112 if (ch < 0x80) 2113 /* Encode ASCII */ 2114 *p++ = (char) ch; 2115 2116 else if (ch < 0x0800) { 2117 /* Encode Latin-1 */ 2118 *p++ = (char)(0xc0 | (ch >> 6)); 2119 *p++ = (char)(0x80 | (ch & 0x3f)); 2120 } 2121 else { 2122 /* Encode UCS2 Unicode ordinals */ 2123 if (ch < 0x10000) { 2124 /* Special case: check for high surrogate */ 2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 2126 Py_UCS4 ch2 = s[i]; 2127 /* Check for low surrogate and combine the two to 2128 form a UCS4 value */ 2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 2131 i++; 2132 goto encodeUCS4; 2133 } 2134 /* Fall through: handles isolated high surrogates */ 2135 } 2136 *p++ = (char)(0xe0 | (ch >> 12)); 2137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2138 *p++ = (char)(0x80 | (ch & 0x3f)); 2139 continue; 2140 } 2141 encodeUCS4: 2142 /* Encode UCS4 Unicode ordinals */ 2143 *p++ = (char)(0xf0 | (ch >> 18)); 2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 2146 *p++ = (char)(0x80 | (ch & 0x3f)); 2147 } 2148 } 2149 2150 if (v == NULL) { 2151 /* This was stack allocated. */ 2152 nneeded = p - stackbuf; 2153 assert(nneeded <= nallocated); 2154 v = PyString_FromStringAndSize(stackbuf, nneeded); 2155 } 2156 else { 2157 /* Cut back to size actually needed. 
*/ 2158 nneeded = p - PyString_AS_STRING(v); 2159 assert(nneeded <= nallocated); 2160 if (_PyString_Resize(&v, nneeded)) 2161 return NULL; 2162 } 2163 return v; 2164 2165 #undef MAX_SHORT_UNICHARS 2166 } 2167 2168 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 2169 { 2170 if (!PyUnicode_Check(unicode)) { 2171 PyErr_BadArgument(); 2172 return NULL; 2173 } 2174 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 2175 PyUnicode_GET_SIZE(unicode), 2176 NULL); 2177 } 2178 2179 /* --- UTF-32 Codec ------------------------------------------------------- */ 2180 2181 PyObject * 2182 PyUnicode_DecodeUTF32(const char *s, 2183 Py_ssize_t size, 2184 const char *errors, 2185 int *byteorder) 2186 { 2187 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 2188 } 2189 2190 PyObject * 2191 PyUnicode_DecodeUTF32Stateful(const char *s, 2192 Py_ssize_t size, 2193 const char *errors, 2194 int *byteorder, 2195 Py_ssize_t *consumed) 2196 { 2197 const char *starts = s; 2198 Py_ssize_t startinpos; 2199 Py_ssize_t endinpos; 2200 Py_ssize_t outpos; 2201 PyUnicodeObject *unicode; 2202 Py_UNICODE *p; 2203 #ifndef Py_UNICODE_WIDE 2204 int pairs = 0; 2205 const unsigned char *qq; 2206 #else 2207 const int pairs = 0; 2208 #endif 2209 const unsigned char *q, *e; 2210 int bo = 0; /* assume native ordering by default */ 2211 const char *errmsg = ""; 2212 /* Offsets from q for retrieving bytes in the right order. */ 2213 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2214 int iorder[] = {0, 1, 2, 3}; 2215 #else 2216 int iorder[] = {3, 2, 1, 0}; 2217 #endif 2218 PyObject *errorHandler = NULL; 2219 PyObject *exc = NULL; 2220 2221 q = (unsigned char *)s; 2222 e = q + size; 2223 2224 if (byteorder) 2225 bo = *byteorder; 2226 2227 /* Check for BOM marks (U+FEFF) in the input and adjust current 2228 byte order setting accordingly. In native mode, the leading BOM 2229 mark is skipped, in all other modes, it is copied to the output 2230 stream as-is (giving a ZWNBSP character). 
*/ 2231 if (bo == 0) { 2232 if (size >= 4) { 2233 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2234 (q[iorder[1]] << 8) | q[iorder[0]]; 2235 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2236 if (bom == 0x0000FEFF) { 2237 q += 4; 2238 bo = -1; 2239 } 2240 else if (bom == 0xFFFE0000) { 2241 q += 4; 2242 bo = 1; 2243 } 2244 #else 2245 if (bom == 0x0000FEFF) { 2246 q += 4; 2247 bo = 1; 2248 } 2249 else if (bom == 0xFFFE0000) { 2250 q += 4; 2251 bo = -1; 2252 } 2253 #endif 2254 } 2255 } 2256 2257 if (bo == -1) { 2258 /* force LE */ 2259 iorder[0] = 0; 2260 iorder[1] = 1; 2261 iorder[2] = 2; 2262 iorder[3] = 3; 2263 } 2264 else if (bo == 1) { 2265 /* force BE */ 2266 iorder[0] = 3; 2267 iorder[1] = 2; 2268 iorder[2] = 1; 2269 iorder[3] = 0; 2270 } 2271 2272 /* On narrow builds we split characters outside the BMP into two 2273 codepoints => count how much extra space we need. */ 2274 #ifndef Py_UNICODE_WIDE 2275 for (qq = q; qq < e; qq += 4) 2276 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2277 pairs++; 2278 #endif 2279 2280 /* This might be one to much, because of a BOM */ 2281 unicode = _PyUnicode_New((size+3)/4+pairs); 2282 if (!unicode) 2283 return NULL; 2284 if (size == 0) 2285 return (PyObject *)unicode; 2286 2287 /* Unpack UTF-32 encoded data */ 2288 p = unicode->str; 2289 2290 while (q < e) { 2291 Py_UCS4 ch; 2292 /* remaining bytes at the end? 
(size should be divisible by 4) */ 2293 if (e-q<4) { 2294 if (consumed) 2295 break; 2296 errmsg = "truncated data"; 2297 startinpos = ((const char *)q)-starts; 2298 endinpos = ((const char *)e)-starts; 2299 goto utf32Error; 2300 /* The remaining input chars are ignored if the callback 2301 chooses to skip the input */ 2302 } 2303 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 2304 (q[iorder[1]] << 8) | q[iorder[0]]; 2305 2306 if (ch >= 0x110000) 2307 { 2308 errmsg = "codepoint not in range(0x110000)"; 2309 startinpos = ((const char *)q)-starts; 2310 endinpos = startinpos+4; 2311 goto utf32Error; 2312 } 2313 #ifndef Py_UNICODE_WIDE 2314 if (ch >= 0x10000) 2315 { 2316 *p++ = 0xD800 | ((ch-0x10000) >> 10); 2317 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 2318 } 2319 else 2320 #endif 2321 *p++ = ch; 2322 q += 4; 2323 continue; 2324 utf32Error: 2325 outpos = p-PyUnicode_AS_UNICODE(unicode); 2326 if (unicode_decode_call_errorhandler( 2327 errors, &errorHandler, 2328 "utf32", errmsg, 2329 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2330 &unicode, &outpos, &p)) 2331 goto onError; 2332 } 2333 2334 if (byteorder) 2335 *byteorder = bo; 2336 2337 if (consumed) 2338 *consumed = (const char *)q-starts; 2339 2340 /* Adjust length */ 2341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2342 goto onError; 2343 2344 Py_XDECREF(errorHandler); 2345 Py_XDECREF(exc); 2346 return (PyObject *)unicode; 2347 2348 onError: 2349 Py_DECREF(unicode); 2350 Py_XDECREF(errorHandler); 2351 Py_XDECREF(exc); 2352 return NULL; 2353 } 2354 2355 PyObject * 2356 PyUnicode_EncodeUTF32(const Py_UNICODE *s, 2357 Py_ssize_t size, 2358 const char *errors, 2359 int byteorder) 2360 { 2361 PyObject *v; 2362 unsigned char *p; 2363 Py_ssize_t nsize, bytesize; 2364 #ifndef Py_UNICODE_WIDE 2365 Py_ssize_t i, pairs; 2366 #else 2367 const int pairs = 0; 2368 #endif 2369 /* Offsets from p for storing byte pairs in the right order. 
*/ 2370 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2371 int iorder[] = {0, 1, 2, 3}; 2372 #else 2373 int iorder[] = {3, 2, 1, 0}; 2374 #endif 2375 2376 #define STORECHAR(CH) \ 2377 do { \ 2378 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 2379 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 2380 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 2381 p[iorder[0]] = (CH) & 0xff; \ 2382 p += 4; \ 2383 } while(0) 2384 2385 /* In narrow builds we can output surrogate pairs as one codepoint, 2386 so we need less space. */ 2387 #ifndef Py_UNICODE_WIDE 2388 for (i = pairs = 0; i < size-1; i++) 2389 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 2390 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 2391 pairs++; 2392 #endif 2393 nsize = (size - pairs + (byteorder == 0)); 2394 bytesize = nsize * 4; 2395 if (bytesize / 4 != nsize) 2396 return PyErr_NoMemory(); 2397 v = PyString_FromStringAndSize(NULL, bytesize); 2398 if (v == NULL) 2399 return NULL; 2400 2401 p = (unsigned char *)PyString_AS_STRING(v); 2402 if (byteorder == 0) 2403 STORECHAR(0xFEFF); 2404 if (size == 0) 2405 return v; 2406 2407 if (byteorder == -1) { 2408 /* force LE */ 2409 iorder[0] = 0; 2410 iorder[1] = 1; 2411 iorder[2] = 2; 2412 iorder[3] = 3; 2413 } 2414 else if (byteorder == 1) { 2415 /* force BE */ 2416 iorder[0] = 3; 2417 iorder[1] = 2; 2418 iorder[2] = 1; 2419 iorder[3] = 0; 2420 } 2421 2422 while (size-- > 0) { 2423 Py_UCS4 ch = *s++; 2424 #ifndef Py_UNICODE_WIDE 2425 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 2426 Py_UCS4 ch2 = *s; 2427 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2428 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2429 s++; 2430 size--; 2431 } 2432 } 2433 #endif 2434 STORECHAR(ch); 2435 } 2436 return v; 2437 #undef STORECHAR 2438 } 2439 2440 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) 2441 { 2442 if (!PyUnicode_Check(unicode)) { 2443 PyErr_BadArgument(); 2444 return NULL; 2445 } 2446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 2447 PyUnicode_GET_SIZE(unicode), 2448 NULL, 2449 0); 2450 } 2451 2452 /* --- 
UTF-16 Codec ------------------------------------------------------- */ 2453 2454 PyObject * 2455 PyUnicode_DecodeUTF16(const char *s, 2456 Py_ssize_t size, 2457 const char *errors, 2458 int *byteorder) 2459 { 2460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 2461 } 2462 2463 PyObject * 2464 PyUnicode_DecodeUTF16Stateful(const char *s, 2465 Py_ssize_t size, 2466 const char *errors, 2467 int *byteorder, 2468 Py_ssize_t *consumed) 2469 { 2470 const char *starts = s; 2471 Py_ssize_t startinpos; 2472 Py_ssize_t endinpos; 2473 Py_ssize_t outpos; 2474 PyUnicodeObject *unicode; 2475 Py_UNICODE *p; 2476 const unsigned char *q, *e; 2477 int bo = 0; /* assume native ordering by default */ 2478 const char *errmsg = ""; 2479 /* Offsets from q for retrieving byte pairs in the right order. */ 2480 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2481 int ihi = 1, ilo = 0; 2482 #else 2483 int ihi = 0, ilo = 1; 2484 #endif 2485 PyObject *errorHandler = NULL; 2486 PyObject *exc = NULL; 2487 2488 /* Note: size will always be longer than the resulting Unicode 2489 character count */ 2490 unicode = _PyUnicode_New(size); 2491 if (!unicode) 2492 return NULL; 2493 if (size == 0) 2494 return (PyObject *)unicode; 2495 2496 /* Unpack UTF-16 encoded data */ 2497 p = unicode->str; 2498 q = (unsigned char *)s; 2499 e = q + size; 2500 2501 if (byteorder) 2502 bo = *byteorder; 2503 2504 /* Check for BOM marks (U+FEFF) in the input and adjust current 2505 byte order setting accordingly. In native mode, the leading BOM 2506 mark is skipped, in all other modes, it is copied to the output 2507 stream as-is (giving a ZWNBSP character). 
*/ 2508 if (bo == 0) { 2509 if (size >= 2) { 2510 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2511 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2512 if (bom == 0xFEFF) { 2513 q += 2; 2514 bo = -1; 2515 } 2516 else if (bom == 0xFFFE) { 2517 q += 2; 2518 bo = 1; 2519 } 2520 #else 2521 if (bom == 0xFEFF) { 2522 q += 2; 2523 bo = 1; 2524 } 2525 else if (bom == 0xFFFE) { 2526 q += 2; 2527 bo = -1; 2528 } 2529 #endif 2530 } 2531 } 2532 2533 if (bo == -1) { 2534 /* force LE */ 2535 ihi = 1; 2536 ilo = 0; 2537 } 2538 else if (bo == 1) { 2539 /* force BE */ 2540 ihi = 0; 2541 ilo = 1; 2542 } 2543 2544 while (q < e) { 2545 Py_UNICODE ch; 2546 /* remaining bytes at the end? (size should be even) */ 2547 if (e-q<2) { 2548 if (consumed) 2549 break; 2550 errmsg = "truncated data"; 2551 startinpos = ((const char *)q)-starts; 2552 endinpos = ((const char *)e)-starts; 2553 goto utf16Error; 2554 /* The remaining input chars are ignored if the callback 2555 chooses to skip the input */ 2556 } 2557 ch = (q[ihi] << 8) | q[ilo]; 2558 2559 q += 2; 2560 2561 if (ch < 0xD800 || ch > 0xDFFF) { 2562 *p++ = ch; 2563 continue; 2564 } 2565 2566 /* UTF-16 code pair: */ 2567 if (q >= e) { 2568 errmsg = "unexpected end of data"; 2569 startinpos = (((const char *)q)-2)-starts; 2570 endinpos = ((const char *)e)-starts; 2571 goto utf16Error; 2572 } 2573 if (0xD800 <= ch && ch <= 0xDBFF) { 2574 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2575 q += 2; 2576 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2577 #ifndef Py_UNICODE_WIDE 2578 *p++ = ch; 2579 *p++ = ch2; 2580 #else 2581 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2582 #endif 2583 continue; 2584 } 2585 else { 2586 errmsg = "illegal UTF-16 surrogate"; 2587 startinpos = (((const char *)q)-4)-starts; 2588 endinpos = startinpos+2; 2589 goto utf16Error; 2590 } 2591 2592 } 2593 errmsg = "illegal encoding"; 2594 startinpos = (((const char *)q)-2)-starts; 2595 endinpos = startinpos+2; 2596 /* Fall through to report the error */ 2597 2598 utf16Error: 2599 outpos 
= p-PyUnicode_AS_UNICODE(unicode); 2600 if (unicode_decode_call_errorhandler( 2601 errors, &errorHandler, 2602 "utf16", errmsg, 2603 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2604 &unicode, &outpos, &p)) 2605 goto onError; 2606 } 2607 2608 if (byteorder) 2609 *byteorder = bo; 2610 2611 if (consumed) 2612 *consumed = (const char *)q-starts; 2613 2614 /* Adjust length */ 2615 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2616 goto onError; 2617 2618 Py_XDECREF(errorHandler); 2619 Py_XDECREF(exc); 2620 return (PyObject *)unicode; 2621 2622 onError: 2623 Py_DECREF(unicode); 2624 Py_XDECREF(errorHandler); 2625 Py_XDECREF(exc); 2626 return NULL; 2627 } 2628 2629 PyObject * 2630 PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2631 Py_ssize_t size, 2632 const char *errors, 2633 int byteorder) 2634 { 2635 PyObject *v; 2636 unsigned char *p; 2637 Py_ssize_t nsize, bytesize; 2638 #ifdef Py_UNICODE_WIDE 2639 Py_ssize_t i, pairs; 2640 #else 2641 const int pairs = 0; 2642 #endif 2643 /* Offsets from p for storing byte pairs in the right order. 
*/ 2644 #ifdef BYTEORDER_IS_LITTLE_ENDIAN 2645 int ihi = 1, ilo = 0; 2646 #else 2647 int ihi = 0, ilo = 1; 2648 #endif 2649 2650 #define STORECHAR(CH) \ 2651 do { \ 2652 p[ihi] = ((CH) >> 8) & 0xff; \ 2653 p[ilo] = (CH) & 0xff; \ 2654 p += 2; \ 2655 } while(0) 2656 2657 #ifdef Py_UNICODE_WIDE 2658 for (i = pairs = 0; i < size; i++) 2659 if (s[i] >= 0x10000) 2660 pairs++; 2661 #endif 2662 /* 2 * (size + pairs + (byteorder == 0)) */ 2663 if (size > PY_SSIZE_T_MAX || 2664 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 2665 return PyErr_NoMemory(); 2666 nsize = size + pairs + (byteorder == 0); 2667 bytesize = nsize * 2; 2668 if (bytesize / 2 != nsize) 2669 return PyErr_NoMemory(); 2670 v = PyString_FromStringAndSize(NULL, bytesize); 2671 if (v == NULL) 2672 return NULL; 2673 2674 p = (unsigned char *)PyString_AS_STRING(v); 2675 if (byteorder == 0) 2676 STORECHAR(0xFEFF); 2677 if (size == 0) 2678 return v; 2679 2680 if (byteorder == -1) { 2681 /* force LE */ 2682 ihi = 1; 2683 ilo = 0; 2684 } 2685 else if (byteorder == 1) { 2686 /* force BE */ 2687 ihi = 0; 2688 ilo = 1; 2689 } 2690 2691 while (size-- > 0) { 2692 Py_UNICODE ch = *s++; 2693 Py_UNICODE ch2 = 0; 2694 #ifdef Py_UNICODE_WIDE 2695 if (ch >= 0x10000) { 2696 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2697 ch = 0xD800 | ((ch-0x10000) >> 10); 2698 } 2699 #endif 2700 STORECHAR(ch); 2701 if (ch2) 2702 STORECHAR(ch2); 2703 } 2704 return v; 2705 #undef STORECHAR 2706 } 2707 2708 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2709 { 2710 if (!PyUnicode_Check(unicode)) { 2711 PyErr_BadArgument(); 2712 return NULL; 2713 } 2714 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2715 PyUnicode_GET_SIZE(unicode), 2716 NULL, 2717 0); 2718 } 2719 2720 /* --- Unicode Escape Codec ----------------------------------------------- */ 2721 2722 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2723 2724 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2725 Py_ssize_t size, 2726 const char *errors) 2727 { 2728 
const char *starts = s; 2729 Py_ssize_t startinpos; 2730 Py_ssize_t endinpos; 2731 Py_ssize_t outpos; 2732 int i; 2733 PyUnicodeObject *v; 2734 Py_UNICODE *p; 2735 const char *end; 2736 char* message; 2737 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2738 PyObject *errorHandler = NULL; 2739 PyObject *exc = NULL; 2740 2741 /* Escaped strings will always be longer than the resulting 2742 Unicode string, so we start with size here and then reduce the 2743 length after conversion to the true value. 2744 (but if the error callback returns a long replacement string 2745 we'll have to allocate more space) */ 2746 v = _PyUnicode_New(size); 2747 if (v == NULL) 2748 goto onError; 2749 if (size == 0) 2750 return (PyObject *)v; 2751 2752 p = PyUnicode_AS_UNICODE(v); 2753 end = s + size; 2754 2755 while (s < end) { 2756 unsigned char c; 2757 Py_UNICODE x; 2758 int digits; 2759 2760 /* Non-escape characters are interpreted as Unicode ordinals */ 2761 if (*s != '\\') { 2762 *p++ = (unsigned char) *s++; 2763 continue; 2764 } 2765 2766 startinpos = s-starts; 2767 /* \ - Escapes */ 2768 s++; 2769 c = *s++; 2770 if (s > end) 2771 c = '\0'; /* Invalid after \ */ 2772 switch (c) { 2773 2774 /* \x escapes */ 2775 case '\n': break; 2776 case '\\': *p++ = '\\'; break; 2777 case '\'': *p++ = '\''; break; 2778 case '\"': *p++ = '\"'; break; 2779 case 'b': *p++ = '\b'; break; 2780 case 'f': *p++ = '\014'; break; /* FF */ 2781 case 't': *p++ = '\t'; break; 2782 case 'n': *p++ = '\n'; break; 2783 case 'r': *p++ = '\r'; break; 2784 case 'v': *p++ = '\013'; break; /* VT */ 2785 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2786 2787 /* \OOO (octal) escapes */ 2788 case '0': case '1': case '2': case '3': 2789 case '4': case '5': case '6': case '7': 2790 x = s[-1] - '0'; 2791 if (s < end && '0' <= *s && *s <= '7') { 2792 x = (x<<3) + *s++ - '0'; 2793 if (s < end && '0' <= *s && *s <= '7') 2794 x = (x<<3) + *s++ - '0'; 2795 } 2796 *p++ = x; 2797 break; 2798 2799 /* hex 
escapes */ 2800 /* \xXX */ 2801 case 'x': 2802 digits = 2; 2803 message = "truncated \\xXX escape"; 2804 goto hexescape; 2805 2806 /* \uXXXX */ 2807 case 'u': 2808 digits = 4; 2809 message = "truncated \\uXXXX escape"; 2810 goto hexescape; 2811 2812 /* \UXXXXXXXX */ 2813 case 'U': 2814 digits = 8; 2815 message = "truncated \\UXXXXXXXX escape"; 2816 hexescape: 2817 chr = 0; 2818 outpos = p-PyUnicode_AS_UNICODE(v); 2819 if (s+digits>end) { 2820 endinpos = size; 2821 if (unicode_decode_call_errorhandler( 2822 errors, &errorHandler, 2823 "unicodeescape", "end of string in escape sequence", 2824 starts, size, &startinpos, &endinpos, &exc, &s, 2825 &v, &outpos, &p)) 2826 goto onError; 2827 goto nextByte; 2828 } 2829 for (i = 0; i < digits; ++i) { 2830 c = (unsigned char) s[i]; 2831 if (!isxdigit(c)) { 2832 endinpos = (s+i+1)-starts; 2833 if (unicode_decode_call_errorhandler( 2834 errors, &errorHandler, 2835 "unicodeescape", message, 2836 starts, size, &startinpos, &endinpos, &exc, &s, 2837 &v, &outpos, &p)) 2838 goto onError; 2839 goto nextByte; 2840 } 2841 chr = (chr<<4) & ~0xF; 2842 if (c >= '0' && c <= '9') 2843 chr += c - '0'; 2844 else if (c >= 'a' && c <= 'f') 2845 chr += 10 + c - 'a'; 2846 else 2847 chr += 10 + c - 'A'; 2848 } 2849 s += i; 2850 if (chr == 0xffffffff && PyErr_Occurred()) 2851 /* _decoding_error will have already written into the 2852 target buffer. */ 2853 break; 2854 store: 2855 /* when we get here, chr is a 32-bit unicode character */ 2856 if (chr <= 0xffff) 2857 /* UCS-2 character */ 2858 *p++ = (Py_UNICODE) chr; 2859 else if (chr <= 0x10ffff) { 2860 /* UCS-4 character. Either store directly, or as 2861 surrogate pair. 
*/ 2862 #ifdef Py_UNICODE_WIDE 2863 *p++ = chr; 2864 #else 2865 chr -= 0x10000L; 2866 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2867 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2868 #endif 2869 } else { 2870 endinpos = s-starts; 2871 outpos = p-PyUnicode_AS_UNICODE(v); 2872 if (unicode_decode_call_errorhandler( 2873 errors, &errorHandler, 2874 "unicodeescape", "illegal Unicode character", 2875 starts, size, &startinpos, &endinpos, &exc, &s, 2876 &v, &outpos, &p)) 2877 goto onError; 2878 } 2879 break; 2880 2881 /* \N{name} */ 2882 case 'N': 2883 message = "malformed \\N character escape"; 2884 if (ucnhash_CAPI == NULL) { 2885 /* load the unicode data module */ 2886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 2887 if (ucnhash_CAPI == NULL) 2888 goto ucnhashError; 2889 } 2890 if (*s == '{') { 2891 const char *start = s+1; 2892 /* look for the closing brace */ 2893 while (*s != '}' && s < end) 2894 s++; 2895 if (s > start && s < end && *s == '}') { 2896 /* found a name. 
look it up in the unicode database */ 2897 message = "unknown Unicode character name"; 2898 s++; 2899 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2900 goto store; 2901 } 2902 } 2903 endinpos = s-starts; 2904 outpos = p-PyUnicode_AS_UNICODE(v); 2905 if (unicode_decode_call_errorhandler( 2906 errors, &errorHandler, 2907 "unicodeescape", message, 2908 starts, size, &startinpos, &endinpos, &exc, &s, 2909 &v, &outpos, &p)) 2910 goto onError; 2911 break; 2912 2913 default: 2914 if (s > end) { 2915 message = "\\ at end of string"; 2916 s--; 2917 endinpos = s-starts; 2918 outpos = p-PyUnicode_AS_UNICODE(v); 2919 if (unicode_decode_call_errorhandler( 2920 errors, &errorHandler, 2921 "unicodeescape", message, 2922 starts, size, &startinpos, &endinpos, &exc, &s, 2923 &v, &outpos, &p)) 2924 goto onError; 2925 } 2926 else { 2927 *p++ = '\\'; 2928 *p++ = (unsigned char)s[-1]; 2929 } 2930 break; 2931 } 2932 nextByte: 2933 ; 2934 } 2935 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2936 goto onError; 2937 Py_XDECREF(errorHandler); 2938 Py_XDECREF(exc); 2939 return (PyObject *)v; 2940 2941 ucnhashError: 2942 PyErr_SetString( 2943 PyExc_UnicodeError, 2944 "\\N escapes not supported (can't load unicodedata module)" 2945 ); 2946 Py_XDECREF(v); 2947 Py_XDECREF(errorHandler); 2948 Py_XDECREF(exc); 2949 return NULL; 2950 2951 onError: 2952 Py_XDECREF(v); 2953 Py_XDECREF(errorHandler); 2954 Py_XDECREF(exc); 2955 return NULL; 2956 } 2957 2958 /* Return a Unicode-Escape string version of the Unicode object. 2959 2960 If quotes is true, the string is enclosed in u"" or u'' quotes as 2961 appropriate. 
2962 2963 */ 2964 2965 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2966 Py_ssize_t size, 2967 Py_UNICODE ch) 2968 { 2969 /* like wcschr, but doesn't stop at NULL characters */ 2970 2971 while (size-- > 0) { 2972 if (*s == ch) 2973 return s; 2974 s++; 2975 } 2976 2977 return NULL; 2978 } 2979 2980 static 2981 PyObject *unicodeescape_string(const Py_UNICODE *s, 2982 Py_ssize_t size, 2983 int quotes) 2984 { 2985 PyObject *repr; 2986 char *p; 2987 2988 static const char *hexdigit = "0123456789abcdef"; 2989 #ifdef Py_UNICODE_WIDE 2990 const Py_ssize_t expandsize = 10; 2991 #else 2992 const Py_ssize_t expandsize = 6; 2993 #endif 2994 2995 /* XXX(nnorwitz): rather than over-allocating, it would be 2996 better to choose a different scheme. Perhaps scan the 2997 first N-chars of the string and allocate based on that size. 2998 */ 2999 /* Initial allocation is based on the longest-possible unichr 3000 escape. 3001 3002 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 3003 unichr, so in this case it's the longest unichr escape. In 3004 narrow (UTF-16) builds this is five chars per source unichr 3005 since there are two unichrs in the surrogate pair, so in narrow 3006 (UTF-16) builds it's not the longest unichr escape. 3007 3008 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 3009 so in the narrow (UTF-16) build case it's the longest unichr 3010 escape. 3011 */ 3012 3013 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 3014 return PyErr_NoMemory(); 3015 3016 repr = PyString_FromStringAndSize(NULL, 3017 2 3018 + expandsize*size 3019 + 1); 3020 if (repr == NULL) 3021 return NULL; 3022 3023 p = PyString_AS_STRING(repr); 3024 3025 if (quotes) { 3026 *p++ = 'u'; 3027 *p++ = (findchar(s, size, '\'') && 3028 !findchar(s, size, '"')) ? 
'"' : '\''; 3029 } 3030 while (size-- > 0) { 3031 Py_UNICODE ch = *s++; 3032 3033 /* Escape quotes and backslashes */ 3034 if ((quotes && 3035 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { 3036 *p++ = '\\'; 3037 *p++ = (char) ch; 3038 continue; 3039 } 3040 3041 #ifdef Py_UNICODE_WIDE 3042 /* Map 21-bit characters to '\U00xxxxxx' */ 3043 else if (ch >= 0x10000) { 3044 *p++ = '\\'; 3045 *p++ = 'U'; 3046 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 3047 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 3048 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 3049 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 3050 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 3051 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 3052 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 3053 *p++ = hexdigit[ch & 0x0000000F]; 3054 continue; 3055 } 3056 #else 3057 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 3058 else if (ch >= 0xD800 && ch < 0xDC00) { 3059 Py_UNICODE ch2; 3060 Py_UCS4 ucs; 3061 3062 ch2 = *s++; 3063 size--; 3064 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3065 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3066 *p++ = '\\'; 3067 *p++ = 'U'; 3068 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 3069 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 3070 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 3071 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 3072 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 3073 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 3074 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 3075 *p++ = hexdigit[ucs & 0x0000000F]; 3076 continue; 3077 } 3078 /* Fall through: isolated surrogates are copied as-is */ 3079 s--; 3080 size++; 3081 } 3082 #endif 3083 3084 /* Map 16-bit characters to '\uxxxx' */ 3085 if (ch >= 256) { 3086 *p++ = '\\'; 3087 *p++ = 'u'; 3088 *p++ = hexdigit[(ch >> 12) & 0x000F]; 3089 *p++ = hexdigit[(ch >> 8) & 0x000F]; 3090 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3091 *p++ = hexdigit[ch & 0x000F]; 3092 } 3093 3094 /* Map special whitespace to '\t', \n', '\r' */ 3095 else if (ch == '\t') { 
3096 *p++ = '\\'; 3097 *p++ = 't'; 3098 } 3099 else if (ch == '\n') { 3100 *p++ = '\\'; 3101 *p++ = 'n'; 3102 } 3103 else if (ch == '\r') { 3104 *p++ = '\\'; 3105 *p++ = 'r'; 3106 } 3107 3108 /* Map non-printable US ASCII to '\xhh' */ 3109 else if (ch < ' ' || ch >= 0x7F) { 3110 *p++ = '\\'; 3111 *p++ = 'x'; 3112 *p++ = hexdigit[(ch >> 4) & 0x000F]; 3113 *p++ = hexdigit[ch & 0x000F]; 3114 } 3115 3116 /* Copy everything else as-is */ 3117 else 3118 *p++ = (char) ch; 3119 } 3120 if (quotes) 3121 *p++ = PyString_AS_STRING(repr)[1]; 3122 3123 *p = '\0'; 3124 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 3125 return NULL; 3126 return repr; 3127 } 3128 3129 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 3130 Py_ssize_t size) 3131 { 3132 return unicodeescape_string(s, size, 0); 3133 } 3134 3135 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 3136 { 3137 if (!PyUnicode_Check(unicode)) { 3138 PyErr_BadArgument(); 3139 return NULL; 3140 } 3141 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3142 PyUnicode_GET_SIZE(unicode)); 3143 } 3144 3145 /* --- Raw Unicode Escape Codec ------------------------------------------- */ 3146 3147 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 3148 Py_ssize_t size, 3149 const char *errors) 3150 { 3151 const char *starts = s; 3152 Py_ssize_t startinpos; 3153 Py_ssize_t endinpos; 3154 Py_ssize_t outpos; 3155 PyUnicodeObject *v; 3156 Py_UNICODE *p; 3157 const char *end; 3158 const char *bs; 3159 PyObject *errorHandler = NULL; 3160 PyObject *exc = NULL; 3161 3162 /* Escaped strings will always be longer than the resulting 3163 Unicode string, so we start with size here and then reduce the 3164 length after conversion to the true value. 
(But decoding error 3165 handler might have to resize the string) */ 3166 v = _PyUnicode_New(size); 3167 if (v == NULL) 3168 goto onError; 3169 if (size == 0) 3170 return (PyObject *)v; 3171 p = PyUnicode_AS_UNICODE(v); 3172 end = s + size; 3173 while (s < end) { 3174 unsigned char c; 3175 Py_UCS4 x; 3176 int i; 3177 int count; 3178 3179 /* Non-escape characters are interpreted as Unicode ordinals */ 3180 if (*s != '\\') { 3181 *p++ = (unsigned char)*s++; 3182 continue; 3183 } 3184 startinpos = s-starts; 3185 3186 /* \u-escapes are only interpreted iff the number of leading 3187 backslashes if odd */ 3188 bs = s; 3189 for (;s < end;) { 3190 if (*s != '\\') 3191 break; 3192 *p++ = (unsigned char)*s++; 3193 } 3194 if (((s - bs) & 1) == 0 || 3195 s >= end || 3196 (*s != 'u' && *s != 'U')) { 3197 continue; 3198 } 3199 p--; 3200 count = *s=='u' ? 4 : 8; 3201 s++; 3202 3203 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 3204 outpos = p-PyUnicode_AS_UNICODE(v); 3205 for (x = 0, i = 0; i < count; ++i, ++s) { 3206 c = (unsigned char)*s; 3207 if (!isxdigit(c)) { 3208 endinpos = s-starts; 3209 if (unicode_decode_call_errorhandler( 3210 errors, &errorHandler, 3211 "rawunicodeescape", "truncated \\uXXXX", 3212 starts, size, &startinpos, &endinpos, &exc, &s, 3213 &v, &outpos, &p)) 3214 goto onError; 3215 goto nextByte; 3216 } 3217 x = (x<<4) & ~0xF; 3218 if (c >= '0' && c <= '9') 3219 x += c - '0'; 3220 else if (c >= 'a' && c <= 'f') 3221 x += 10 + c - 'a'; 3222 else 3223 x += 10 + c - 'A'; 3224 } 3225 if (x <= 0xffff) 3226 /* UCS-2 character */ 3227 *p++ = (Py_UNICODE) x; 3228 else if (x <= 0x10ffff) { 3229 /* UCS-4 character. Either store directly, or as 3230 surrogate pair. 
*/ 3231 #ifdef Py_UNICODE_WIDE 3232 *p++ = (Py_UNICODE) x; 3233 #else 3234 x -= 0x10000L; 3235 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 3236 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 3237 #endif 3238 } else { 3239 endinpos = s-starts; 3240 outpos = p-PyUnicode_AS_UNICODE(v); 3241 if (unicode_decode_call_errorhandler( 3242 errors, &errorHandler, 3243 "rawunicodeescape", "\\Uxxxxxxxx out of range", 3244 starts, size, &startinpos, &endinpos, &exc, &s, 3245 &v, &outpos, &p)) 3246 goto onError; 3247 } 3248 nextByte: 3249 ; 3250 } 3251 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3252 goto onError; 3253 Py_XDECREF(errorHandler); 3254 Py_XDECREF(exc); 3255 return (PyObject *)v; 3256 3257 onError: 3258 Py_XDECREF(v); 3259 Py_XDECREF(errorHandler); 3260 Py_XDECREF(exc); 3261 return NULL; 3262 } 3263 3264 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 3265 Py_ssize_t size) 3266 { 3267 PyObject *repr; 3268 char *p; 3269 char *q; 3270 3271 static const char *hexdigit = "0123456789abcdef"; 3272 #ifdef Py_UNICODE_WIDE 3273 const Py_ssize_t expandsize = 10; 3274 #else 3275 const Py_ssize_t expandsize = 6; 3276 #endif 3277 3278 if (size > PY_SSIZE_T_MAX / expandsize) 3279 return PyErr_NoMemory(); 3280 3281 repr = PyString_FromStringAndSize(NULL, expandsize * size); 3282 if (repr == NULL) 3283 return NULL; 3284 if (size == 0) 3285 return repr; 3286 3287 p = q = PyString_AS_STRING(repr); 3288 while (size-- > 0) { 3289 Py_UNICODE ch = *s++; 3290 #ifdef Py_UNICODE_WIDE 3291 /* Map 32-bit characters to '\Uxxxxxxxx' */ 3292 if (ch >= 0x10000) { 3293 *p++ = '\\'; 3294 *p++ = 'U'; 3295 *p++ = hexdigit[(ch >> 28) & 0xf]; 3296 *p++ = hexdigit[(ch >> 24) & 0xf]; 3297 *p++ = hexdigit[(ch >> 20) & 0xf]; 3298 *p++ = hexdigit[(ch >> 16) & 0xf]; 3299 *p++ = hexdigit[(ch >> 12) & 0xf]; 3300 *p++ = hexdigit[(ch >> 8) & 0xf]; 3301 *p++ = hexdigit[(ch >> 4) & 0xf]; 3302 *p++ = hexdigit[ch & 15]; 3303 } 3304 else 3305 #else 3306 /* Map UTF-16 surrogate pairs to 
'\U00xxxxxx' */ 3307 if (ch >= 0xD800 && ch < 0xDC00) { 3308 Py_UNICODE ch2; 3309 Py_UCS4 ucs; 3310 3311 ch2 = *s++; 3312 size--; 3313 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 3314 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 3315 *p++ = '\\'; 3316 *p++ = 'U'; 3317 *p++ = hexdigit[(ucs >> 28) & 0xf]; 3318 *p++ = hexdigit[(ucs >> 24) & 0xf]; 3319 *p++ = hexdigit[(ucs >> 20) & 0xf]; 3320 *p++ = hexdigit[(ucs >> 16) & 0xf]; 3321 *p++ = hexdigit[(ucs >> 12) & 0xf]; 3322 *p++ = hexdigit[(ucs >> 8) & 0xf]; 3323 *p++ = hexdigit[(ucs >> 4) & 0xf]; 3324 *p++ = hexdigit[ucs & 0xf]; 3325 continue; 3326 } 3327 /* Fall through: isolated surrogates are copied as-is */ 3328 s--; 3329 size++; 3330 } 3331 #endif 3332 /* Map 16-bit characters to '\uxxxx' */ 3333 if (ch >= 256) { 3334 *p++ = '\\'; 3335 *p++ = 'u'; 3336 *p++ = hexdigit[(ch >> 12) & 0xf]; 3337 *p++ = hexdigit[(ch >> 8) & 0xf]; 3338 *p++ = hexdigit[(ch >> 4) & 0xf]; 3339 *p++ = hexdigit[ch & 15]; 3340 } 3341 /* Copy everything else as-is */ 3342 else 3343 *p++ = (char) ch; 3344 } 3345 *p = '\0'; 3346 if (_PyString_Resize(&repr, p - q)) 3347 return NULL; 3348 return repr; 3349 } 3350 3351 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 3352 { 3353 if (!PyUnicode_Check(unicode)) { 3354 PyErr_BadArgument(); 3355 return NULL; 3356 } 3357 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 3358 PyUnicode_GET_SIZE(unicode)); 3359 } 3360 3361 /* --- Unicode Internal Codec ------------------------------------------- */ 3362 3363 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 3364 Py_ssize_t size, 3365 const char *errors) 3366 { 3367 const char *starts = s; 3368 Py_ssize_t startinpos; 3369 Py_ssize_t endinpos; 3370 Py_ssize_t outpos; 3371 PyUnicodeObject *v; 3372 Py_UNICODE *p; 3373 const char *end; 3374 const char *reason; 3375 PyObject *errorHandler = NULL; 3376 PyObject *exc = NULL; 3377 3378 #ifdef Py_UNICODE_WIDE 3379 Py_UNICODE unimax = PyUnicode_GetMax(); 3380 
#endif 3381 3382 /* XXX overflow detection missing */ 3383 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 3384 if (v == NULL) 3385 goto onError; 3386 if (PyUnicode_GetSize((PyObject *)v) == 0) 3387 return (PyObject *)v; 3388 p = PyUnicode_AS_UNICODE(v); 3389 end = s + size; 3390 3391 while (s < end) { 3392 memcpy(p, s, sizeof(Py_UNICODE)); 3393 /* We have to sanity check the raw data, otherwise doom looms for 3394 some malformed UCS-4 data. */ 3395 if ( 3396 #ifdef Py_UNICODE_WIDE 3397 *p > unimax || *p < 0 || 3398 #endif 3399 end-s < Py_UNICODE_SIZE 3400 ) 3401 { 3402 startinpos = s - starts; 3403 if (end-s < Py_UNICODE_SIZE) { 3404 endinpos = end-starts; 3405 reason = "truncated input"; 3406 } 3407 else { 3408 endinpos = s - starts + Py_UNICODE_SIZE; 3409 reason = "illegal code point (> 0x10FFFF)"; 3410 } 3411 outpos = p - PyUnicode_AS_UNICODE(v); 3412 if (unicode_decode_call_errorhandler( 3413 errors, &errorHandler, 3414 "unicode_internal", reason, 3415 starts, size, &startinpos, &endinpos, &exc, &s, 3416 &v, &outpos, &p)) { 3417 goto onError; 3418 } 3419 } 3420 else { 3421 p++; 3422 s += Py_UNICODE_SIZE; 3423 } 3424 } 3425 3426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3427 goto onError; 3428 Py_XDECREF(errorHandler); 3429 Py_XDECREF(exc); 3430 return (PyObject *)v; 3431 3432 onError: 3433 Py_XDECREF(v); 3434 Py_XDECREF(errorHandler); 3435 Py_XDECREF(exc); 3436 return NULL; 3437 } 3438 3439 /* --- Latin-1 Codec ------------------------------------------------------ */ 3440 3441 PyObject *PyUnicode_DecodeLatin1(const char *s, 3442 Py_ssize_t size, 3443 const char *errors) 3444 { 3445 PyUnicodeObject *v; 3446 Py_UNICODE *p; 3447 3448 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 3449 if (size == 1) { 3450 Py_UNICODE r = *(unsigned char*)s; 3451 return PyUnicode_FromUnicode(&r, 1); 3452 } 3453 3454 v = _PyUnicode_New(size); 3455 if (v == NULL) 3456 goto onError; 3457 if (size == 0) 3458 return (PyObject *)v; 3459 p = PyUnicode_AS_UNICODE(v); 3460 while (size-- > 0) 3461 *p++ = (unsigned char)*s++; 3462 return (PyObject *)v; 3463 3464 onError: 3465 Py_XDECREF(v); 3466 return NULL; 3467 } 3468 3469 /* create or adjust a UnicodeEncodeError */ 3470 static void make_encode_exception(PyObject **exceptionObject, 3471 const char *encoding, 3472 const Py_UNICODE *unicode, Py_ssize_t size, 3473 Py_ssize_t startpos, Py_ssize_t endpos, 3474 const char *reason) 3475 { 3476 if (*exceptionObject == NULL) { 3477 *exceptionObject = PyUnicodeEncodeError_Create( 3478 encoding, unicode, size, startpos, endpos, reason); 3479 } 3480 else { 3481 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 3482 goto onError; 3483 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 3484 goto onError; 3485 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 3486 goto onError; 3487 return; 3488 onError: 3489 Py_DECREF(*exceptionObject); 3490 *exceptionObject = NULL; 3491 } 3492 } 3493 3494 /* raises a UnicodeEncodeError */ 3495 static void raise_encode_exception(PyObject **exceptionObject, 3496 const char *encoding, 3497 const Py_UNICODE *unicode, Py_ssize_t size, 3498 Py_ssize_t startpos, Py_ssize_t endpos, 3499 const char *reason) 3500 { 3501 make_encode_exception(exceptionObject, 3502 encoding, unicode, size, startpos, endpos, reason); 3503 if (*exceptionObject != NULL) 3504 PyCodec_StrictErrors(*exceptionObject); 3505 } 3506 3507 /* error handling callback helper: 3508 build arguments, call the callback and check the arguments, 3509 put the result into newpos and return the replacement string, which 3510 has to be freed by the caller */ 3511 static PyObject *unicode_encode_call_errorhandler(const char *errors, 3512 PyObject **errorHandler, 
3513 const char *encoding, const char *reason, 3514 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3515 Py_ssize_t startpos, Py_ssize_t endpos, 3516 Py_ssize_t *newpos) 3517 { 3518 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3519 3520 PyObject *restuple; 3521 PyObject *resunicode; 3522 3523 if (*errorHandler == NULL) { 3524 *errorHandler = PyCodec_LookupError(errors); 3525 if (*errorHandler == NULL) 3526 return NULL; 3527 } 3528 3529 make_encode_exception(exceptionObject, 3530 encoding, unicode, size, startpos, endpos, reason); 3531 if (*exceptionObject == NULL) 3532 return NULL; 3533 3534 restuple = PyObject_CallFunctionObjArgs( 3535 *errorHandler, *exceptionObject, NULL); 3536 if (restuple == NULL) 3537 return NULL; 3538 if (!PyTuple_Check(restuple)) { 3539 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3540 Py_DECREF(restuple); 3541 return NULL; 3542 } 3543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3544 &resunicode, newpos)) { 3545 Py_DECREF(restuple); 3546 return NULL; 3547 } 3548 if (*newpos<0) 3549 *newpos = size+*newpos; 3550 if (*newpos<0 || *newpos>size) { 3551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3552 Py_DECREF(restuple); 3553 return NULL; 3554 } 3555 Py_INCREF(resunicode); 3556 Py_DECREF(restuple); 3557 return resunicode; 3558 } 3559 3560 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3561 Py_ssize_t size, 3562 const char *errors, 3563 int limit) 3564 { 3565 /* output object */ 3566 PyObject *res; 3567 /* pointers to the beginning and end+1 of input */ 3568 const Py_UNICODE *startp = p; 3569 const Py_UNICODE *endp = p + size; 3570 /* pointer to the beginning of the unencodable characters */ 3571 /* const Py_UNICODE *badp = NULL; */ 3572 /* pointer into the output */ 3573 char *str; 3574 /* current output position */ 3575 Py_ssize_t respos = 0; 3576 Py_ssize_t ressize; 3577 const char *encoding = (limit == 
256) ? "latin-1" : "ascii"; 3578 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3579 PyObject *errorHandler = NULL; 3580 PyObject *exc = NULL; 3581 /* the following variable is used for caching string comparisons 3582 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3583 int known_errorHandler = -1; 3584 3585 /* allocate enough for a simple encoding without 3586 replacements, if we need more, we'll resize */ 3587 res = PyString_FromStringAndSize(NULL, size); 3588 if (res == NULL) 3589 goto onError; 3590 if (size == 0) 3591 return res; 3592 str = PyString_AS_STRING(res); 3593 ressize = size; 3594 3595 while (p<endp) { 3596 Py_UNICODE c = *p; 3597 3598 /* can we encode this? */ 3599 if (c<limit) { 3600 /* no overflow check, because we know that the space is enough */ 3601 *str++ = (char)c; 3602 ++p; 3603 } 3604 else { 3605 Py_ssize_t unicodepos = p-startp; 3606 Py_ssize_t requiredsize; 3607 PyObject *repunicode; 3608 Py_ssize_t repsize; 3609 Py_ssize_t newpos; 3610 Py_ssize_t respos; 3611 Py_UNICODE *uni2; 3612 /* startpos for collecting unencodable chars */ 3613 const Py_UNICODE *collstart = p; 3614 const Py_UNICODE *collend = p; 3615 /* find all unecodable characters */ 3616 while ((collend < endp) && ((*collend)>=limit)) 3617 ++collend; 3618 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3619 if (known_errorHandler==-1) { 3620 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3621 known_errorHandler = 1; 3622 else if (!strcmp(errors, "replace")) 3623 known_errorHandler = 2; 3624 else if (!strcmp(errors, "ignore")) 3625 known_errorHandler = 3; 3626 else if (!strcmp(errors, "xmlcharrefreplace")) 3627 known_errorHandler = 4; 3628 else 3629 known_errorHandler = 0; 3630 } 3631 switch (known_errorHandler) { 3632 case 1: /* strict */ 3633 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3634 goto onError; 3635 case 2: /* replace */ 3636 while (collstart++<collend) 3637 *str++ = '?'; /* fall through */ 3638 case 3: /* ignore */ 3639 p = collend; 3640 break; 3641 case 4: /* xmlcharrefreplace */ 3642 respos = str-PyString_AS_STRING(res); 3643 /* determine replacement size (temporarily (mis)uses p) */ 3644 for (p = collstart, repsize = 0; p < collend; ++p) { 3645 if (*p<10) 3646 repsize += 2+1+1; 3647 else if (*p<100) 3648 repsize += 2+2+1; 3649 else if (*p<1000) 3650 repsize += 2+3+1; 3651 else if (*p<10000) 3652 repsize += 2+4+1; 3653 #ifndef Py_UNICODE_WIDE 3654 else 3655 repsize += 2+5+1; 3656 #else 3657 else if (*p<100000) 3658 repsize += 2+5+1; 3659 else if (*p<1000000) 3660 repsize += 2+6+1; 3661 else 3662 repsize += 2+7+1; 3663 #endif 3664 } 3665 requiredsize = respos+repsize+(endp-collend); 3666 if (requiredsize > ressize) { 3667 if (requiredsize<2*ressize) 3668 requiredsize = 2*ressize; 3669 if (_PyString_Resize(&res, requiredsize)) 3670 goto onError; 3671 str = PyString_AS_STRING(res) + respos; 3672 ressize = requiredsize; 3673 } 3674 /* generate replacement (temporarily (mis)uses p) */ 3675 for (p = collstart; p < collend; ++p) { 3676 str += sprintf(str, "&#%d;", (int)*p); 3677 } 3678 p = collend; 3679 break; 3680 default: 3681 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3682 encoding, reason, startp, size, &exc, 3683 collstart-startp, collend-startp, 
&newpos); 3684 if (repunicode == NULL) 3685 goto onError; 3686 /* need more space? (at least enough for what we have+the 3687 replacement+the rest of the string, so we won't have to 3688 check space for encodable characters) */ 3689 respos = str-PyString_AS_STRING(res); 3690 repsize = PyUnicode_GET_SIZE(repunicode); 3691 requiredsize = respos+repsize+(endp-collend); 3692 if (requiredsize > ressize) { 3693 if (requiredsize<2*ressize) 3694 requiredsize = 2*ressize; 3695 if (_PyString_Resize(&res, requiredsize)) { 3696 Py_DECREF(repunicode); 3697 goto onError; 3698 } 3699 str = PyString_AS_STRING(res) + respos; 3700 ressize = requiredsize; 3701 } 3702 /* check if there is anything unencodable in the replacement 3703 and copy it to the output */ 3704 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3705 c = *uni2; 3706 if (c >= limit) { 3707 raise_encode_exception(&exc, encoding, startp, size, 3708 unicodepos, unicodepos+1, reason); 3709 Py_DECREF(repunicode); 3710 goto onError; 3711 } 3712 *str = (char)c; 3713 } 3714 p = startp + newpos; 3715 Py_DECREF(repunicode); 3716 } 3717 } 3718 } 3719 /* Resize if we allocated to much */ 3720 respos = str-PyString_AS_STRING(res); 3721 if (respos<ressize) 3722 /* If this falls res will be NULL */ 3723 _PyString_Resize(&res, respos); 3724 Py_XDECREF(errorHandler); 3725 Py_XDECREF(exc); 3726 return res; 3727 3728 onError: 3729 Py_XDECREF(res); 3730 Py_XDECREF(errorHandler); 3731 Py_XDECREF(exc); 3732 return NULL; 3733 } 3734 3735 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3736 Py_ssize_t size, 3737 const char *errors) 3738 { 3739 return unicode_encode_ucs1(p, size, errors, 256); 3740 } 3741 3742 PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3743 { 3744 if (!PyUnicode_Check(unicode)) { 3745 PyErr_BadArgument(); 3746 return NULL; 3747 } 3748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3749 PyUnicode_GET_SIZE(unicode), 3750 NULL); 3751 } 3752 3753 /* --- 7-bit ASCII Codec 
-------------------------------------------------- */ 3754 3755 PyObject *PyUnicode_DecodeASCII(const char *s, 3756 Py_ssize_t size, 3757 const char *errors) 3758 { 3759 const char *starts = s; 3760 PyUnicodeObject *v; 3761 Py_UNICODE *p; 3762 Py_ssize_t startinpos; 3763 Py_ssize_t endinpos; 3764 Py_ssize_t outpos; 3765 const char *e; 3766 PyObject *errorHandler = NULL; 3767 PyObject *exc = NULL; 3768 3769 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3770 if (size == 1 && *(unsigned char*)s < 128) { 3771 Py_UNICODE r = *(unsigned char*)s; 3772 return PyUnicode_FromUnicode(&r, 1); 3773 } 3774 3775 v = _PyUnicode_New(size); 3776 if (v == NULL) 3777 goto onError; 3778 if (size == 0) 3779 return (PyObject *)v; 3780 p = PyUnicode_AS_UNICODE(v); 3781 e = s + size; 3782 while (s < e) { 3783 register unsigned char c = (unsigned char)*s; 3784 if (c < 128) { 3785 *p++ = c; 3786 ++s; 3787 } 3788 else { 3789 startinpos = s-starts; 3790 endinpos = startinpos + 1; 3791 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3792 if (unicode_decode_call_errorhandler( 3793 errors, &errorHandler, 3794 "ascii", "ordinal not in range(128)", 3795 starts, size, &startinpos, &endinpos, &exc, &s, 3796 &v, &outpos, &p)) 3797 goto onError; 3798 } 3799 } 3800 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 3801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3802 goto onError; 3803 Py_XDECREF(errorHandler); 3804 Py_XDECREF(exc); 3805 return (PyObject *)v; 3806 3807 onError: 3808 Py_XDECREF(v); 3809 Py_XDECREF(errorHandler); 3810 Py_XDECREF(exc); 3811 return NULL; 3812 } 3813 3814 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3815 Py_ssize_t size, 3816 const char *errors) 3817 { 3818 return unicode_encode_ucs1(p, size, errors, 128); 3819 } 3820 3821 PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3822 { 3823 if (!PyUnicode_Check(unicode)) { 3824 PyErr_BadArgument(); 3825 return NULL; 3826 } 3827 return 
PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 3828 PyUnicode_GET_SIZE(unicode), 3829 NULL); 3830 } 3831 3832 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 3833 3834 /* --- MBCS codecs for Windows -------------------------------------------- */ 3835 3836 #if SIZEOF_INT < SIZEOF_SIZE_T 3837 #define NEED_RETRY 3838 #endif 3839 3840 /* XXX This code is limited to "true" double-byte encodings, as 3841 a) it assumes an incomplete character consists of a single byte, and 3842 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 3843 encodings, see IsDBCSLeadByteEx documentation. */ 3844 3845 static int is_dbcs_lead_byte(const char *s, int offset) 3846 { 3847 const char *curr = s + offset; 3848 3849 if (IsDBCSLeadByte(*curr)) { 3850 const char *prev = CharPrev(s, curr); 3851 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 3852 } 3853 return 0; 3854 } 3855 3856 /* 3857 * Decode MBCS string into unicode object. If 'final' is set, converts 3858 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
3859 */ 3860 static int decode_mbcs(PyUnicodeObject **v, 3861 const char *s, /* MBCS string */ 3862 int size, /* sizeof MBCS string */ 3863 int final) 3864 { 3865 Py_UNICODE *p; 3866 Py_ssize_t n = 0; 3867 int usize = 0; 3868 3869 assert(size >= 0); 3870 3871 /* Skip trailing lead-byte unless 'final' is set */ 3872 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 3873 --size; 3874 3875 /* First get the size of the result */ 3876 if (size > 0) { 3877 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 3878 if (usize == 0) { 3879 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3880 return -1; 3881 } 3882 } 3883 3884 if (*v == NULL) { 3885 /* Create unicode object */ 3886 *v = _PyUnicode_New(usize); 3887 if (*v == NULL) 3888 return -1; 3889 } 3890 else { 3891 /* Extend unicode object */ 3892 n = PyUnicode_GET_SIZE(*v); 3893 if (_PyUnicode_Resize(v, n + usize) < 0) 3894 return -1; 3895 } 3896 3897 /* Do the conversion */ 3898 if (size > 0) { 3899 p = PyUnicode_AS_UNICODE(*v) + n; 3900 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 3901 PyErr_SetFromWindowsErrWithFilename(0, NULL); 3902 return -1; 3903 } 3904 } 3905 3906 return size; 3907 } 3908 3909 PyObject *PyUnicode_DecodeMBCSStateful(const char *s, 3910 Py_ssize_t size, 3911 const char *errors, 3912 Py_ssize_t *consumed) 3913 { 3914 PyUnicodeObject *v = NULL; 3915 int done; 3916 3917 if (consumed) 3918 *consumed = 0; 3919 3920 #ifdef NEED_RETRY 3921 retry: 3922 if (size > INT_MAX) 3923 done = decode_mbcs(&v, s, INT_MAX, 0); 3924 else 3925 #endif 3926 done = decode_mbcs(&v, s, (int)size, !consumed); 3927 3928 if (done < 0) { 3929 Py_XDECREF(v); 3930 return NULL; 3931 } 3932 3933 if (consumed) 3934 *consumed += done; 3935 3936 #ifdef NEED_RETRY 3937 if (size > INT_MAX) { 3938 s += done; 3939 size -= done; 3940 goto retry; 3941 } 3942 #endif 3943 3944 return (PyObject *)v; 3945 } 3946 3947 PyObject *PyUnicode_DecodeMBCS(const char *s, 3948 Py_ssize_t size, 3949 const char *errors) 3950 { 
3951 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 3952 } 3953 3954 /* 3955 * Convert unicode into string object (MBCS). 3956 * Returns 0 if succeed, -1 otherwise. 3957 */ 3958