Python-2.7.3/Modules/unicodedata.c

No issues found

   1 /* ------------------------------------------------------------------------
   2 
   3    unicodedata -- Provides access to the Unicode 5.2 database.
   4 
   5    Data was extracted from the Unicode 5.2 UnicodeData.txt file.
   6 
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9    Modified by Martin v. Löwis (martin@v.loewis.de)
  10 
  11    Copyright (c) Corporation for National Research Initiatives.
  12 
  13    ------------------------------------------------------------------------ */
  14 
  15 #include "Python.h"
  16 #include "ucnhash.h"
  17 #include "structmember.h"
  18 
  19 /* character properties */
  20 
  21 typedef struct {
  22     const unsigned char category;       /* index into
  23                                            _PyUnicode_CategoryNames */
  24     const unsigned char combining;      /* combining class value 0 - 255 */
  25     const unsigned char bidirectional;  /* index into
  26                                            _PyUnicode_BidirectionalNames */
  27     const unsigned char mirrored;       /* true if mirrored in bidir mode */
  28     const unsigned char east_asian_width;       /* index into
  29                                                    _PyUnicode_EastAsianWidth */
  30     const unsigned char normalization_quick_check; /* see is_normalized() */
  31 } _PyUnicode_DatabaseRecord;
  32 
  33 typedef struct change_record {
  34     /* sequence of fields should be the same as in merge_old_version */
  35     const unsigned char bidir_changed;
  36     const unsigned char category_changed;
  37     const unsigned char decimal_changed;
  38     const unsigned char mirrored_changed;
  39     const double numeric_changed;
  40 } change_record;
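     /* A change_record describes how a character differed in an older
        Unicode version relative to the current database: a value of 0xFF
        in one of the byte-sized *_changed fields means "unchanged", and
        category_changed == 0 means the character was unassigned in the
        old version (see the accessor functions below). */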
  41 
  42 /* data file generated by Tools/unicode/makeunicodedata.py */
  43 #include "unicodedata_db.h"
  44 
  45 static const _PyUnicode_DatabaseRecord*
  46 _getrecord_ex(Py_UCS4 code)
  47 {
  48     int index;
  49     if (code >= 0x110000)
  50         index = 0;
  51     else {
  52         index = index1[(code>>SHIFT)];
  53         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  54     }
  55 
  56     return &_PyUnicode_Database_Records[index];
  57 }
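     /* The lookup above is a two-level trie generated by
        makeunicodedata.py: index1 maps the upper bits of the code point to
        a block, and index2 maps (block, lower bits) to an index into
        _PyUnicode_Database_Records.  Out-of-range code points
        (>= 0x110000) simply map to record 0. */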
  58 
  59 /* ------------- Previous-version API ------------------------------------- */
  60 typedef struct previous_version {
  61     PyObject_HEAD
  62     const char *name;
  63     const change_record* (*getrecord)(Py_UCS4);
  64     Py_UCS4 (*normalization)(Py_UCS4);
  65 } PreviousDBVersion;
  66 
  67 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
  68 
  69 static PyMemberDef DB_members[] = {
  70         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
  71         {NULL}
  72 };
  73 
  74 /* forward declaration */
  75 static PyTypeObject UCD_Type;
  76 
  77 static PyObject*
  78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
  79                      Py_UCS4 (*normalization)(Py_UCS4))
  80 {
  81         PreviousDBVersion *self;
  82         self = PyObject_New(PreviousDBVersion, &UCD_Type);
  83         if (self == NULL)
  84                 return NULL;
  85         self->name = name;
  86         self->getrecord = getrecord;
  87         self->normalization = normalization;
  88         return (PyObject*)self;
  89 }
  90 
  91 
  92 static Py_UCS4 getuchar(PyUnicodeObject *obj)
  93 {
  94     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
  95 
  96     if (PyUnicode_GET_SIZE(obj) == 1)
  97         return *v;
  98 #ifndef Py_UNICODE_WIDE
  99     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
 100              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
 101              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
 102         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
 103 #endif
 104     PyErr_SetString(PyExc_TypeError,
 105                     "need a single Unicode character as parameter");
 106     return (Py_UCS4)-1;
 107 }
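     /* On narrow (UCS-2) builds a non-BMP character is stored as a
        surrogate pair, so getuchar() also accepts a length-2 string made
        of a high surrogate (U+D800..U+DBFF) followed by a low surrogate
        (U+DC00..U+DFFF) and reassembles the corresponding UCS-4 value. */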
 108 
 109 /* --- Module API --------------------------------------------------------- */
 110 
 111 PyDoc_STRVAR(unicodedata_decimal__doc__,
 112 "decimal(unichr[, default])\n\
 113 \n\
 114 Returns the decimal value assigned to the Unicode character unichr\n\
 115 as integer. If no such value is defined, default is returned, or, if\n\
 116 not given, ValueError is raised.");
 117 
 118 static PyObject *
 119 unicodedata_decimal(PyObject *self, PyObject *args)
 120 {
 121     PyUnicodeObject *v;
 122     PyObject *defobj = NULL;
 123     int have_old = 0;
 124     long rc;
 125     Py_UCS4 c;
 126 
 127     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
 128         return NULL;
 129     c = getuchar(v);
 130     if (c == (Py_UCS4)-1)
 131         return NULL;
 132 
 133     if (self) {
 134         const change_record *old = get_old_record(self, c);
 135         if (old->category_changed == 0) {
 136             /* unassigned */
 137             have_old = 1;
 138             rc = -1;
 139         }
 140         else if (old->decimal_changed != 0xFF) {
 141             have_old = 1;
 142             rc = old->decimal_changed;
 143         }
 144     }
 145 
 146     if (!have_old)
 147         rc = Py_UNICODE_TODECIMAL(c);
 148     if (rc < 0) {
 149         if (defobj == NULL) {
 150             PyErr_SetString(PyExc_ValueError,
 151                             "not a decimal");
 152             return NULL;
 153         }
 154         else {
 155             Py_INCREF(defobj);
 156             return defobj;
 157         }
 158     }
 159     return PyInt_FromLong(rc);
 160 }
 161 
 162 PyDoc_STRVAR(unicodedata_digit__doc__,
 163 "digit(unichr[, default])\n\
 164 \n\
 165 Returns the digit value assigned to the Unicode character unichr as\n\
 166 integer. If no such value is defined, default is returned, or, if\n\
 167 not given, ValueError is raised.");
 168 
 169 static PyObject *
 170 unicodedata_digit(PyObject *self, PyObject *args)
 171 {
 172     PyUnicodeObject *v;
 173     PyObject *defobj = NULL;
 174     long rc;
 175     Py_UCS4 c;
 176 
 177     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 178         return NULL;
 179     c = getuchar(v);
 180     if (c == (Py_UCS4)-1)
 181         return NULL;
 182     rc = Py_UNICODE_TODIGIT(c);
 183     if (rc < 0) {
 184         if (defobj == NULL) {
 185             PyErr_SetString(PyExc_ValueError, "not a digit");
 186             return NULL;
 187         }
 188         else {
 189             Py_INCREF(defobj);
 190             return defobj;
 191         }
 192     }
 193     return PyInt_FromLong(rc);
 194 }
 195 
 196 PyDoc_STRVAR(unicodedata_numeric__doc__,
 197 "numeric(unichr[, default])\n\
 198 \n\
 199 Returns the numeric value assigned to the Unicode character unichr\n\
 200 as float. If no such value is defined, default is returned, or, if\n\
 201 not given, ValueError is raised.");
 202 
 203 static PyObject *
 204 unicodedata_numeric(PyObject *self, PyObject *args)
 205 {
 206     PyUnicodeObject *v;
 207     PyObject *defobj = NULL;
 208     int have_old = 0;
 209     double rc;
 210     Py_UCS4 c;
 211 
 212     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 213         return NULL;
 214     c = getuchar(v);
 215     if (c == (Py_UCS4)-1)
 216         return NULL;
 217 
 218     if (self) {
 219         const change_record *old = get_old_record(self, c);
 220         if (old->category_changed == 0) {
 221             /* unassigned */
 222             have_old = 1;
 223             rc = -1.0;
 224         }
 225         else if (old->decimal_changed != 0xFF) {
 226             have_old = 1;
 227             rc = old->decimal_changed;
 228         }
 229     }
 230 
 231     if (!have_old)
 232         rc = Py_UNICODE_TONUMERIC(c);
 233     if (rc == -1.0) {
 234         if (defobj == NULL) {
 235             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 236             return NULL;
 237         }
 238         else {
 239             Py_INCREF(defobj);
 240             return defobj;
 241         }
 242     }
 243     return PyFloat_FromDouble(rc);
 244 }
 245 
 246 PyDoc_STRVAR(unicodedata_category__doc__,
 247 "category(unichr)\n\
 248 \n\
 249 Returns the general category assigned to the Unicode character\n\
 250 unichr as string.");
 251 
 252 static PyObject *
 253 unicodedata_category(PyObject *self, PyObject *args)
 254 {
 255     PyUnicodeObject *v;
 256     int index;
 257     Py_UCS4 c;
 258 
 259     if (!PyArg_ParseTuple(args, "O!:category",
 260                           &PyUnicode_Type, &v))
 261         return NULL;
 262     c = getuchar(v);
 263     if (c == (Py_UCS4)-1)
 264         return NULL;
 265     index = (int) _getrecord_ex(c)->category;
 266     if (self) {
 267         const change_record *old = get_old_record(self, c);
 268         if (old->category_changed != 0xFF)
 269             index = old->category_changed;
 270     }
 271     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 272 }
 273 
 274 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
 275 "bidirectional(unichr)\n\
 276 \n\
 277 Returns the bidirectional category assigned to the Unicode character\n\
 278 unichr as string. If no such value is defined, an empty string is\n\
 279 returned.");
 280 
 281 static PyObject *
 282 unicodedata_bidirectional(PyObject *self, PyObject *args)
 283 {
 284     PyUnicodeObject *v;
 285     int index;
 286     Py_UCS4 c;
 287 
 288     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 289                           &PyUnicode_Type, &v))
 290         return NULL;
 291     c = getuchar(v);
 292     if (c == (Py_UCS4)-1)
 293         return NULL;
 294     index = (int) _getrecord_ex(c)->bidirectional;
 295     if (self) {
 296         const change_record *old = get_old_record(self, c);
 297         if (old->category_changed == 0)
 298             index = 0; /* unassigned */
 299         else if (old->bidir_changed != 0xFF)
 300             index = old->bidir_changed;
 301     }
 302     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 303 }
 304 
 305 PyDoc_STRVAR(unicodedata_combining__doc__,
 306 "combining(unichr)\n\
 307 \n\
 308 Returns the canonical combining class assigned to the Unicode\n\
 309 character unichr as integer. Returns 0 if no combining class is\n\
 310 defined.");
 311 
 312 static PyObject *
 313 unicodedata_combining(PyObject *self, PyObject *args)
 314 {
 315     PyUnicodeObject *v;
 316     int index;
 317     Py_UCS4 c;
 318 
 319     if (!PyArg_ParseTuple(args, "O!:combining",
 320                           &PyUnicode_Type, &v))
 321         return NULL;
 322     c = getuchar(v);
 323     if (c == (Py_UCS4)-1)
 324         return NULL;
 325     index = (int) _getrecord_ex(c)->combining;
 326     if (self) {
 327         const change_record *old = get_old_record(self, c);
 328         if (old->category_changed == 0)
 329             index = 0; /* unassigned */
 330     }
 331     return PyInt_FromLong(index);
 332 }
 333 
 334 PyDoc_STRVAR(unicodedata_mirrored__doc__,
 335 "mirrored(unichr)\n\
 336 \n\
 337 Returns the mirrored property assigned to the Unicode character\n\
 338 unichr as integer. Returns 1 if the character has been identified as\n\
 339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
 340 
 341 static PyObject *
 342 unicodedata_mirrored(PyObject *self, PyObject *args)
 343 {
 344     PyUnicodeObject *v;
 345     int index;
 346     Py_UCS4 c;
 347 
 348     if (!PyArg_ParseTuple(args, "O!:mirrored",
 349                           &PyUnicode_Type, &v))
 350         return NULL;
 351     c = getuchar(v);
 352     if (c == (Py_UCS4)-1)
 353         return NULL;
 354     index = (int) _getrecord_ex(c)->mirrored;
 355     if (self) {
 356         const change_record *old = get_old_record(self, c);
 357         if (old->category_changed == 0)
 358             index = 0; /* unassigned */
 359         else if (old->mirrored_changed != 0xFF)
 360             index = old->mirrored_changed;
 361     }
 362     return PyInt_FromLong(index);
 363 }
 364 
 365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
 366 "east_asian_width(unichr)\n\
 367 \n\
 368 Returns the east asian width assigned to the Unicode character\n\
 369 unichr as string.");
 370 
 371 static PyObject *
 372 unicodedata_east_asian_width(PyObject *self, PyObject *args)
 373 {
 374     PyUnicodeObject *v;
 375     int index;
 376     Py_UCS4 c;
 377 
 378     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 379                           &PyUnicode_Type, &v))
 380         return NULL;
 381     c = getuchar(v);
 382     if (c == (Py_UCS4)-1)
 383         return NULL;
 384     index = (int) _getrecord_ex(c)->east_asian_width;
 385     if (self) {
 386         const change_record *old = get_old_record(self, c);
 387         if (old->category_changed == 0)
 388             index = 0; /* unassigned */
 389     }
 390     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 391 }
 392 
 393 PyDoc_STRVAR(unicodedata_decomposition__doc__,
 394 "decomposition(unichr)\n\
 395 \n\
 396 Returns the character decomposition mapping assigned to the Unicode\n\
 397 character unichr as string. An empty string is returned in case no\n\
 398 such mapping is defined.");
 399 
 400 static PyObject *
 401 unicodedata_decomposition(PyObject *self, PyObject *args)
 402 {
 403     PyUnicodeObject *v;
 404     char decomp[256];
 405     int code, index, count, i;
 406     unsigned int prefix_index;
 407     Py_UCS4 c;
 408 
 409     if (!PyArg_ParseTuple(args, "O!:decomposition",
 410                           &PyUnicode_Type, &v))
 411         return NULL;
 412     c = getuchar(v);
 413     if (c == (Py_UCS4)-1)
 414         return NULL;
 415 
 416     code = (int)c;
 417 
 418     if (self) {
 419         const change_record *old = get_old_record(self, c);
 420         if (old->category_changed == 0)
 421             return PyString_FromString(""); /* unassigned */
 422     }
 423 
 424     if (code < 0 || code >= 0x110000)
 425         index = 0;
 426     else {
 427         index = decomp_index1[(code>>DECOMP_SHIFT)];
 428         index = decomp_index2[(index<<DECOMP_SHIFT)+
 429                              (code&((1<<DECOMP_SHIFT)-1))];
 430     }
 431 
 432     /* high byte is number of hex bytes (usually one or two), low byte
 433        is prefix code (index into decomp_prefix) */
 434     count = decomp_data[index] >> 8;
 435 
 436     /* XXX: could allocate the PyString up front instead
 437        (strlen(prefix) + 5 * count + 1 bytes) */
 438 
 439     /* Based on how index is calculated above and decomp_data is generated
 440        from Tools/unicode/makeunicodedata.py, it should not be possible
 441        to overflow decomp_prefix. */
 442     prefix_index = decomp_data[index] & 255;
 443     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
 444 
 445     /* copy prefix */
 446     i = strlen(decomp_prefix[prefix_index]);
 447     memcpy(decomp, decomp_prefix[prefix_index], i);
 448 
 449     while (count-- > 0) {
 450         if (i)
 451             decomp[i++] = ' ';
 452         assert((size_t)i < sizeof(decomp));
 453         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 454                       decomp_data[++index]);
 455         i += strlen(decomp + i);
 456     }
 457 
 458     decomp[i] = '\0';
 459 
 460     return PyString_FromString(decomp);
 461 }
 462 
 463 static void
 464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 465 {
 466     if (code >= 0x110000) {
 467         *index = 0;
 468     } else if (self && get_old_record(self, code)->category_changed==0) {
 469         /* unassigned in old version */
 470         *index = 0;
 471     }
 472     else {
 473         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 474         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 475                                (code&((1<<DECOMP_SHIFT)-1))];
 476     }
 477 
 478     /* high byte is number of hex bytes (usually one or two), low byte
 479        is prefix code (index into decomp_prefix) */
 480     *count = decomp_data[*index] >> 8;
 481     *prefix = decomp_data[*index] & 255;
 482 
 483     (*index)++;
 484 }
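     /* Layout of a decomposition entry: decomp_data[index] packs the
        length of the decomposition into its high bits and an index into
        decomp_prefix (the compatibility tag, 0 for a canonical
        decomposition) into its low byte; the decomposed code points are
        stored in the following `count` slots of decomp_data. */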
 485 
 486 #define SBase   0xAC00
 487 #define LBase   0x1100
 488 #define VBase   0x1161
 489 #define TBase   0x11A7
 490 #define LCount  19
 491 #define VCount  21
 492 #define TCount  28
 493 #define NCount  (VCount*TCount)
 494 #define SCount  (LCount*NCount)
 495 
 496 static PyObject*
 497 nfd_nfkd(PyObject *self, PyObject *input, int k)
 498 {
 499     PyObject *result;
 500     Py_UNICODE *i, *end, *o;
 501     /* Longest decomposition in Unicode 3.2: U+FDFA */
 502     Py_UNICODE stack[20];
 503     Py_ssize_t space, isize;
 504     int index, prefix, count, stackptr;
 505     unsigned char prev, cur;
 506 
 507     stackptr = 0;
 508     isize = PyUnicode_GET_SIZE(input);
 509     /* Overallocate by at most 10 characters. */
 510     space = (isize > 10 ? 10 : isize) + isize;
 511     result = PyUnicode_FromUnicode(NULL, space);
 512     if (!result)
 513         return NULL;
 514     i = PyUnicode_AS_UNICODE(input);
 515     end = i + isize;
 516     o = PyUnicode_AS_UNICODE(result);
 517 
 518     while (i < end) {
 519         stack[stackptr++] = *i++;
 520         while(stackptr) {
 521             Py_UNICODE code = stack[--stackptr];
 522             /* Hangul Decomposition adds three characters in
 523                a single step, so we need at least that much room. */
 524             if (space < 3) {
 525                 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
 526                 space += 10;
 527                 if (PyUnicode_Resize(&result, newsize) == -1)
 528                     return NULL;
 529                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
 530             }
 531             /* Hangul Decomposition. */
 532             if (SBase <= code && code < (SBase+SCount)) {
 533                 int SIndex = code - SBase;
 534                 int L = LBase + SIndex / NCount;
 535                 int V = VBase + (SIndex % NCount) / TCount;
 536                 int T = TBase + SIndex % TCount;
 537                 *o++ = L;
 538                 *o++ = V;
 539                 space -= 2;
 540                 if (T != TBase) {
 541                     *o++ = T;
 542                     space --;
 543                 }
 544                 continue;
 545             }
 546             /* normalization changes */
 547             if (self) {
 548                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
 549                 if (value != 0) {
 550                     stack[stackptr++] = value;
 551                     continue;
 552                 }
 553             }
 554 
 555             /* Other decompositions. */
 556             get_decomp_record(self, code, &index, &prefix, &count);
 557 
 558             /* Copy character if it is not decomposable, or has a
 559                compatibility decomposition, but we do NFD. */
 560             if (!count || (prefix && !k)) {
 561                 *o++ = code;
 562                 space--;
 563                 continue;
 564             }
 565             /* Copy decomposition onto the stack, in reverse
 566                order.  */
 567             while(count) {
 568                 code = decomp_data[index + (--count)];
 569                 stack[stackptr++] = code;
 570             }
 571         }
 572     }
 573 
 574     /* Drop overallocation. Cannot fail. */
 575     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 576 
 577     /* Sort canonically. */
 578     i = PyUnicode_AS_UNICODE(result);
 579     prev = _getrecord_ex(*i)->combining;
 580     end = i + PyUnicode_GET_SIZE(result);
 581     for (i++; i < end; i++) {
 582         cur = _getrecord_ex(*i)->combining;
 583         if (prev == 0 || cur == 0 || prev <= cur) {
 584             prev = cur;
 585             continue;
 586         }
 587         /* Non-canonical order. Need to switch *i with previous. */
 588         o = i - 1;
 589         while (1) {
 590             Py_UNICODE tmp = o[1];
 591             o[1] = o[0];
 592             o[0] = tmp;
 593             o--;
 594             if (o < PyUnicode_AS_UNICODE(result))
 595                 break;
 596             prev = _getrecord_ex(*o)->combining;
 597             if (prev == 0 || prev <= cur)
 598                 break;
 599         }
 600         prev = _getrecord_ex(*i)->combining;
 601     }
 602     return result;
 603 }
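     /* The sorting pass above is the Canonical Ordering step of UAX #15:
        runs of characters with non-zero combining class are reordered
        into non-decreasing combining-class order by bubbling each
        out-of-place character backwards; starters (combining class 0)
        are never moved across. */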
 604 
 605 static int
 606 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 607 {
 608     int index;
 609     for (index = 0; nfc[index].start; index++) {
 610         int start = nfc[index].start;
 611         if (code < start)
 612             return -1;
 613         if (code <= start + nfc[index].count) {
 614             int delta = code - start;
 615             return nfc[index].index + delta;
 616         }
 617     }
 618     return -1;
 619 }
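     /* nfc_first and nfc_last are range-compressed tables (struct reindex
        {start, count, index}) of the characters that can occur as the
        first respectively second element of a canonical composition;
        find_nfc_index() returns a character's position in such a table,
        or -1 if the character cannot participate. */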
 620 
 621 static PyObject*
 622 nfc_nfkc(PyObject *self, PyObject *input, int k)
 623 {
 624     PyObject *result;
 625     Py_UNICODE *i, *i1, *o, *end;
 626     int f,l,index,index1,comb;
 627     Py_UNICODE code;
 628     Py_UNICODE *skipped[20];
 629     int cskipped = 0;
 630 
 631     result = nfd_nfkd(self, input, k);
 632     if (!result)
 633         return NULL;
 634 
 635     /* We are going to modify result in-place.
 636        If nfd_nfkd is changed to sometimes return the input,
 637        this code needs to be reviewed. */
 638     assert(result != input);
 639 
 640     i = PyUnicode_AS_UNICODE(result);
 641     end = i + PyUnicode_GET_SIZE(result);
 642     o = PyUnicode_AS_UNICODE(result);
 643 
 644   again:
 645     while (i < end) {
 646       for (index = 0; index < cskipped; index++) {
 647           if (skipped[index] == i) {
 648               /* *i character is skipped.
 649                  Remove from list. */
 650               skipped[index] = skipped[cskipped-1];
 651               cskipped--;
 652               i++;
 653               goto again; /* continue while */
 654           }
 655       }
 656       /* Hangul Composition. We don't need to check for <LV,T>
 657          pairs, since we always have decomposed data. */
 658       if (LBase <= *i && *i < (LBase+LCount) &&
 659           i + 1 < end &&
 660           VBase <= i[1] && i[1] < (VBase+VCount)) {
 661           int LIndex, VIndex;
 662           LIndex = i[0] - LBase;
 663           VIndex = i[1] - VBase;
 664           code = SBase + (LIndex*VCount+VIndex)*TCount;
 665           i+=2;
 666           if (i < end &&
 667               TBase < *i && *i < (TBase+TCount)) {
 668               code += *i-TBase;
 669               i++;
 670           }
 671           *o++ = code;
 672           continue;
 673       }
 674 
 675       f = find_nfc_index(self, nfc_first, *i);
 676       if (f == -1) {
 677           *o++ = *i++;
 678           continue;
 679       }
 680       /* Find next unblocked character. */
 681       i1 = i+1;
 682       comb = 0;
 683       while (i1 < end) {
 684           int comb1 = _getrecord_ex(*i1)->combining;
 685           if (comb) {
 686               if (comb1 == 0)
 687                   break;
 688               if (comb >= comb1) {
 689                   /* Character is blocked. */
 690                   i1++;
 691                   continue;
 692               }
 693           }
 694           l = find_nfc_index(self, nfc_last, *i1);
 695           /* *i1 cannot be combined with *i. If *i1
 696              is a starter, we don't need to look further.
 697              Otherwise, record the combining class. */
 698           if (l == -1) {
 699             not_combinable:
 700               if (comb1 == 0)
 701                   break;
 702               comb = comb1;
 703               i1++;
 704               continue;
 705           }
 706           index = f*TOTAL_LAST + l;
 707           index1 = comp_index[index >> COMP_SHIFT];
 708           code = comp_data[(index1<<COMP_SHIFT)+
 709                            (index&((1<<COMP_SHIFT)-1))];
 710           if (code == 0)
 711               goto not_combinable;
 712 
 713           /* Replace the original character. */
 714           *i = code;
 715           /* Mark the second character unused. */
 716           assert(cskipped < 20);
 717           skipped[cskipped++] = i1;
 718           i1++;
 719           f = find_nfc_index(self, nfc_first, *i);
 720           if (f == -1)
 721               break;
 722       }
 723       *o++ = *i++;
 724     }
 725     if (o != end)
 726         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 727     return result;
 728 }
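     /* The composition pass walks the NFD/NFKD result from left to right:
        Hangul <L,V[,T]> sequences are recombined arithmetically, and for
        every other starter each following unblocked character is looked
        up in the generated composition table (comp_index/comp_data, keyed
        by f*TOTAL_LAST + l).  Characters that were composed away are
        recorded in skipped[] and dropped when the main loop reaches
        them. */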
 729 
 730 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
 731 static int
 732 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 733 {
 734     Py_UNICODE *i, *end;
 735     unsigned char prev_combining = 0, quickcheck_mask;
 736 
 737     /* If an older version of the database is requested, the
 738        quickchecks must be disabled. */
 739     if (self != NULL)
 740         return 0;
 741 
 742     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
 743        as described in http://unicode.org/reports/tr15/#Annex8. */
 744     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
 745 
 746     i = PyUnicode_AS_UNICODE(input);
 747     end = i + PyUnicode_GET_SIZE(input);
 748     while (i < end) {
 749         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
 750         unsigned char combining = record->combining;
 751         unsigned char quickcheck = record->normalization_quick_check;
 752 
 753         if (quickcheck & quickcheck_mask)
 754             return 0; /* this string might need normalization */
 755         if (combining && prev_combining > combining)
 756             return 0; /* non-canonical sort order, not normalized */
 757         prev_combining = combining;
 758     }
 759     return 1; /* certainly normalized */
 760 }
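     /* normalization_quick_check packs four 2-bit Quick_Check values into
        one byte: bits 0-1 for NFD, 2-3 for NFKD, 4-5 for NFC and 6-7 for
        NFKC (0 = Yes, 1 = Maybe, 2 = No), which is why the mask above is
        3 shifted by (nfc ? 4 : 0) + (k ? 2 : 0). */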
 761 
 762 PyDoc_STRVAR(unicodedata_normalize__doc__,
 763 "normalize(form, unistr)\n\
 764 \n\
 765 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
 766 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 767 
 768 static PyObject*
 769 unicodedata_normalize(PyObject *self, PyObject *args)
 770 {
 771     char *form;
 772     PyObject *input;
 773 
 774     if(!PyArg_ParseTuple(args, "sO!:normalize",
 775                          &form, &PyUnicode_Type, &input))
 776         return NULL;
 777 
 778     if (PyUnicode_GetSize(input) == 0) {
 779         /* Special case empty input strings, since resizing
 780            them  later would cause internal errors. */
 781         Py_INCREF(input);
 782         return input;
 783     }
 784 
 785     if (strcmp(form, "NFC") == 0) {
 786         if (is_normalized(self, input, 1, 0)) {
 787             Py_INCREF(input);
 788             return input;
 789         }
 790         return nfc_nfkc(self, input, 0);
 791     }
 792     if (strcmp(form, "NFKC") == 0) {
 793         if (is_normalized(self, input, 1, 1)) {
 794             Py_INCREF(input);
 795             return input;
 796         }
 797         return nfc_nfkc(self, input, 1);
 798     }
 799     if (strcmp(form, "NFD") == 0) {
 800         if (is_normalized(self, input, 0, 0)) {
 801             Py_INCREF(input);
 802             return input;
 803         }
 804         return nfd_nfkd(self, input, 0);
 805     }
 806     if (strcmp(form, "NFKD") == 0) {
 807         if (is_normalized(self, input, 0, 1)) {
 808             Py_INCREF(input);
 809             return input;
 810         }
 811         return nfd_nfkd(self, input, 1);
 812     }
 813     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 814     return NULL;
 815 }
 816 
 817 /* -------------------------------------------------------------------- */
 818 /* unicode character name tables */
 819 
 820 /* data file generated by Tools/unicode/makeunicodedata.py */
 821 #include "unicodename_db.h"
 822 
 823 /* -------------------------------------------------------------------- */
 824 /* database code (cut and pasted from the unidb package) */
 825 
 826 static unsigned long
 827 _gethash(const char *s, int len, int scale)
 828 {
 829     int i;
 830     unsigned long h = 0;
 831     unsigned long ix;
 832     for (i = 0; i < len; i++) {
 833         h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
 834         ix = h & 0xff000000;
 835         if (ix)
 836             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 837     }
 838     return h;
 839 }
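     /* _gethash() is a case-insensitive multiplicative string hash kept
        within 24 bits: each character is uppercased and mixed in with the
        given scale factor, and whenever a bit above bit 23 gets set the
        overflowing byte is folded back in with an xor.  It has to stay in
        sync with the hash used to build the generated code_hash table
        probed in _getcode() below. */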
 840 
 841 static char *hangul_syllables[][3] = {
 842     { "G",  "A",   ""   },
 843     { "GG", "AE",  "G"  },
 844     { "N",  "YA",  "GG" },
 845     { "D",  "YAE", "GS" },
 846     { "DD", "EO",  "N", },
 847     { "R",  "E",   "NJ" },
 848     { "M",  "YEO", "NH" },
 849     { "B",  "YE",  "D"  },
 850     { "BB", "O",   "L"  },
 851     { "S",  "WA",  "LG" },
 852     { "SS", "WAE", "LM" },
 853     { "",   "OE",  "LB" },
 854     { "J",  "YO",  "LS" },
 855     { "JJ", "U",   "LT" },
 856     { "C",  "WEO", "LP" },
 857     { "K",  "WE",  "LH" },
 858     { "T",  "WI",  "M"  },
 859     { "P",  "YU",  "B"  },
 860     { "H",  "EU",  "BS" },
 861     { 0,    "YI",  "S"  },
 862     { 0,    "I",   "SS" },
 863     { 0,    0,     "NG" },
 864     { 0,    0,     "J"  },
 865     { 0,    0,     "C"  },
 866     { 0,    0,     "K"  },
 867     { 0,    0,     "T"  },
 868     { 0,    0,     "P"  },
 869     { 0,    0,     "H"  }
 870 };
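     /* hangul_syllables[] holds the jamo short names used in character
        names: column 0 is the leading consonant (choseong), column 1 the
        vowel (jungseong) and column 2 the trailing consonant (jongseong,
        with an empty first entry meaning "no trailing consonant").
        _getucname() concatenates one entry per column to build
        "HANGUL SYLLABLE ..." names and find_syllable() parses them
        back. */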
 871 
 872 static int
 873 is_unified_ideograph(Py_UCS4 code)
 874 {
 875     return (
 876         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 877         (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
 878         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
 879         (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
 880 }
 881 
 882 static int
 883 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 884 {
 885     int offset;
 886     int i;
 887     int word;
 888     unsigned char* w;
 889 
 890     if (code >= 0x110000)
 891         return 0;
 892 
 893     if (self) {
 894         const change_record *old = get_old_record(self, code);
 895         if (old->category_changed == 0) {
 896             /* unassigned */
 897             return 0;
 898         }
 899     }
 900 
 901     if (SBase <= code && code < SBase+SCount) {
 902         /* Hangul syllable. */
 903         int SIndex = code - SBase;
 904         int L = SIndex / NCount;
 905         int V = (SIndex % NCount) / TCount;
 906         int T = SIndex % TCount;
 907 
 908         if (buflen < 27)
 909             /* Worst case: HANGUL SYLLABLE <10chars>. */
 910             return 0;
 911         strcpy(buffer, "HANGUL SYLLABLE ");
 912         buffer += 16;
 913         strcpy(buffer, hangul_syllables[L][0]);
 914         buffer += strlen(hangul_syllables[L][0]);
 915         strcpy(buffer, hangul_syllables[V][1]);
 916         buffer += strlen(hangul_syllables[V][1]);
 917         strcpy(buffer, hangul_syllables[T][2]);
 918         buffer += strlen(hangul_syllables[T][2]);
 919         *buffer = '\0';
 920         return 1;
 921     }
 922 
 923     if (is_unified_ideograph(code)) {
 924         if (buflen < 28)
 925             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 926             return 0;
 927         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 928         return 1;
 929     }
 930 
 931     /* get offset into phrasebook */
 932     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 933     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 934                                (code&((1<<phrasebook_shift)-1))];
 935     if (!offset)
 936         return 0;
 937 
 938     i = 0;
 939 
 940     for (;;) {
 941         /* get word index */
 942         word = phrasebook[offset] - phrasebook_short;
 943         if (word >= 0) {
 944             word = (word << 8) + phrasebook[offset+1];
 945             offset += 2;
 946         } else
 947             word = phrasebook[offset++];
 948         if (i) {
 949             if (i > buflen)
 950                 return 0; /* buffer overflow */
 951             buffer[i++] = ' ';
 952         }
 953         /* copy word string from lexicon.  the last character in the
 954            word has bit 7 set.  the last word in a string ends with
 955            0x80 */
 956         w = lexicon + lexicon_offset[word];
 957         while (*w < 128) {
 958             if (i >= buflen)
 959                 return 0; /* buffer overflow */
 960             buffer[i++] = *w++;
 961         }
 962         if (i >= buflen)
 963             return 0; /* buffer overflow */
 964         buffer[i++] = *w & 127;
 965         if (*w == 128)
 966             break; /* end of word */
 967     }
 968 
 969     return 1;
 970 }
 971 
 972 static int
 973 _cmpname(PyObject *self, int code, const char* name, int namelen)
 974 {
 975     /* check if code corresponds to the given name */
 976     int i;
 977     char buffer[NAME_MAXLEN];
 978     if (!_getucname(self, code, buffer, sizeof(buffer)))
 979         return 0;
 980     for (i = 0; i < namelen; i++) {
 981         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
 982             return 0;
 983     }
 984     return buffer[namelen] == '\0';
 985 }
 986 
 987 static void
 988 find_syllable(const char *str, int *len, int *pos, int count, int column)
 989 {
 990     int i, len1;
 991     *len = -1;
 992     for (i = 0; i < count; i++) {
 993         char *s = hangul_syllables[i][column];
 994         len1 = strlen(s);
 995         if (len1 <= *len)
 996             continue;
 997         if (strncmp(str, s, len1) == 0) {
 998             *len = len1;
 999             *pos = i;
1000         }
1001     }
1002     if (*len == -1) {
1003         *len = 0;
1004     }
1005 }
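     /* find_syllable() performs a longest-prefix match of str against one
        column of hangul_syllables[], so that e.g. "GG" wins over "G";
        *len is set to the length of the match (0 if nothing matched) and
        *pos to the matching row. */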
1006 
1007 static int
1008 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1009 {
1010     unsigned int h, v;
1011     unsigned int mask = code_size-1;
1012     unsigned int i, incr;
1013 
1014     /* Check for hangul syllables. */
1015     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1016         int len, L = -1, V = -1, T = -1;
1017         const char *pos = name + 16;
1018         find_syllable(pos, &len, &L, LCount, 0);
1019         pos += len;
1020         find_syllable(pos, &len, &V, VCount, 1);
1021         pos += len;
1022         find_syllable(pos, &len, &T, TCount, 2);
1023         pos += len;
1024         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1025             *code = SBase + (L*VCount+V)*TCount + T;
1026             return 1;
1027         }
1028         /* Otherwise, it's an illegal syllable name. */
1029         return 0;
1030     }
1031 
1032     /* Check for unified ideographs. */
1033     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1034         /* Four or five hexdigits must follow. */
1035         v = 0;
1036         name += 22;
1037         namelen -= 22;
1038         if (namelen != 4 && namelen != 5)
1039             return 0;
1040         while (namelen--) {
1041             v *= 16;
1042             if (*name >= '0' && *name <= '9')
1043                 v += *name - '0';
1044             else if (*name >= 'A' && *name <= 'F')
1045                 v += *name - 'A' + 10;
1046             else
1047                 return 0;
1048             name++;
1049         }
1050         if (!is_unified_ideograph(v))
1051             return 0;
1052         *code = v;
1053         return 1;
1054     }
1055 
1056     /* the following is the same as python's dictionary lookup, with
1057        only minor changes.  see the makeunicodedata script for more
1058        details */
1059 
1060     h = (unsigned int) _gethash(name, namelen, code_magic);
1061     i = (~h) & mask;
1062     v = code_hash[i];
1063     if (!v)
1064         return 0;
1065     if (_cmpname(self, v, name, namelen)) {
1066         *code = v;
1067         return 1;
1068     }
1069     incr = (h ^ (h >> 3)) & mask;
1070     if (!incr)
1071         incr = mask;
1072     for (;;) {
1073         i = (i + incr) & mask;
1074         v = code_hash[i];
1075         if (!v)
1076             return 0;
1077         if (_cmpname(self, v, name, namelen)) {
1078             *code = v;
1079             return 1;
1080         }
1081         incr = incr << 1;
1082         if (incr > mask)
1083             incr = incr ^ code_poly;
1084     }
1085 }
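     /* The probing above is open addressing over the generated code_hash
        table: the first slot is (~h) & mask, the probe increment is
        derived from h (falling back to mask if it would be zero), and on
        every collision the increment is shifted left and reduced with
        code_poly when it overflows mask, mirroring the hashing scheme
        used by Tools/unicode/makeunicodedata.py. */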
1086 
1087 static const _PyUnicode_Name_CAPI hashAPI =
1088 {
1089     sizeof(_PyUnicode_Name_CAPI),
1090     _getucname,
1091     _getcode
1092 };
1093 
1094 /* -------------------------------------------------------------------- */
1095 /* Python bindings */
1096 
1097 PyDoc_STRVAR(unicodedata_name__doc__,
1098 "name(unichr[, default])\n\
1099 Returns the name assigned to the Unicode character unichr as a\n\
1100 string. If no name is defined, default is returned, or, if not\n\
1101 given, ValueError is raised.");
1102 
1103 static PyObject *
1104 unicodedata_name(PyObject* self, PyObject* args)
1105 {
1106     char name[NAME_MAXLEN];
1107     Py_UCS4 c;
1108 
1109     PyUnicodeObject* v;
1110     PyObject* defobj = NULL;
1111     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1112         return NULL;
1113 
1114     c = getuchar(v);
1115     if (c == (Py_UCS4)-1)
1116         return NULL;
1117 
1118     if (!_getucname(self, c, name, sizeof(name))) {
1119         if (defobj == NULL) {
1120             PyErr_SetString(PyExc_ValueError, "no such name");
1121             return NULL;
1122         }
1123         else {
1124             Py_INCREF(defobj);
1125             return defobj;
1126         }
1127     }
1128 
1129     return Py_BuildValue("s", name);
1130 }
1131 
1132 PyDoc_STRVAR(unicodedata_lookup__doc__,
1133 "lookup(name)\n\
1134 \n\
1135 Look up character by name.  If a character with the\n\
1136 given name is found, return the corresponding Unicode\n\
1137 character.  If not found, KeyError is raised.");
1138 
1139 static PyObject *
1140 unicodedata_lookup(PyObject* self, PyObject* args)
1141 {
1142     Py_UCS4 code;
1143     Py_UNICODE str[2];
1144 
1145     char* name;
1146     int namelen;
1147     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1148         return NULL;
1149 
1150     if (!_getcode(self, name, namelen, &code)) {
1151         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1152                      name);
1153         return NULL;
1154     }
1155 
1156 #ifndef Py_UNICODE_WIDE
1157     if (code >= 0x10000) {
1158         str[0] = 0xd800 + ((code - 0x10000) >> 10);
1159         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1160         return PyUnicode_FromUnicode(str, 2);
1161     }
1162 #endif
1163     str[0] = (Py_UNICODE) code;
1164     return PyUnicode_FromUnicode(str, 1);
1165 }
1166 
1167 /* XXX Add doc strings. */
1168 
1169 static PyMethodDef unicodedata_functions[] = {
1170     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1171     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1172     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1173     {"category", unicodedata_category, METH_VARARGS,
1174                  unicodedata_category__doc__},
1175     {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1176                       unicodedata_bidirectional__doc__},
1177     {"combining", unicodedata_combining, METH_VARARGS,
1178                   unicodedata_combining__doc__},
1179     {"mirrored", unicodedata_mirrored, METH_VARARGS,
1180                  unicodedata_mirrored__doc__},
1181     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1182                          unicodedata_east_asian_width__doc__},
1183     {"decomposition", unicodedata_decomposition, METH_VARARGS,
1184                       unicodedata_decomposition__doc__},
1185     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1186     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1187     {"normalize", unicodedata_normalize, METH_VARARGS,
1188                   unicodedata_normalize__doc__},
1189     {NULL, NULL}                /* sentinel */
1190 };
1191 
1192 static PyTypeObject UCD_Type = {
1193         /* The ob_type field must be initialized in the module init function
1194          * to be portable to Windows without using C++. */
1195         PyVarObject_HEAD_INIT(NULL, 0)
1196         "unicodedata.UCD",              /*tp_name*/
1197         sizeof(PreviousDBVersion),      /*tp_basicsize*/
1198         0,                      /*tp_itemsize*/
1199         /* methods */
1200         (destructor)PyObject_Del, /*tp_dealloc*/
1201         0,                      /*tp_print*/
1202         0,                      /*tp_getattr*/
1203         0,                      /*tp_setattr*/
1204         0,                      /*tp_compare*/
1205         0,                      /*tp_repr*/
1206         0,                      /*tp_as_number*/
1207         0,                      /*tp_as_sequence*/
1208         0,                      /*tp_as_mapping*/
1209         0,                      /*tp_hash*/
1210         0,                      /*tp_call*/
1211         0,                      /*tp_str*/
1212         PyObject_GenericGetAttr,/*tp_getattro*/
1213         0,                      /*tp_setattro*/
1214         0,                      /*tp_as_buffer*/
1215         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
1216         0,                      /*tp_doc*/
1217         0,                      /*tp_traverse*/
1218         0,                      /*tp_clear*/
1219         0,                      /*tp_richcompare*/
1220         0,                      /*tp_weaklistoffset*/
1221         0,                      /*tp_iter*/
1222         0,                      /*tp_iternext*/
1223         unicodedata_functions,  /*tp_methods*/
1224         DB_members,             /*tp_members*/
1225         0,                      /*tp_getset*/
1226         0,                      /*tp_base*/
1227         0,                      /*tp_dict*/
1228         0,                      /*tp_descr_get*/
1229         0,                      /*tp_descr_set*/
1230         0,                      /*tp_dictoffset*/
1231         0,                      /*tp_init*/
1232         0,                      /*tp_alloc*/
1233         0,                      /*tp_new*/
1234         0,                      /*tp_free*/
1235         0,                      /*tp_is_gc*/
1236 };
1237 
1238 PyDoc_STRVAR(unicodedata_docstring,
1239 "This module provides access to the Unicode Character Database which\n\
1240 defines character properties for all Unicode characters. The data in\n\
1241 this database is based on the UnicodeData.txt file version\n\
1242 5.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
1243 \n\
1244 The module uses the same names and symbols as defined by the\n\
1245 UnicodeData File Format 5.2.0 (see\n\
1246 http://www.unicode.org/reports/tr44/tr44-4.html).");
1247 
1248 PyMODINIT_FUNC
1249 initunicodedata(void)
1250 {
1251     PyObject *m, *v;
1252 
1253     Py_TYPE(&UCD_Type) = &PyType_Type;
1254 
1255     m = Py_InitModule3(
1256         "unicodedata", unicodedata_functions, unicodedata_docstring);
1257     if (!m)
1258         return;
1259 
1260     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1261     Py_INCREF(&UCD_Type);
1262     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1263 
1264     /* Previous versions */
1265     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1266     if (v != NULL)
1267         PyModule_AddObject(m, "ucd_3_2_0", v);
1268 
1269     /* Export C API */
1270     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1271     if (v != NULL)
1272         PyModule_AddObject(m, "ucnhash_CAPI", v);
1273 }
1274 
1275 /*
1276 Local variables:
1277 c-basic-offset: 4
1278 indent-tabs-mode: nil
1279 End:
1280 */