Python-2.7.3/Parser/tokenizer.c

Location: /builddir/build/BUILD/Python-2.7.3/Parser/tokenizer.c:955:21
Tool:     clang-analyzer
Test ID:
Function:
Issue:    Potential leak of memory pointed to by 'newbuf'
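
The flagged location falls inside tok_nextc(), in the loop that grows tok->buf with PyMem_REALLOC while reading a line from a file (listing lines 943-955); the analyzer's message is reproduced inline at that point in the listing below.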
   1 /* Tokenizer implementation */
   2 
   3 #include "Python.h"
   4 #include "pgenheaders.h"
   5 
   6 #include <ctype.h>
   7 #include <assert.h>
   8 
   9 #include "tokenizer.h"
  10 #include "errcode.h"
  11 
  12 #ifndef PGEN
  13 #include "unicodeobject.h"
  14 #include "stringobject.h"
  15 #include "fileobject.h"
  16 #include "codecs.h"
  17 #include "abstract.h"
  18 #include "pydebug.h"
  19 #endif /* PGEN */
  20 
  21 extern char *PyOS_Readline(FILE *, FILE *, char *);
  22 /* Return malloc'ed string including trailing \n;
  23    empty malloc'ed string for EOF;
  24    NULL if interrupted */
  25 
  26 /* Don't ever change this -- it would break the portability of Python code */
  27 #define TABSIZE 8
  28 
  29 /* Forward */
  30 static struct tok_state *tok_new(void);
  31 static int tok_nextc(struct tok_state *tok);
  32 static void tok_backup(struct tok_state *tok, int c);
  33 
  34 /* Token names */
  35 
  36 char *_PyParser_TokenNames[] = {
  37     "ENDMARKER",
  38     "NAME",
  39     "NUMBER",
  40     "STRING",
  41     "NEWLINE",
  42     "INDENT",
  43     "DEDENT",
  44     "LPAR",
  45     "RPAR",
  46     "LSQB",
  47     "RSQB",
  48     "COLON",
  49     "COMMA",
  50     "SEMI",
  51     "PLUS",
  52     "MINUS",
  53     "STAR",
  54     "SLASH",
  55     "VBAR",
  56     "AMPER",
  57     "LESS",
  58     "GREATER",
  59     "EQUAL",
  60     "DOT",
  61     "PERCENT",
  62     "BACKQUOTE",
  63     "LBRACE",
  64     "RBRACE",
  65     "EQEQUAL",
  66     "NOTEQUAL",
  67     "LESSEQUAL",
  68     "GREATEREQUAL",
  69     "TILDE",
  70     "CIRCUMFLEX",
  71     "LEFTSHIFT",
  72     "RIGHTSHIFT",
  73     "DOUBLESTAR",
  74     "PLUSEQUAL",
  75     "MINEQUAL",
  76     "STAREQUAL",
  77     "SLASHEQUAL",
  78     "PERCENTEQUAL",
  79     "AMPEREQUAL",
  80     "VBAREQUAL",
  81     "CIRCUMFLEXEQUAL",
  82     "LEFTSHIFTEQUAL",
  83     "RIGHTSHIFTEQUAL",
  84     "DOUBLESTAREQUAL",
  85     "DOUBLESLASH",
  86     "DOUBLESLASHEQUAL",
  87     "AT",
  88     /* This table must match the #defines in token.h! */
  89     "OP",
  90     "<ERRORTOKEN>",
  91     "<N_TOKENS>"
  92 };
  93 
  94 /* Create and initialize a new tok_state structure */
  95 
  96 static struct tok_state *
  97 tok_new(void)
  98 {
  99     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 100                                             sizeof(struct tok_state));
 101     if (tok == NULL)
 102         return NULL;
 103     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 104     tok->done = E_OK;
 105     tok->fp = NULL;
 106     tok->input = NULL;
 107     tok->tabsize = TABSIZE;
 108     tok->indent = 0;
 109     tok->indstack[0] = 0;
 110     tok->atbol = 1;
 111     tok->pendin = 0;
 112     tok->prompt = tok->nextprompt = NULL;
 113     tok->lineno = 0;
 114     tok->level = 0;
 115     tok->filename = NULL;
 116     tok->altwarning = 0;
 117     tok->alterror = 0;
 118     tok->alttabsize = 1;
 119     tok->altindstack[0] = 0;
 120     tok->decoding_state = 0;
 121     tok->decoding_erred = 0;
 122     tok->read_coding_spec = 0;
 123     tok->encoding = NULL;
 124     tok->cont_line = 0;
 125 #ifndef PGEN
 126     tok->decoding_readline = NULL;
 127     tok->decoding_buffer = NULL;
 128 #endif
 129     return tok;
 130 }
 131 
 132 static char *
 133 new_string(const char *s, Py_ssize_t len)
 134 {
 135     char* result = (char *)PyMem_MALLOC(len + 1);
 136     if (result != NULL) {
 137         memcpy(result, s, len);
 138         result[len] = '\0';
 139     }
 140     return result;
 141 }
 142 
 143 #ifdef PGEN
 144 
 145 static char *
 146 decoding_fgets(char *s, int size, struct tok_state *tok)
 147 {
 148     return fgets(s, size, tok->fp);
 149 }
 150 
 151 static int
 152 decoding_feof(struct tok_state *tok)
 153 {
 154     return feof(tok->fp);
 155 }
 156 
 157 static char *
 158 decode_str(const char *str, int exec_input, struct tok_state *tok)
 159 {
 160     return new_string(str, strlen(str));
 161 }
 162 
 163 #else /* PGEN */
 164 
 165 static char *
 166 error_ret(struct tok_state *tok) /* XXX */
 167 {
 168     tok->decoding_erred = 1;
 169     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 170         PyMem_FREE(tok->buf);
 171     tok->buf = NULL;
 172     return NULL;                /* as if it were EOF */
 173 }
 174 
 175 
 176 static char *
 177 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 178 {
 179     char buf[13];
 180     int i;
 181     for (i = 0; i < 12; i++) {
 182         int c = s[i];
 183         if (c == '\0')
 184             break;
 185         else if (c == '_')
 186             buf[i] = '-';
 187         else
 188             buf[i] = tolower(c);
 189     }
 190     buf[i] = '\0';
 191     if (strcmp(buf, "utf-8") == 0 ||
 192         strncmp(buf, "utf-8-", 6) == 0)
 193         return "utf-8";
 194     else if (strcmp(buf, "latin-1") == 0 ||
 195              strcmp(buf, "iso-8859-1") == 0 ||
 196              strcmp(buf, "iso-latin-1") == 0 ||
 197              strncmp(buf, "latin-1-", 8) == 0 ||
 198              strncmp(buf, "iso-8859-1-", 11) == 0 ||
 199              strncmp(buf, "iso-latin-1-", 12) == 0)
 200         return "iso-8859-1";
 201     else
 202         return s;
 203 }
 204 
 205 /* Return the coding spec in S, or NULL if none is found.  */
 206 
 207 static char *
 208 get_coding_spec(const char *s, Py_ssize_t size)
 209 {
 210     Py_ssize_t i;
 211     /* Coding spec must be in a comment, and that comment must be
 212      * the only statement on the source code line. */
 213     for (i = 0; i < size - 6; i++) {
 214         if (s[i] == '#')
 215             break;
 216         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 217             return NULL;
 218     }
 219     for (; i < size - 6; i++) { /* XXX inefficient search */
 220         const char* t = s + i;
 221         if (strncmp(t, "coding", 6) == 0) {
 222             const char* begin = NULL;
 223             t += 6;
 224             if (t[0] != ':' && t[0] != '=')
 225                 continue;
 226             do {
 227                 t++;
 228             } while (t[0] == '\x20' || t[0] == '\t');
 229 
 230             begin = t;
 231             while (Py_ISALNUM(t[0]) ||
 232                    t[0] == '-' || t[0] == '_' || t[0] == '.')
 233                 t++;
 234 
 235             if (begin < t) {
 236                 char* r = new_string(begin, t - begin);
 237                 char* q = get_normal_name(r);
 238                 if (r != q) {
 239                     PyMem_FREE(r);
 240                     r = new_string(q, strlen(q));
 241                 }
 242                 return r;
 243             }
 244         }
 245     }
 246     return NULL;
 247 }
 248 
 249 /* Check whether the line contains a coding spec. If it does,
 250    invoke the set_readline function for the new encoding.
 251    This function receives the tok_state and the new encoding.
 252    Return 1 on success, 0 on failure.  */
 253 
 254 static int
 255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 256                   int set_readline(struct tok_state *, const char *))
 257 {
 258     char * cs;
 259     int r = 1;
 260 
 261     if (tok->cont_line)
 262         /* It's a continuation line, so it can't be a coding spec. */
 263         return 1;
 264     cs = get_coding_spec(line, size);
 265     if (cs != NULL) {
 266         tok->read_coding_spec = 1;
 267         if (tok->encoding == NULL) {
 268             assert(tok->decoding_state == 1); /* raw */
 269             if (strcmp(cs, "utf-8") == 0 ||
 270                 strcmp(cs, "iso-8859-1") == 0) {
 271                 tok->encoding = cs;
 272             } else {
 273 #ifdef Py_USING_UNICODE
 274                 r = set_readline(tok, cs);
 275                 if (r) {
 276                     tok->encoding = cs;
 277                     tok->decoding_state = -1;
 278                 }
 279                 else
 280                     PyMem_FREE(cs);
 281 #else
 282                 /* Without Unicode support, we cannot
 283                    process the coding spec. Since there
 284                    won't be any Unicode literals, that
 285                    won't matter. */
 286                 PyMem_FREE(cs);
 287 #endif
 288             }
 289         } else {                /* then, compare cs with BOM */
 290             r = (strcmp(tok->encoding, cs) == 0);
 291             PyMem_FREE(cs);
 292         }
 293     }
 294     if (!r) {
 295         cs = tok->encoding;
 296         if (!cs)
 297             cs = "with BOM";
 298         PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 299     }
 300     return r;
 301 }
 302 
 303 /* See whether the file starts with a BOM. If it does,
 304    invoke the set_readline function with the new encoding.
 305    Return 1 on success, 0 on failure.  */
 306 
 307 static int
 308 check_bom(int get_char(struct tok_state *),
 309           void unget_char(int, struct tok_state *),
 310           int set_readline(struct tok_state *, const char *),
 311           struct tok_state *tok)
 312 {
 313     int ch1, ch2, ch3;
 314     ch1 = get_char(tok);
 315     tok->decoding_state = 1;
 316     if (ch1 == EOF) {
 317         return 1;
 318     } else if (ch1 == 0xEF) {
 319         ch2 = get_char(tok);
 320         if (ch2 != 0xBB) {
 321             unget_char(ch2, tok);
 322             unget_char(ch1, tok);
 323             return 1;
 324         }
 325         ch3 = get_char(tok);
 326         if (ch3 != 0xBF) {
 327             unget_char(ch3, tok);
 328             unget_char(ch2, tok);
 329             unget_char(ch1, tok);
 330             return 1;
 331         }
 332 #if 0
 333     /* Disable support for UTF-16 BOMs until a decision
 334        is made whether this needs to be supported.  */
 335     } else if (ch1 == 0xFE) {
 336         ch2 = get_char(tok);
 337         if (ch2 != 0xFF) {
 338             unget_char(ch2, tok);
 339             unget_char(ch1, tok);
 340             return 1;
 341         }
 342         if (!set_readline(tok, "utf-16-be"))
 343             return 0;
 344         tok->decoding_state = -1;
 345     } else if (ch1 == 0xFF) {
 346         ch2 = get_char(tok);
 347         if (ch2 != 0xFE) {
 348             unget_char(ch2, tok);
 349             unget_char(ch1, tok);
 350             return 1;
 351         }
 352         if (!set_readline(tok, "utf-16-le"))
 353             return 0;
 354         tok->decoding_state = -1;
 355 #endif
 356     } else {
 357         unget_char(ch1, tok);
 358         return 1;
 359     }
 360     if (tok->encoding != NULL)
 361         PyMem_FREE(tok->encoding);
 362     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
 363     return 1;
 364 }
 365 
 366 /* Read a line of text from TOK into S, using the stream in TOK.
 367    Return NULL on failure, else S.
 368 
 369    On entry, tok->decoding_buffer will be one of:
 370      1) NULL: need to call tok->decoding_readline to get a new line
 371      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 372        stored the result in tok->decoding_buffer
 373      3) PyStringObject *: previous call to fp_readl did not have enough room
 374        (in the s buffer) to copy entire contents of the line read
 375        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 376        In this case, fp_readl is called in a loop (with an expanded buffer)
 377        until the buffer ends with a '\n' (or until the end of the file is
 378        reached): see tok_nextc and its calls to decoding_fgets.
 379 */
 380 
 381 static char *
 382 fp_readl(char *s, int size, struct tok_state *tok)
 383 {
 384 #ifndef Py_USING_UNICODE
  385     /* In a non-Unicode build, this should never be called. */
 386     Py_FatalError("fp_readl should not be called in this build.");
 387     return NULL; /* Keep compiler happy (not reachable) */
 388 #else
 389     PyObject* utf8 = NULL;
 390     PyObject* buf = tok->decoding_buffer;
 391     char *str;
 392     Py_ssize_t utf8len;
 393 
 394     /* Ask for one less byte so we can terminate it */
 395     assert(size > 0);
 396     size--;
 397 
 398     if (buf == NULL) {
 399         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 400         if (buf == NULL)
 401             return error_ret(tok);
 402     } else {
 403         tok->decoding_buffer = NULL;
 404         if (PyString_CheckExact(buf))
 405             utf8 = buf;
 406     }
 407     if (utf8 == NULL) {
 408         utf8 = PyUnicode_AsUTF8String(buf);
 409         Py_DECREF(buf);
 410         if (utf8 == NULL)
 411             return error_ret(tok);
 412     }
 413     str = PyString_AsString(utf8);
 414     utf8len = PyString_GET_SIZE(utf8);
 415     if (utf8len > size) {
 416         tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 417         if (tok->decoding_buffer == NULL) {
 418             Py_DECREF(utf8);
 419             return error_ret(tok);
 420         }
 421         utf8len = size;
 422     }
 423     memcpy(s, str, utf8len);
 424     s[utf8len] = '\0';
 425     Py_DECREF(utf8);
 426     if (utf8len == 0)
 427         return NULL; /* EOF */
 428     return s;
 429 #endif
 430 }
 431 
 432 /* Set the readline function for TOK to a StreamReader's
 433    readline function. The StreamReader is named ENC.
 434 
 435    This function is called from check_bom and check_coding_spec.
 436 
 437    ENC is usually identical to the future value of tok->encoding,
 438    except for the (currently unsupported) case of UTF-16.
 439 
 440    Return 1 on success, 0 on failure. */
 441 
 442 static int
 443 fp_setreadl(struct tok_state *tok, const char* enc)
 444 {
 445     PyObject *reader, *stream, *readline;
 446 
 447     /* XXX: constify filename argument. */
 448     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 449     if (stream == NULL)
 450         return 0;
 451 
 452     reader = PyCodec_StreamReader(enc, stream, NULL);
 453     Py_DECREF(stream);
 454     if (reader == NULL)
 455         return 0;
 456 
 457     readline = PyObject_GetAttrString(reader, "readline");
 458     Py_DECREF(reader);
 459     if (readline == NULL)
 460         return 0;
 461 
 462     tok->decoding_readline = readline;
 463     return 1;
 464 }
 465 
 466 /* Fetch the next byte from TOK. */
 467 
 468 static int fp_getc(struct tok_state *tok) {
 469     return getc(tok->fp);
 470 }
 471 
 472 /* Unfetch the last byte back into TOK.  */
 473 
 474 static void fp_ungetc(int c, struct tok_state *tok) {
 475     ungetc(c, tok->fp);
 476 }
 477 
 478 /* Read a line of input from TOK. Determine encoding
 479    if necessary.  */
 480 
 481 static char *
 482 decoding_fgets(char *s, int size, struct tok_state *tok)
 483 {
 484     char *line = NULL;
 485     int badchar = 0;
 486     for (;;) {
 487         if (tok->decoding_state < 0) {
 488             /* We already have a codec associated with
 489                this input. */
 490             line = fp_readl(s, size, tok);
 491             break;
 492         } else if (tok->decoding_state > 0) {
 493             /* We want a 'raw' read. */
 494             line = Py_UniversalNewlineFgets(s, size,
 495                                             tok->fp, NULL);
 496             break;
 497         } else {
 498             /* We have not yet determined the encoding.
 499                If an encoding is found, use the file-pointer
 500                reader functions from now on. */
 501             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 502                 return error_ret(tok);
 503             assert(tok->decoding_state != 0);
 504         }
 505     }
 506     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 507         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 508             return error_ret(tok);
 509         }
 510     }
 511 #ifndef PGEN
 512     /* The default encoding is ASCII, so make sure we don't have any
 513        non-ASCII bytes in it. */
 514     if (line && !tok->encoding) {
 515         unsigned char *c;
 516         for (c = (unsigned char *)line; *c; c++)
 517             if (*c > 127) {
 518                 badchar = *c;
 519                 break;
 520             }
 521     }
 522     if (badchar) {
 523         char buf[500];
 524         /* Need to add 1 to the line number, since this line
 525            has not been counted, yet.  */
 526         sprintf(buf,
 527             "Non-ASCII character '\\x%.2x' "
 528             "in file %.200s on line %i, "
 529             "but no encoding declared; "
 530             "see http://www.python.org/peps/pep-0263.html for details",
 531             badchar, tok->filename, tok->lineno + 1);
 532         PyErr_SetString(PyExc_SyntaxError, buf);
 533         return error_ret(tok);
 534     }
 535 #endif
 536     return line;
 537 }
 538 
 539 static int
 540 decoding_feof(struct tok_state *tok)
 541 {
 542     if (tok->decoding_state >= 0) {
 543         return feof(tok->fp);
 544     } else {
 545         PyObject* buf = tok->decoding_buffer;
 546         if (buf == NULL) {
 547             buf = PyObject_CallObject(tok->decoding_readline, NULL);
 548             if (buf == NULL) {
 549                 error_ret(tok);
 550                 return 1;
 551             } else {
 552                 tok->decoding_buffer = buf;
 553             }
 554         }
 555         return PyObject_Length(buf) == 0;
 556     }
 557 }
 558 
 559 /* Fetch a byte from TOK, using the string buffer. */
 560 
 561 static int
 562 buf_getc(struct tok_state *tok) {
 563     return Py_CHARMASK(*tok->str++);
 564 }
 565 
 566 /* Unfetch a byte from TOK, using the string buffer. */
 567 
 568 static void
 569 buf_ungetc(int c, struct tok_state *tok) {
 570     tok->str--;
 571     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
 572 }
 573 
 574 /* Set the readline function for TOK to ENC. For the string-based
 575    tokenizer, this means to just record the encoding. */
 576 
 577 static int
 578 buf_setreadl(struct tok_state *tok, const char* enc) {
 579     tok->enc = enc;
 580     return 1;
 581 }
 582 
 583 /* Return a UTF-8 encoding Python string object from the
 584    C byte string STR, which is encoded with ENC. */
 585 
 586 #ifdef Py_USING_UNICODE
 587 static PyObject *
 588 translate_into_utf8(const char* str, const char* enc) {
 589     PyObject *utf8;
 590     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 591     if (buf == NULL)
 592         return NULL;
 593     utf8 = PyUnicode_AsUTF8String(buf);
 594     Py_DECREF(buf);
 595     return utf8;
 596 }
 597 #endif
 598 
 599 
 600 static char *
 601 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
 602     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
 603     char *buf, *current;
 604     char c = '\0';
 605     buf = PyMem_MALLOC(needed_length);
 606     if (buf == NULL) {
 607         tok->done = E_NOMEM;
 608         return NULL;
 609     }
 610     for (current = buf; *s; s++, current++) {
 611         c = *s;
 612         if (skip_next_lf) {
 613             skip_next_lf = 0;
 614             if (c == '\n') {
 615                 c = *++s;
 616                 if (!c)
 617                     break;
 618             }
 619         }
 620         if (c == '\r') {
 621             skip_next_lf = 1;
 622             c = '\n';
 623         }
 624         *current = c;
 625     }
 626     /* If this is exec input, add a newline to the end of the string if
 627        there isn't one already. */
 628     if (exec_input && c != '\n') {
 629         *current = '\n';
 630         current++;
 631     }
 632     *current = '\0';
 633     final_length = current - buf + 1;
 634     if (final_length < needed_length && final_length)
 635         /* should never fail */
 636         buf = PyMem_REALLOC(buf, final_length);
 637     return buf;
 638 }
 639 
 640 /* Decode a byte string STR for use as the buffer of TOK.
 641    Look for encoding declarations inside STR, and record them
 642    inside TOK.  */
 643 
 644 static const char *
 645 decode_str(const char *input, int single, struct tok_state *tok)
 646 {
 647     PyObject* utf8 = NULL;
 648     const char *str;
 649     const char *s;
 650     const char *newl[2] = {NULL, NULL};
 651     int lineno = 0;
 652     tok->input = str = translate_newlines(input, single, tok);
 653     if (str == NULL)
 654         return NULL;
 655     tok->enc = NULL;
 656     tok->str = str;
 657     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 658         return error_ret(tok);
 659     str = tok->str;             /* string after BOM if any */
 660     assert(str);
 661 #ifdef Py_USING_UNICODE
 662     if (tok->enc != NULL) {
 663         utf8 = translate_into_utf8(str, tok->enc);
 664         if (utf8 == NULL)
 665             return error_ret(tok);
 666         str = PyString_AsString(utf8);
 667     }
 668 #endif
 669     for (s = str;; s++) {
 670         if (*s == '\0') break;
 671         else if (*s == '\n') {
 672             assert(lineno < 2);
 673             newl[lineno] = s;
 674             lineno++;
 675             if (lineno == 2) break;
 676         }
 677     }
 678     tok->enc = NULL;
 679     /* need to check line 1 and 2 separately since check_coding_spec
 680        assumes a single line as input */
 681     if (newl[0]) {
 682         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 683             return error_ret(tok);
 684         if (tok->enc == NULL && newl[1]) {
 685             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 686                                    tok, buf_setreadl))
 687                 return error_ret(tok);
 688         }
 689     }
 690 #ifdef Py_USING_UNICODE
 691     if (tok->enc != NULL) {
 692         assert(utf8 == NULL);
 693         utf8 = translate_into_utf8(str, tok->enc);
 694         if (utf8 == NULL)
 695             return error_ret(tok);
 696         str = PyString_AsString(utf8);
 697     }
 698 #endif
 699     assert(tok->decoding_buffer == NULL);
 700     tok->decoding_buffer = utf8; /* CAUTION */
 701     return str;
 702 }
 703 
 704 #endif /* PGEN */
 705 
 706 /* Set up tokenizer for string */
 707 
 708 struct tok_state *
 709 PyTokenizer_FromString(const char *str, int exec_input)
 710 {
 711     struct tok_state *tok = tok_new();
 712     if (tok == NULL)
 713         return NULL;
 714     str = (char *)decode_str(str, exec_input, tok);
 715     if (str == NULL) {
 716         PyTokenizer_Free(tok);
 717         return NULL;
 718     }
 719 
 720     /* XXX: constify members. */
 721     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 722     return tok;
 723 }
 724 
 725 
 726 /* Set up tokenizer for file */
 727 
 728 struct tok_state *
 729 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 730 {
 731     struct tok_state *tok = tok_new();
 732     if (tok == NULL)
 733         return NULL;
 734     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 735         PyTokenizer_Free(tok);
 736         return NULL;
 737     }
 738     tok->cur = tok->inp = tok->buf;
 739     tok->end = tok->buf + BUFSIZ;
 740     tok->fp = fp;
 741     tok->prompt = ps1;
 742     tok->nextprompt = ps2;
 743     return tok;
 744 }
 745 
 746 
 747 /* Free a tok_state structure */
 748 
 749 void
 750 PyTokenizer_Free(struct tok_state *tok)
 751 {
 752     if (tok->encoding != NULL)
 753         PyMem_FREE(tok->encoding);
 754 #ifndef PGEN
 755     Py_XDECREF(tok->decoding_readline);
 756     Py_XDECREF(tok->decoding_buffer);
 757 #endif
 758     if (tok->fp != NULL && tok->buf != NULL)
 759         PyMem_FREE(tok->buf);
 760     if (tok->input)
 761         PyMem_FREE((char *)tok->input);
 762     PyMem_FREE(tok);
 763 }
 764 
 765 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 766 static int
 767 tok_stdin_decode(struct tok_state *tok, char **inp)
 768 {
 769     PyObject *enc, *sysstdin, *decoded, *utf8;
 770     const char *encoding;
 771     char *converted;
 772 
 773     if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 774         return 0;
 775     sysstdin = PySys_GetObject("stdin");
 776     if (sysstdin == NULL || !PyFile_Check(sysstdin))
 777         return 0;
 778 
 779     enc = ((PyFileObject *)sysstdin)->f_encoding;
 780     if (enc == NULL || !PyString_Check(enc))
 781         return 0;
 782     Py_INCREF(enc);
 783 
 784     encoding = PyString_AsString(enc);
 785     decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 786     if (decoded == NULL)
 787         goto error_clear;
 788 
 789     utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 790     Py_DECREF(decoded);
 791     if (utf8 == NULL)
 792         goto error_clear;
 793 
 794     assert(PyString_Check(utf8));
 795     converted = new_string(PyString_AS_STRING(utf8),
 796                            PyString_GET_SIZE(utf8));
 797     Py_DECREF(utf8);
 798     if (converted == NULL)
 799         goto error_nomem;
 800 
 801     PyMem_FREE(*inp);
 802     *inp = converted;
 803     if (tok->encoding != NULL)
 804         PyMem_FREE(tok->encoding);
 805     tok->encoding = new_string(encoding, strlen(encoding));
 806     if (tok->encoding == NULL)
 807         goto error_nomem;
 808 
 809     Py_DECREF(enc);
 810     return 0;
 811 
 812 error_nomem:
 813     Py_DECREF(enc);
 814     tok->done = E_NOMEM;
 815     return -1;
 816 
 817 error_clear:
 818     Py_DECREF(enc);
 819     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
 820         tok->done = E_ERROR;
 821         return -1;
 822     }
 823     /* Fallback to iso-8859-1: for backward compatibility */
 824     PyErr_Clear();
 825     return 0;
 826 }
 827 #endif
 828 
 829 /* Get next char, updating state; error code goes into tok->done */
 830 
 831 static int
 832 tok_nextc(register struct tok_state *tok)
 833 {
 834     for (;;) {
 835         if (tok->cur != tok->inp) {
 836             return Py_CHARMASK(*tok->cur++); /* Fast path */
 837         }
 838         if (tok->done != E_OK)
 839             return EOF;
 840         if (tok->fp == NULL) {
 841             char *end = strchr(tok->inp, '\n');
 842             if (end != NULL)
 843                 end++;
 844             else {
 845                 end = strchr(tok->inp, '\0');
 846                 if (end == tok->inp) {
 847                     tok->done = E_EOF;
 848                     return EOF;
 849                 }
 850             }
 851             if (tok->start == NULL)
 852                 tok->buf = tok->cur;
 853             tok->line_start = tok->cur;
 854             tok->lineno++;
 855             tok->inp = end;
 856             return Py_CHARMASK(*tok->cur++);
 857         }
 858         if (tok->prompt != NULL) {
 859             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 860             if (tok->nextprompt != NULL)
 861                 tok->prompt = tok->nextprompt;
 862             if (newtok == NULL)
 863                 tok->done = E_INTR;
 864             else if (*newtok == '\0') {
 865                 PyMem_FREE(newtok);
 866                 tok->done = E_EOF;
 867             }
 868 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 869             else if (tok_stdin_decode(tok, &newtok) != 0)
 870                 PyMem_FREE(newtok);
 871 #endif
 872             else if (tok->start != NULL) {
 873                 size_t start = tok->start - tok->buf;
 874                 size_t oldlen = tok->cur - tok->buf;
 875                 size_t newlen = oldlen + strlen(newtok);
 876                 char *buf = tok->buf;
 877                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 878                 tok->lineno++;
 879                 if (buf == NULL) {
 880                     PyMem_FREE(tok->buf);
 881                     tok->buf = NULL;
 882                     PyMem_FREE(newtok);
 883                     tok->done = E_NOMEM;
 884                     return EOF;
 885                 }
 886                 tok->buf = buf;
 887                 tok->cur = tok->buf + oldlen;
 888                 tok->line_start = tok->cur;
 889                 strcpy(tok->buf + oldlen, newtok);
 890                 PyMem_FREE(newtok);
 891                 tok->inp = tok->buf + newlen;
 892                 tok->end = tok->inp + 1;
 893                 tok->start = tok->buf + start;
 894             }
 895             else {
 896                 tok->lineno++;
 897                 if (tok->buf != NULL)
 898                     PyMem_FREE(tok->buf);
 899                 tok->buf = newtok;
 900                 tok->line_start = tok->buf;
 901                 tok->cur = tok->buf;
 902                 tok->line_start = tok->buf;
 903                 tok->inp = strchr(tok->buf, '\0');
 904                 tok->end = tok->inp + 1;
 905             }
 906         }
 907         else {
 908             int done = 0;
 909             Py_ssize_t cur = 0;
 910             char *pt;
 911             if (tok->start == NULL) {
 912                 if (tok->buf == NULL) {
 913                     tok->buf = (char *)
 914                         PyMem_MALLOC(BUFSIZ);
 915                     if (tok->buf == NULL) {
 916                         tok->done = E_NOMEM;
 917                         return EOF;
 918                     }
 919                     tok->end = tok->buf + BUFSIZ;
 920                 }
 921                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 922                           tok) == NULL) {
 923                     tok->done = E_EOF;
 924                     done = 1;
 925                 }
 926                 else {
 927                     tok->done = E_OK;
 928                     tok->inp = strchr(tok->buf, '\0');
 929                     done = tok->inp[-1] == '\n';
 930                 }
 931             }
 932             else {
 933                 cur = tok->cur - tok->buf;
 934                 if (decoding_feof(tok)) {
 935                     tok->done = E_EOF;
 936                     done = 1;
 937                 }
 938                 else
 939                     tok->done = E_OK;
 940             }
 941             tok->lineno++;
 942             /* Read until '\n' or EOF */
 943             while (!done) {
 944                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 945                           tok->start - tok->buf;
 946                 Py_ssize_t curvalid = tok->inp - tok->buf;
 947                 Py_ssize_t newsize = curvalid + BUFSIZ;
 948                 char *newbuf = tok->buf;
 949                 newbuf = (char *)PyMem_REALLOC(newbuf,
 950                                                newsize);
 951                 if (newbuf == NULL) {
 952                     tok->done = E_NOMEM;
 953                     tok->cur = tok->inp;
 954                     return EOF;
 955                 }
Potential leak of memory pointed to by 'newbuf'
(emitted by clang-analyzer)

Note: a detailed trace for this finding is available in the analyzer's data model but is not yet rendered in this report.
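
clang-analyzer reports a potential leak at a point where, on some path, the last live pointer to an allocated block appears to go out of scope. In this loop the block returned by PyMem_REALLOC (lines 949-950) is stored into tok->buf right after the NULL check (line 956), and on the failure path the old buffer is still reachable through tok->buf and is released later by PyTokenizer_Free (lines 758-759), so the finding may well be a false positive; without the rendered trace, the exact path the checker has in mind cannot be confirmed here. For reference, a minimal sketch of the temporary-pointer realloc idiom that such checkers generally accept is shown below; it is a generic illustration in plain C with a hypothetical function name, not a proposed patch to tokenizer.c.

    #include <stdlib.h>

    /* Generic illustration only: keep realloc's result in a temporary so
     * that neither the old nor the new block can be lost on any path.
     * 'grow_buffer' and its parameters are hypothetical names. */
    static char *
    grow_buffer(char *buf, size_t newsize)
    {
        char *newbuf = realloc(buf, newsize);
        if (newbuf == NULL)
            return NULL;    /* failure: 'buf' is untouched and still owned
                               by the caller, so nothing leaks here */
        return newbuf;      /* success: realloc has already disposed of the
                               old block; hand back the new one */
    }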

  956                 tok->buf = newbuf;
  957                 tok->inp = tok->buf + curvalid;
  958                 tok->end = tok->buf + newsize;
  959                 tok->start = curstart < 0 ? NULL :
  960                              tok->buf + curstart;
  961                 if (decoding_fgets(tok->inp,
  962                                (int)(tok->end - tok->inp),
  963                                tok) == NULL) {
  964                     /* Break out early on decoding
  965                        errors, as tok->buf will be NULL
  966                     */
  967                     if (tok->decoding_erred)
  968                         return EOF;
  969                     /* Last line does not end in \n,
  970                        fake one */
  971                     strcpy(tok->inp, "\n");
  972                 }
  973                 tok->inp = strchr(tok->inp, '\0');
  974                 done = tok->inp[-1] == '\n';
  975             }
  976             if (tok->buf != NULL) {
  977                 tok->cur = tok->buf + cur;
  978                 tok->line_start = tok->cur;
  979                 /* replace "\r\n" with "\n" */
  980                 /* For Mac leave the \r, giving a syntax error */
  981                 pt = tok->inp - 2;
  982                 if (pt >= tok->buf && *pt == '\r') {
  983                     *pt++ = '\n';
  984                     *pt = '\0';
  985                     tok->inp = pt;
  986                 }
  987             }
  988         }
  989         if (tok->done != E_OK) {
  990             if (tok->prompt != NULL)
  991                 PySys_WriteStderr("\n");
  992             tok->cur = tok->inp;
  993             return EOF;
  994         }
  995     }
  996     /*NOTREACHED*/
  997 }
  998 
  999 
 1000 /* Back-up one character */
 1001 
 1002 static void
 1003 tok_backup(register struct tok_state *tok, register int c)
 1004 {
 1005     if (c != EOF) {
 1006         if (--tok->cur < tok->buf)
 1007             Py_FatalError("tok_backup: beginning of buffer");
 1008         if (*tok->cur != c)
 1009             *tok->cur = c;
 1010     }
 1011 }
 1012 
 1013 
 1014 /* Return the token corresponding to a single character */
 1015 
 1016 int
 1017 PyToken_OneChar(int c)
 1018 {
 1019     switch (c) {
 1020     case '(': return LPAR;
 1021     case ')': return RPAR;
 1022     case '[': return LSQB;
 1023     case ']': return RSQB;
 1024     case ':': return COLON;
 1025     case ',': return COMMA;
 1026     case ';': return SEMI;
 1027     case '+': return PLUS;
 1028     case '-': return MINUS;
 1029     case '*': return STAR;
 1030     case '/': return SLASH;
 1031     case '|': return VBAR;
 1032     case '&': return AMPER;
 1033     case '<': return LESS;
 1034     case '>': return GREATER;
 1035     case '=': return EQUAL;
 1036     case '.': return DOT;
 1037     case '%': return PERCENT;
 1038     case '`': return BACKQUOTE;
 1039     case '{': return LBRACE;
 1040     case '}': return RBRACE;
 1041     case '^': return CIRCUMFLEX;
 1042     case '~': return TILDE;
 1043     case '@': return AT;
 1044     default: return OP;
 1045     }
 1046 }
 1047 
 1048 
 1049 int
 1050 PyToken_TwoChars(int c1, int c2)
 1051 {
 1052     switch (c1) {
 1053     case '=':
 1054         switch (c2) {
 1055         case '=': return EQEQUAL;
 1056         }
 1057         break;
 1058     case '!':
 1059         switch (c2) {
 1060         case '=': return NOTEQUAL;
 1061         }
 1062         break;
 1063     case '<':
 1064         switch (c2) {
 1065         case '>': return NOTEQUAL;
 1066         case '=': return LESSEQUAL;
 1067         case '<': return LEFTSHIFT;
 1068         }
 1069         break;
 1070     case '>':
 1071         switch (c2) {
 1072         case '=': return GREATEREQUAL;
 1073         case '>': return RIGHTSHIFT;
 1074         }
 1075         break;
 1076     case '+':
 1077         switch (c2) {
 1078         case '=': return PLUSEQUAL;
 1079         }
 1080         break;
 1081     case '-':
 1082         switch (c2) {
 1083         case '=': return MINEQUAL;
 1084         }
 1085         break;
 1086     case '*':
 1087         switch (c2) {
 1088         case '*': return DOUBLESTAR;
 1089         case '=': return STAREQUAL;
 1090         }
 1091         break;
 1092     case '/':
 1093         switch (c2) {
 1094         case '/': return DOUBLESLASH;
 1095         case '=': return SLASHEQUAL;
 1096         }
 1097         break;
 1098     case '|':
 1099         switch (c2) {
 1100         case '=': return VBAREQUAL;
 1101         }
 1102         break;
 1103     case '%':
 1104         switch (c2) {
 1105         case '=': return PERCENTEQUAL;
 1106         }
 1107         break;
 1108     case '&':
 1109         switch (c2) {
 1110         case '=': return AMPEREQUAL;
 1111         }
 1112         break;
 1113     case '^':
 1114         switch (c2) {
 1115         case '=': return CIRCUMFLEXEQUAL;
 1116         }
 1117         break;
 1118     }
 1119     return OP;
 1120 }
 1121 
 1122 int
 1123 PyToken_ThreeChars(int c1, int c2, int c3)
 1124 {
 1125     switch (c1) {
 1126     case '<':
 1127         switch (c2) {
 1128         case '<':
 1129             switch (c3) {
 1130             case '=':
 1131                 return LEFTSHIFTEQUAL;
 1132             }
 1133             break;
 1134         }
 1135         break;
 1136     case '>':
 1137         switch (c2) {
 1138         case '>':
 1139             switch (c3) {
 1140             case '=':
 1141                 return RIGHTSHIFTEQUAL;
 1142             }
 1143             break;
 1144         }
 1145         break;
 1146     case '*':
 1147         switch (c2) {
 1148         case '*':
 1149             switch (c3) {
 1150             case '=':
 1151                 return DOUBLESTAREQUAL;
 1152             }
 1153             break;
 1154         }
 1155         break;
 1156     case '/':
 1157         switch (c2) {
 1158         case '/':
 1159             switch (c3) {
 1160             case '=':
 1161                 return DOUBLESLASHEQUAL;
 1162             }
 1163             break;
 1164         }
 1165         break;
 1166     }
 1167     return OP;
 1168 }
 1169 
 1170 static int
 1171 indenterror(struct tok_state *tok)
 1172 {
 1173     if (tok->alterror) {
 1174         tok->done = E_TABSPACE;
 1175         tok->cur = tok->inp;
 1176         return 1;
 1177     }
 1178     if (tok->altwarning) {
 1179         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
 1180                           "in indentation\n", tok->filename);
 1181         tok->altwarning = 0;
 1182     }
 1183     return 0;
 1184 }
 1185 
 1186 /* Get next token, after space stripping etc. */
 1187 
 1188 static int
 1189 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 1190 {
 1191     register int c;
 1192     int blankline;
 1193 
 1194     *p_start = *p_end = NULL;
 1195   nextline:
 1196     tok->start = NULL;
 1197     blankline = 0;
 1198 
 1199     /* Get indentation level */
 1200     if (tok->atbol) {
 1201         register int col = 0;
 1202         register int altcol = 0;
 1203         tok->atbol = 0;
 1204         for (;;) {
 1205             c = tok_nextc(tok);
 1206             if (c == ' ')
 1207                 col++, altcol++;
 1208             else if (c == '\t') {
 1209                 col = (col/tok->tabsize + 1) * tok->tabsize;
 1210                 altcol = (altcol/tok->alttabsize + 1)
 1211                     * tok->alttabsize;
 1212             }
 1213             else if (c == '\014') /* Control-L (formfeed) */
 1214                 col = altcol = 0; /* For Emacs users */
 1215             else
 1216                 break;
 1217         }
 1218         tok_backup(tok, c);
 1219         if (c == '#' || c == '\n') {
 1220             /* Lines with only whitespace and/or comments
 1221                shouldn't affect the indentation and are
 1222                not passed to the parser as NEWLINE tokens,
 1223                except *totally* empty lines in interactive
 1224                mode, which signal the end of a command group. */
 1225             if (col == 0 && c == '\n' && tok->prompt != NULL)
 1226                 blankline = 0; /* Let it through */
 1227             else
 1228                 blankline = 1; /* Ignore completely */
 1229             /* We can't jump back right here since we still
 1230                may need to skip to the end of a comment */
 1231         }
 1232         if (!blankline && tok->level == 0) {
 1233             if (col == tok->indstack[tok->indent]) {
 1234                 /* No change */
 1235                 if (altcol != tok->altindstack[tok->indent]) {
 1236                     if (indenterror(tok))
 1237                         return ERRORTOKEN;
 1238                 }
 1239             }
 1240             else if (col > tok->indstack[tok->indent]) {
 1241                 /* Indent -- always one */
 1242                 if (tok->indent+1 >= MAXINDENT) {
 1243                     tok->done = E_TOODEEP;
 1244                     tok->cur = tok->inp;
 1245                     return ERRORTOKEN;
 1246                 }
 1247                 if (altcol <= tok->altindstack[tok->indent]) {
 1248                     if (indenterror(tok))
 1249                         return ERRORTOKEN;
 1250                 }
 1251                 tok->pendin++;
 1252                 tok->indstack[++tok->indent] = col;
 1253                 tok->altindstack[tok->indent] = altcol;
 1254             }
 1255             else /* col < tok->indstack[tok->indent] */ {
 1256                 /* Dedent -- any number, must be consistent */
 1257                 while (tok->indent > 0 &&
 1258                     col < tok->indstack[tok->indent]) {
 1259                     tok->pendin--;
 1260                     tok->indent--;
 1261                 }
 1262                 if (col != tok->indstack[tok->indent]) {
 1263                     tok->done = E_DEDENT;
 1264                     tok->cur = tok->inp;
 1265                     return ERRORTOKEN;
 1266                 }
 1267                 if (altcol != tok->altindstack[tok->indent]) {
 1268                     if (indenterror(tok))
 1269                         return ERRORTOKEN;
 1270                 }
 1271             }
 1272         }
 1273     }
 1274 
 1275     tok->start = tok->cur;
 1276 
 1277     /* Return pending indents/dedents */
 1278     if (tok->pendin != 0) {
 1279         if (tok->pendin < 0) {
 1280             tok->pendin++;
 1281             return DEDENT;
 1282         }
 1283         else {
 1284             tok->pendin--;
 1285             return INDENT;
 1286         }
 1287     }
 1288 
 1289   again:
 1290     tok->start = NULL;
 1291     /* Skip spaces */
 1292     do {
 1293         c = tok_nextc(tok);
 1294     } while (c == ' ' || c == '\t' || c == '\014');
 1295 
 1296     /* Set start of current token */
 1297     tok->start = tok->cur - 1;
 1298 
 1299     /* Skip comment, while looking for tab-setting magic */
 1300     if (c == '#') {
 1301         static char *tabforms[] = {
 1302             "tab-width:", /* Emacs */
 1303             ":tabstop=", /* vim, full form */
 1304             ":ts=", /* vim, abbreviated form */
 1305             "set tabsize=", /* will vi never die? */
 1306             /* more templates can be added here to support other editors */
 1307         };
 1308         char cbuf[80];
 1309         char *tp, **cp;
 1310         tp = cbuf;
 1311         do {
 1312             *tp++ = c = tok_nextc(tok);
 1313         } while (c != EOF && c != '\n' &&
 1314                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
 1315         *tp = '\0';
 1316         for (cp = tabforms;
 1317              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
 1318              cp++) {
 1319             if ((tp = strstr(cbuf, *cp))) {
 1320                 int newsize = atoi(tp + strlen(*cp));
 1321 
 1322                 if (newsize >= 1 && newsize <= 40) {
 1323                     tok->tabsize = newsize;
 1324                     if (Py_VerboseFlag)
 1325                         PySys_WriteStderr(
 1326                             "Tab size set to %d\n",
 1327                             newsize);
 1328                 }
 1329             }
 1330         }
 1331         while (c != EOF && c != '\n')
 1332             c = tok_nextc(tok);
 1333     }
 1334 
 1335     /* Check for EOF and errors now */
 1336     if (c == EOF) {
 1337         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
 1338     }
 1339 
 1340     /* Identifier (most frequent token!) */
 1341     if (Py_ISALPHA(c) || c == '_') {
 1342         /* Process r"", u"" and ur"" */
 1343         switch (c) {
 1344         case 'b':
 1345         case 'B':
 1346             c = tok_nextc(tok);
 1347             if (c == 'r' || c == 'R')
 1348                 c = tok_nextc(tok);
 1349             if (c == '"' || c == '\'')
 1350                 goto letter_quote;
 1351             break;
 1352         case 'r':
 1353         case 'R':
 1354             c = tok_nextc(tok);
 1355             if (c == '"' || c == '\'')
 1356                 goto letter_quote;
 1357             break;
 1358         case 'u':
 1359         case 'U':
 1360             c = tok_nextc(tok);
 1361             if (c == 'r' || c == 'R')
 1362                 c = tok_nextc(tok);
 1363             if (c == '"' || c == '\'')
 1364                 goto letter_quote;
 1365             break;
 1366         }
 1367         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
 1368             c = tok_nextc(tok);
 1369         }
 1370         tok_backup(tok, c);
 1371         *p_start = tok->start;
 1372         *p_end = tok->cur;
 1373         return NAME;
 1374     }
 1375 
 1376     /* Newline */
 1377     if (c == '\n') {
 1378         tok->atbol = 1;
 1379         if (blankline || tok->level > 0)
 1380             goto nextline;
 1381         *p_start = tok->start;
 1382         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
 1383         tok->cont_line = 0;
 1384         return NEWLINE;
 1385     }
 1386 
 1387     /* Period or number starting with period? */
 1388     if (c == '.') {
 1389         c = tok_nextc(tok);
 1390         if (isdigit(c)) {
 1391             goto fraction;
 1392         }
 1393         else {
 1394             tok_backup(tok, c);
 1395             *p_start = tok->start;
 1396             *p_end = tok->cur;
 1397             return DOT;
 1398         }
 1399     }
 1400 
 1401     /* Number */
 1402     if (isdigit(c)) {
 1403         if (c == '0') {
 1404             /* Hex, octal or binary -- maybe. */
 1405             c = tok_nextc(tok);
 1406             if (c == '.')
 1407                 goto fraction;
 1408 #ifndef WITHOUT_COMPLEX
 1409             if (c == 'j' || c == 'J')
 1410                 goto imaginary;
 1411 #endif
 1412             if (c == 'x' || c == 'X') {
 1413 
 1414                 /* Hex */
 1415                 c = tok_nextc(tok);
 1416                 if (!isxdigit(c)) {
 1417                     tok->done = E_TOKEN;
 1418                     tok_backup(tok, c);
 1419                     return ERRORTOKEN;
 1420                 }
 1421                 do {
 1422                     c = tok_nextc(tok);
 1423                 } while (isxdigit(c));
 1424             }
 1425             else if (c == 'o' || c == 'O') {
 1426                 /* Octal */
 1427                 c = tok_nextc(tok);
 1428                 if (c < '0' || c >= '8') {
 1429                     tok->done = E_TOKEN;
 1430                     tok_backup(tok, c);
 1431                     return ERRORTOKEN;
 1432                 }
 1433                 do {
 1434                     c = tok_nextc(tok);
 1435                 } while ('0' <= c && c < '8');
 1436             }
 1437             else if (c == 'b' || c == 'B') {
 1438                 /* Binary */
 1439                 c = tok_nextc(tok);
 1440                 if (c != '0' && c != '1') {
 1441                     tok->done = E_TOKEN;
 1442                     tok_backup(tok, c);
 1443                     return ERRORTOKEN;
 1444                 }
 1445                 do {
 1446                     c = tok_nextc(tok);
 1447                 } while (c == '0' || c == '1');
 1448             }
 1449             else {
 1450                 int found_decimal = 0;
 1451                 /* Octal; c is first char of it */
 1452                 /* There's no 'isoctdigit' macro, sigh */
 1453                 while ('0' <= c && c < '8') {
 1454                     c = tok_nextc(tok);
 1455                 }
 1456                 if (isdigit(c)) {
 1457                     found_decimal = 1;
 1458                     do {
 1459                         c = tok_nextc(tok);
 1460                     } while (isdigit(c));
 1461                 }
 1462                 if (c == '.')
 1463                     goto fraction;
 1464                 else if (c == 'e' || c == 'E')
 1465                     goto exponent;
 1466 #ifndef WITHOUT_COMPLEX
 1467                 else if (c == 'j' || c == 'J')
 1468                     goto imaginary;
 1469 #endif
 1470                 else if (found_decimal) {
 1471                     tok->done = E_TOKEN;
 1472                     tok_backup(tok, c);
 1473                     return ERRORTOKEN;
 1474                 }
 1475             }
 1476             if (c == 'l' || c == 'L')
 1477                 c = tok_nextc(tok);
 1478         }
 1479         else {
 1480             /* Decimal */
 1481             do {
 1482                 c = tok_nextc(tok);
 1483             } while (isdigit(c));
 1484             if (c == 'l' || c == 'L')
 1485                 c = tok_nextc(tok);
 1486             else {
 1487                 /* Accept floating point numbers. */
 1488                 if (c == '.') {
 1489         fraction:
 1490                     /* Fraction */
 1491                     do {
 1492                         c = tok_nextc(tok);
 1493                     } while (isdigit(c));
 1494                 }
 1495                 if (c == 'e' || c == 'E') {
 1496         exponent:
 1497                     /* Exponent part */
 1498                     c = tok_nextc(tok);
 1499                     if (c == '+' || c == '-')
 1500                         c = tok_nextc(tok);
 1501                     if (!isdigit(c)) {
 1502                         tok->done = E_TOKEN;
 1503                         tok_backup(tok, c);
 1504                         return ERRORTOKEN;
 1505                     }
 1506                     do {
 1507                         c = tok_nextc(tok);
 1508                     } while (isdigit(c));
 1509                 }
 1510 #ifndef WITHOUT_COMPLEX
 1511                 if (c == 'j' || c == 'J')
 1512                     /* Imaginary part */
 1513         imaginary:
 1514                     c = tok_nextc(tok);
 1515 #endif
 1516             }
 1517         }
 1518         tok_backup(tok, c);
 1519         *p_start = tok->start;
 1520         *p_end = tok->cur;
 1521         return NUMBER;
 1522     }
 1523 
 1524   letter_quote:
 1525     /* String */
 1526     if (c == '\'' || c == '"') {
 1527         Py_ssize_t quote2 = tok->cur - tok->start + 1;
 1528         int quote = c;
 1529         int triple = 0;
 1530         int tripcount = 0;
 1531         for (;;) {
 1532             c = tok_nextc(tok);
 1533             if (c == '\n') {
 1534                 if (!triple) {
 1535                     tok->done = E_EOLS;
 1536                     tok_backup(tok, c);
 1537                     return ERRORTOKEN;
 1538                 }
 1539                 tripcount = 0;
 1540                 tok->cont_line = 1; /* multiline string. */
 1541             }
 1542             else if (c == EOF) {
 1543                 if (triple)
 1544                     tok->done = E_EOFS;
 1545                 else
 1546                     tok->done = E_EOLS;
 1547                 tok->cur = tok->inp;
 1548                 return ERRORTOKEN;
 1549             }
 1550             else if (c == quote) {
 1551                 tripcount++;
 1552                 if (tok->cur - tok->start == quote2) {
 1553                     c = tok_nextc(tok);
 1554                     if (c == quote) {
 1555                         triple = 1;
 1556                         tripcount = 0;
 1557                         continue;
 1558                     }
 1559                     tok_backup(tok, c);
 1560                 }
 1561                 if (!triple || tripcount == 3)
 1562                     break;
 1563             }
 1564             else if (c == '\\') {
 1565                 tripcount = 0;
 1566                 c = tok_nextc(tok);
 1567                 if (c == EOF) {
 1568                     tok->done = E_EOLS;
 1569                     tok->cur = tok->inp;
 1570                     return ERRORTOKEN;
 1571                 }
 1572             }
 1573             else
 1574                 tripcount = 0;
 1575         }
 1576         *p_start = tok->start;
 1577         *p_end = tok->cur;
 1578         return STRING;
 1579     }
 1580 
 1581     /* Line continuation */
 1582     if (c == '\\') {
 1583         c = tok_nextc(tok);
 1584         if (c != '\n') {
 1585             tok->done = E_LINECONT;
 1586             tok->cur = tok->inp;
 1587             return ERRORTOKEN;
 1588         }
 1589         tok->cont_line = 1;
 1590         goto again; /* Read next line */
 1591     }
 1592 
 1593     /* Check for two-character token */
 1594     {
 1595         int c2 = tok_nextc(tok);
 1596         int token = PyToken_TwoChars(c, c2);
 1597 #ifndef PGEN
 1598         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
 1599             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
 1600                                    "<> not supported in 3.x; use !=",
 1601                                    tok->filename, tok->lineno,
 1602                                    NULL, NULL)) {
 1603                 return ERRORTOKEN;
 1604             }
 1605         }
 1606 #endif
 1607         if (token != OP) {
 1608             int c3 = tok_nextc(tok);
 1609             int token3 = PyToken_ThreeChars(c, c2, c3);
 1610             if (token3 != OP) {
 1611                 token = token3;
 1612             } else {
 1613                 tok_backup(tok, c3);
 1614             }
 1615             *p_start = tok->start;
 1616             *p_end = tok->cur;
 1617             return token;
 1618         }
 1619         tok_backup(tok, c2);
 1620     }
 1621 
 1622     /* Keep track of parentheses nesting level */
 1623     switch (c) {
 1624     case '(':
 1625     case '[':
 1626     case '{':
 1627         tok->level++;
 1628         break;
 1629     case ')':
 1630     case ']':
 1631     case '}':
 1632         tok->level--;
 1633         break;
 1634     }
 1635 
 1636     /* Punctuation character */
 1637     *p_start = tok->start;
 1638     *p_end = tok->cur;
 1639     return PyToken_OneChar(c);
 1640 }
 1641 
 1642 int
 1643 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 1644 {
 1645     int result = tok_get(tok, p_start, p_end);
 1646     if (tok->decoding_erred) {
 1647         result = ERRORTOKEN;
 1648         tok->done = E_DECODE;
 1649     }
 1650     return result;
 1651 }
 1652 
 1653 /* This function is only called from parsetok. However, it cannot live
 1654    there, as it must be empty for PGEN, and we can check for PGEN only
 1655    in this file. */
 1656 
 1657 #if defined(PGEN) || !defined(Py_USING_UNICODE)
 1658 char*
 1659 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
 1660 {
 1661     return NULL;
 1662 }
 1663 #else
 1664 #ifdef Py_USING_UNICODE
 1665 static PyObject *
 1666 dec_utf8(const char *enc, const char *text, size_t len) {
 1667     PyObject *ret = NULL;
 1668     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
 1669     if (unicode_text) {
 1670         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
 1671         Py_DECREF(unicode_text);
 1672     }
 1673     if (!ret) {
 1674         PyErr_Clear();
 1675     }
 1676     return ret;
 1677 }
 1678 char *
 1679 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
 1680 {
 1681     char *text = NULL;
 1682     if (tok->encoding) {
 1683         /* convert source to original encondig */
 1684         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
 1685         if (lineobj != NULL) {
 1686             int linelen = PyString_Size(lineobj);
 1687             const char *line = PyString_AsString(lineobj);
 1688             text = PyObject_MALLOC(linelen + 1);
 1689             if (text != NULL && line != NULL) {
 1690                 if (linelen)
 1691                     strncpy(text, line, linelen);
 1692                 text[linelen] = '\0';
 1693             }
 1694             Py_DECREF(lineobj);
 1695 
 1696             /* adjust error offset */
 1697             if (*offset > 1) {
 1698                 PyObject *offsetobj = dec_utf8(tok->encoding,
 1699                                                tok->buf, *offset-1);
 1700                 if (offsetobj) {
 1701                     *offset = PyString_Size(offsetobj) + 1;
 1702                     Py_DECREF(offsetobj);
 1703                 }
 1704             }
 1705 
 1706         }
 1707     }
 1708     return text;
 1709 
 1710 }
 1711 #endif /* defined(Py_USING_UNICODE) */
 1712 #endif
 1713 
 1714 
 1715 #ifdef Py_DEBUG
 1716 
 1717 void
 1718 tok_dump(int type, char *start, char *end)
 1719 {
 1720     printf("%s", _PyParser_TokenNames[type]);
 1721     if (type == NAME || type == NUMBER || type == STRING || type == OP)
 1722         printf("(%.*s)", (int)(end - start), start);
 1723 }
 1724 
 1725 #endif