Python-2.7.3/Parser/tokenizer.c

Location: /builddir/build/BUILD/Python-2.7.3/Parser/tokenizer.c:955:21
Tool:     clang-analyzer
Test ID:
Function:
Issue:    Potential leak of memory pointed to by 'newbuf'
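
The flagged location falls inside tok_nextc(), in the loop that grows tok->buf with PyMem_REALLOC while reading a line from a file (listing lines 943-955); the analyzer's message is reproduced inline at that point in the listing below.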
   1 /* Tokenizer implementation */
   2 
   3 #include "Python.h"
   4 #include "pgenheaders.h"
   5 
   6 #include <ctype.h>
   7 #include <assert.h>
   8 
   9 #include "tokenizer.h"
  10 #include "errcode.h"
  11 
  12 #ifndef PGEN
  13 #include "unicodeobject.h"
  14 #include "stringobject.h"
  15 #include "fileobject.h"
  16 #include "codecs.h"
  17 #include "abstract.h"
  18 #include "pydebug.h"
  19 #endif /* PGEN */
  20 
  21 extern char *PyOS_Readline(FILE *, FILE *, char *);
  22 /* Return malloc'ed string including trailing \n;
  23    empty malloc'ed string for EOF;
  24    NULL if interrupted */
  25 
  26 /* Don't ever change this -- it would break the portability of Python code */
  27 #define TABSIZE 8
  28 
  29 /* Forward */
  30 static struct tok_state *tok_new(void);
  31 static int tok_nextc(struct tok_state *tok);
  32 static void tok_backup(struct tok_state *tok, int c);
  33 
  34 /* Token names */
  35 
  36 char *_PyParser_TokenNames[] = {
  37     "ENDMARKER",
  38     "NAME",
  39     "NUMBER",
  40     "STRING",
  41     "NEWLINE",
  42     "INDENT",
  43     "DEDENT",
  44     "LPAR",
  45     "RPAR",
  46     "LSQB",
  47     "RSQB",
  48     "COLON",
  49     "COMMA",
  50     "SEMI",
  51     "PLUS",
  52     "MINUS",
  53     "STAR",
  54     "SLASH",
  55     "VBAR",
  56     "AMPER",
  57     "LESS",
  58     "GREATER",
  59     "EQUAL",
  60     "DOT",
  61     "PERCENT",
  62     "BACKQUOTE",
  63     "LBRACE",
  64     "RBRACE",
  65     "EQEQUAL",
  66     "NOTEQUAL",
  67     "LESSEQUAL",
  68     "GREATEREQUAL",
  69     "TILDE",
  70     "CIRCUMFLEX",
  71     "LEFTSHIFT",
  72     "RIGHTSHIFT",
  73     "DOUBLESTAR",
  74     "PLUSEQUAL",
  75     "MINEQUAL",
  76     "STAREQUAL",
  77     "SLASHEQUAL",
  78     "PERCENTEQUAL",
  79     "AMPEREQUAL",
  80     "VBAREQUAL",
  81     "CIRCUMFLEXEQUAL",
  82     "LEFTSHIFTEQUAL",
  83     "RIGHTSHIFTEQUAL",
  84     "DOUBLESTAREQUAL",
  85     "DOUBLESLASH",
  86     "DOUBLESLASHEQUAL",
  87     "AT",
  88     /* This table must match the #defines in token.h! */
  89     "OP",
  90     "<ERRORTOKEN>",
  91     "<N_TOKENS>"
  92 };
  93 
  94 /* Create and initialize a new tok_state structure */
  95 
  96 static struct tok_state *
  97 tok_new(void)
  98 {
  99     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 100                                             sizeof(struct tok_state));
 101     if (tok == NULL)
 102         return NULL;
 103     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 104     tok->done = E_OK;
 105     tok->fp = NULL;
 106     tok->input = NULL;
 107     tok->tabsize = TABSIZE;
 108     tok->indent = 0;
 109     tok->indstack[0] = 0;
 110     tok->atbol = 1;
 111     tok->pendin = 0;
 112     tok->prompt = tok->nextprompt = NULL;
 113     tok->lineno = 0;
 114     tok->level = 0;
 115     tok->filename = NULL;
 116     tok->altwarning = 0;
 117     tok->alterror = 0;
 118     tok->alttabsize = 1;
 119     tok->altindstack[0] = 0;
 120     tok->decoding_state = 0;
 121     tok->decoding_erred = 0;
 122     tok->read_coding_spec = 0;
 123     tok->encoding = NULL;
 124     tok->cont_line = 0;
 125 #ifndef PGEN
 126     tok->decoding_readline = NULL;
 127     tok->decoding_buffer = NULL;
 128 #endif
 129     return tok;
 130 }
 131 
 132 static char *
 133 new_string(const char *s, Py_ssize_t len)
 134 {
 135     char* result = (char *)PyMem_MALLOC(len + 1);
 136     if (result != NULL) {
 137         memcpy(result, s, len);
 138         result[len] = '\0';
 139     }
 140     return result;
 141 }
 142 
 143 #ifdef PGEN
 144 
 145 static char *
 146 decoding_fgets(char *s, int size, struct tok_state *tok)
 147 {
 148     return fgets(s, size, tok->fp);
 149 }
 150 
 151 static int
 152 decoding_feof(struct tok_state *tok)
 153 {
 154     return feof(tok->fp);
 155 }
 156 
 157 static char *
 158 decode_str(const char *str, int exec_input, struct tok_state *tok)
 159 {
 160     return new_string(str, strlen(str));
 161 }
 162 
 163 #else /* PGEN */
 164 
 165 static char *
 166 error_ret(struct tok_state *tok) /* XXX */
 167 {
 168     tok->decoding_erred = 1;
 169     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 170         PyMem_FREE(tok->buf);
 171     tok->buf = NULL;
 172     return NULL;                /* as if it were EOF */
 173 }
 174 
 175 
 176 static char *
 177 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 178 {
 179     char buf[13];
 180     int i;
 181     for (i = 0; i < 12; i++) {
 182         int c = s[i];
 183         if (c == '\0')
 184             break;
 185         else if (c == '_')
 186             buf[i] = '-';
 187         else
 188             buf[i] = tolower(c);
 189     }
 190     buf[i] = '\0';
 191     if (strcmp(buf, "utf-8") == 0 ||
 192         strncmp(buf, "utf-8-", 6) == 0)
 193         return "utf-8";
 194     else if (strcmp(buf, "latin-1") == 0 ||
 195              strcmp(buf, "iso-8859-1") == 0 ||
 196              strcmp(buf, "iso-latin-1") == 0 ||
 197              strncmp(buf, "latin-1-", 8) == 0 ||
 198              strncmp(buf, "iso-8859-1-", 11) == 0 ||
 199              strncmp(buf, "iso-latin-1-", 12) == 0)
 200         return "iso-8859-1";
 201     else
 202         return s;
 203 }
 204 
 205 /* Return the coding spec in S, or NULL if none is found.  */
 206 
 207 static char *
 208 get_coding_spec(const char *s, Py_ssize_t size)
 209 {
 210     Py_ssize_t i;
 211     /* Coding spec must be in a comment, and that comment must be
 212      * the only statement on the source code line. */
 213     for (i = 0; i < size - 6; i++) {
 214         if (s[i] == '#')
 215             break;
 216         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 217             return NULL;
 218     }
 219     for (; i < size - 6; i++) { /* XXX inefficient search */
 220         const char* t = s + i;
 221         if (strncmp(t, "coding", 6) == 0) {
 222             const char* begin = NULL;
 223             t += 6;
 224             if (t[0] != ':' && t[0] != '=')
 225                 continue;
 226             do {
 227                 t++;
 228             } while (t[0] == '\x20' || t[0] == '\t');
 229 
 230             begin = t;
 231             while (Py_ISALNUM(t[0]) ||
 232                    t[0] == '-' || t[0] == '_' || t[0] == '.')
 233                 t++;
 234 
 235             if (begin < t) {
 236                 char* r = new_string(begin, t - begin);
 237                 char* q = get_normal_name(r);
 238                 if (r != q) {
 239                     PyMem_FREE(r);
 240                     r = new_string(q, strlen(q));
 241                 }
 242                 return r;
 243             }
 244         }
 245     }
 246     return NULL;
 247 }
 248 
 249 /* Check whether the line contains a coding spec. If it does,
 250    invoke the set_readline function for the new encoding.
 251    This function receives the tok_state and the new encoding.
 252    Return 1 on success, 0 on failure.  */
 253 
 254 static int
 255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 256                   int set_readline(struct tok_state *, const char *))
 257 {
 258     char * cs;
 259     int r = 1;
 260 
 261     if (tok->cont_line)
 262         /* It's a continuation line, so it can't be a coding spec. */
 263         return 1;
 264     cs = get_coding_spec(line, size);
 265     if (cs != NULL) {
 266         tok->read_coding_spec = 1;
 267         if (tok->encoding == NULL) {
 268             assert(tok->decoding_state == 1); /* raw */
 269             if (strcmp(cs, "utf-8") == 0 ||
 270                 strcmp(cs, "iso-8859-1") == 0) {
 271                 tok->encoding = cs;
 272             } else {
 273 #ifdef Py_USING_UNICODE
 274                 r = set_readline(tok, cs);
 275                 if (r) {
 276                     tok->encoding = cs;
 277                     tok->decoding_state = -1;
 278                 }
 279                 else
 280                     PyMem_FREE(cs);
 281 #else
 282                 /* Without Unicode support, we cannot
 283                    process the coding spec. Since there
 284                    won't be any Unicode literals, that
 285                    won't matter. */
 286                 PyMem_FREE(cs);
 287 #endif
 288             }
 289         } else {                /* then, compare cs with BOM */
 290             r = (strcmp(tok->encoding, cs) == 0);
 291             PyMem_FREE(cs);
 292         }
 293     }
 294     if (!r) {
 295         cs = tok->encoding;
 296         if (!cs)
 297             cs = "with BOM";
 298         PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 299     }
 300     return r;
 301 }
 302 
 303 /* See whether the file starts with a BOM. If it does,
 304    invoke the set_readline function with the new encoding.
 305    Return 1 on success, 0 on failure.  */
 306 
 307 static int
 308 check_bom(int get_char(struct tok_state *),
 309           void unget_char(int, struct tok_state *),
 310           int set_readline(struct tok_state *, const char *),
 311           struct tok_state *tok)
 312 {
 313     int ch1, ch2, ch3;
 314     ch1 = get_char(tok);
 315     tok->decoding_state = 1;
 316     if (ch1 == EOF) {
 317         return 1;
 318     } else if (ch1 == 0xEF) {
 319         ch2 = get_char(tok);
 320         if (ch2 != 0xBB) {
 321             unget_char(ch2, tok);
 322             unget_char(ch1, tok);
 323             return 1;
 324         }
 325         ch3 = get_char(tok);
 326         if (ch3 != 0xBF) {
 327             unget_char(ch3, tok);
 328             unget_char(ch2, tok);
 329             unget_char(ch1, tok);
 330             return 1;
 331         }
 332 #if 0
 333     /* Disable support for UTF-16 BOMs until a decision
 334        is made whether this needs to be supported.  */
 335     } else if (ch1 == 0xFE) {
 336         ch2 = get_char(tok);
 337         if (ch2 != 0xFF) {
 338             unget_char(ch2, tok);
 339             unget_char(ch1, tok);
 340             return 1;
 341         }
 342         if (!set_readline(tok, "utf-16-be"))
 343             return 0;
 344         tok->decoding_state = -1;
 345     } else if (ch1 == 0xFF) {
 346         ch2 = get_char(tok);
 347         if (ch2 != 0xFE) {
 348             unget_char(ch2, tok);
 349             unget_char(ch1, tok);
 350             return 1;
 351         }
 352         if (!set_readline(tok, "utf-16-le"))
 353             return 0;
 354         tok->decoding_state = -1;
 355 #endif
 356     } else {
 357         unget_char(ch1, tok);
 358         return 1;
 359     }
 360     if (tok->encoding != NULL)
 361         PyMem_FREE(tok->encoding);
 362     tok->encoding = new_string("utf-8", 5);     /* resulting is in utf-8 */
 363     return 1;
 364 }
 365 
 366 /* Read a line of text from TOK into S, using the stream in TOK.
 367    Return NULL on failure, else S.
 368 
 369    On entry, tok->decoding_buffer will be one of:
 370      1) NULL: need to call tok->decoding_readline to get a new line
 371      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 372        stored the result in tok->decoding_buffer
 373      3) PyStringObject *: previous call to fp_readl did not have enough room
 374        (in the s buffer) to copy entire contents of the line read
 375        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 376        In this case, fp_readl is called in a loop (with an expanded buffer)
 377        until the buffer ends with a '\n' (or until the end of the file is
 378        reached): see tok_nextc and its calls to decoding_fgets.
 379 */
 380 
 381 static char *
 382 fp_readl(char *s, int size, struct tok_state *tok)
 383 {
 384 #ifndef Py_USING_UNICODE
  385     /* In a non-Unicode build, this should never be called. */
 386     Py_FatalError("fp_readl should not be called in this build.");
 387     return NULL; /* Keep compiler happy (not reachable) */
 388 #else
 389     PyObject* utf8 = NULL;
 390     PyObject* buf = tok->decoding_buffer;
 391     char *str;
 392     Py_ssize_t utf8len;
 393 
 394     /* Ask for one less byte so we can terminate it */
 395     assert(size > 0);
 396     size--;
 397 
 398     if (buf == NULL) {
 399         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 400         if (buf == NULL)
 401             return error_ret(tok);
 402     } else {
 403         tok->decoding_buffer = NULL;
 404         if (PyString_CheckExact(buf))
 405             utf8 = buf;
 406     }
 407     if (utf8 == NULL) {
 408         utf8 = PyUnicode_AsUTF8String(buf);
 409         Py_DECREF(buf);
 410         if (utf8 == NULL)
 411             return error_ret(tok);
 412     }
 413     str = PyString_AsString(utf8);
 414     utf8len = PyString_GET_SIZE(utf8);
 415     if (utf8len > size) {
 416         tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 417         if (tok->decoding_buffer == NULL) {
 418             Py_DECREF(utf8);
 419             return error_ret(tok);
 420         }
 421         utf8len = size;
 422     }
 423     memcpy(s, str, utf8len);
 424     s[utf8len] = '\0';
 425     Py_DECREF(utf8);
 426     if (utf8len == 0)
 427         return NULL; /* EOF */
 428     return s;
 429 #endif
 430 }
 431 
 432 /* Set the readline function for TOK to a StreamReader's
 433    readline function. The StreamReader is named ENC.
 434 
 435    This function is called from check_bom and check_coding_spec.
 436 
 437    ENC is usually identical to the future value of tok->encoding,
 438    except for the (currently unsupported) case of UTF-16.
 439 
 440    Return 1 on success, 0 on failure. */
 441 
 442 static int
 443 fp_setreadl(struct tok_state *tok, const char* enc)
 444 {
 445     PyObject *reader, *stream, *readline;
 446 
 447     /* XXX: constify filename argument. */
 448     stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 449     if (stream == NULL)
 450         return 0;
 451 
 452     reader = PyCodec_StreamReader(enc, stream, NULL);
 453     Py_DECREF(stream);
 454     if (reader == NULL)
 455         return 0;
 456 
 457     readline = PyObject_GetAttrString(reader, "readline");
 458     Py_DECREF(reader);
 459     if (readline == NULL)
 460         return 0;
 461 
 462     tok->decoding_readline = readline;
 463     return 1;
 464 }
 465 
 466 /* Fetch the next byte from TOK. */
 467 
 468 static int fp_getc(struct tok_state *tok) {
 469     return getc(tok->fp);
 470 }
 471 
 472 /* Unfetch the last byte back into TOK.  */
 473 
 474 static void fp_ungetc(int c, struct tok_state *tok) {
 475     ungetc(c, tok->fp);
 476 }
 477 
 478 /* Read a line of input from TOK. Determine encoding
 479    if necessary.  */
 480 
 481 static char *
 482 decoding_fgets(char *s, int size, struct tok_state *tok)
 483 {
 484     char *line = NULL;
 485     int badchar = 0;
 486     for (;;) {
 487         if (tok->decoding_state < 0) {
 488             /* We already have a codec associated with
 489                this input. */
 490             line = fp_readl(s, size, tok);
 491             break;
 492         } else if (tok->decoding_state > 0) {
 493             /* We want a 'raw' read. */
 494             line = Py_UniversalNewlineFgets(s, size,
 495                                             tok->fp, NULL);
 496             break;
 497         } else {
 498             /* We have not yet determined the encoding.
 499                If an encoding is found, use the file-pointer
 500                reader functions from now on. */
 501             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 502                 return error_ret(tok);
 503             assert(tok->decoding_state != 0);
 504         }
 505     }
 506     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 507         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 508             return error_ret(tok);
 509         }
 510     }
 511 #ifndef PGEN
 512     /* The default encoding is ASCII, so make sure we don't have any
 513        non-ASCII bytes in it. */
 514     if (line && !tok->encoding) {
 515         unsigned char *c;
 516         for (c = (unsigned char *)line; *c; c++)
 517             if (*c > 127) {
 518                 badchar = *c;
 519                 break;
 520             }
 521     }
 522     if (badchar) {
 523         char buf[500];
 524         /* Need to add 1 to the line number, since this line
 525            has not been counted, yet.  */
 526         sprintf(buf,
 527             "Non-ASCII character '\\x%.2x' "
 528             "in file %.200s on line %i, "
 529             "but no encoding declared; "
 530             "see http://www.python.org/peps/pep-0263.html for details",
 531             badchar, tok->filename, tok->lineno + 1);
 532         PyErr_SetString(PyExc_SyntaxError, buf);
 533         return error_ret(tok);
 534     }
 535 #endif
 536     return line;
 537 }
 538 
 539 static int
 540 decoding_feof(struct tok_state *tok)
 541 {
 542     if (tok->decoding_state >= 0) {
 543         return feof(tok->fp);
 544     } else {
 545         PyObject* buf = tok->decoding_buffer;
 546         if (buf == NULL) {
 547             buf = PyObject_CallObject(tok->decoding_readline, NULL);
 548             if (buf == NULL) {
 549                 error_ret(tok);
 550                 return 1;
 551             } else {
 552                 tok->decoding_buffer = buf;
 553             }
 554         }
 555         return PyObject_Length(buf) == 0;
 556     }
 557 }
 558 
 559 /* Fetch a byte from TOK, using the string buffer. */
 560 
 561 static int
 562 buf_getc(struct tok_state *tok) {
 563     return Py_CHARMASK(*tok->str++);
 564 }
 565 
 566 /* Unfetch a byte from TOK, using the string buffer. */
 567 
 568 static void
 569 buf_ungetc(int c, struct tok_state *tok) {
 570     tok->str--;
 571     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
 572 }
 573 
 574 /* Set the readline function for TOK to ENC. For the string-based
 575    tokenizer, this means to just record the encoding. */
 576 
 577 static int
 578 buf_setreadl(struct tok_state *tok, const char* enc) {
 579     tok->enc = enc;
 580     return 1;
 581 }
 582 
 583 /* Return a UTF-8 encoding Python string object from the
 584    C byte string STR, which is encoded with ENC. */
 585 
 586 #ifdef Py_USING_UNICODE
 587 static PyObject *
 588 translate_into_utf8(const char* str, const char* enc) {
 589     PyObject *utf8;
 590     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 591     if (buf == NULL)
 592         return NULL;
 593     utf8 = PyUnicode_AsUTF8String(buf);
 594     Py_DECREF(buf);
 595     return utf8;
 596 }
 597 #endif
 598 
 599 
 600 static char *
 601 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
 602     int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
 603     char *buf, *current;
 604     char c = '\0';
 605     buf = PyMem_MALLOC(needed_length);
 606     if (buf == NULL) {
 607         tok->done = E_NOMEM;
 608         return NULL;
 609     }
 610     for (current = buf; *s; s++, current++) {
 611         c = *s;
 612         if (skip_next_lf) {
 613             skip_next_lf = 0;
 614             if (c == '\n') {
 615                 c = *++s;
 616                 if (!c)
 617                     break;
 618             }
 619         }
 620         if (c == '\r') {
 621             skip_next_lf = 1;
 622             c = '\n';
 623         }
 624         *current = c;
 625     }
 626     /* If this is exec input, add a newline to the end of the string if
 627        there isn't one already. */
 628     if (exec_input && c != '\n') {
 629         *current = '\n';
 630         current++;
 631     }
 632     *current = '\0';
 633     final_length = current - buf + 1;
 634     if (final_length < needed_length && final_length)
 635         /* should never fail */
 636         buf = PyMem_REALLOC(buf, final_length);
 637     return buf;
 638 }
 639 
 640 /* Decode a byte string STR for use as the buffer of TOK.
 641    Look for encoding declarations inside STR, and record them
 642    inside TOK.  */
 643 
 644 static const char *
 645 decode_str(const char *input, int single, struct tok_state *tok)
 646 {
 647     PyObject* utf8 = NULL;
 648     const char *str;
 649     const char *s;
 650     const char *newl[2] = {NULL, NULL};
 651     int lineno = 0;
 652     tok->input = str = translate_newlines(input, single, tok);
 653     if (str == NULL)
 654         return NULL;
 655     tok->enc = NULL;
 656     tok->str = str;
 657     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 658         return error_ret(tok);
 659     str = tok->str;             /* string after BOM if any */
 660     assert(str);
 661 #ifdef Py_USING_UNICODE
 662     if (tok->enc != NULL) {
 663         utf8 = translate_into_utf8(str, tok->enc);
 664         if (utf8 == NULL)
 665             return error_ret(tok);
 666         str = PyString_AsString(utf8);
 667     }
 668 #endif
 669     for (s = str;; s++) {
 670         if (*s == '\0') break;
 671         else if (*s == '\n') {
 672             assert(lineno < 2);
 673             newl[lineno] = s;
 674             lineno++;
 675             if (lineno == 2) break;
 676         }
 677     }
 678     tok->enc = NULL;
 679     /* need to check line 1 and 2 separately since check_coding_spec
 680        assumes a single line as input */
 681     if (newl[0]) {
 682         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 683             return error_ret(tok);
 684         if (tok->enc == NULL && newl[1]) {
 685             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 686                                    tok, buf_setreadl))
 687                 return error_ret(tok);
 688         }
 689     }
 690 #ifdef Py_USING_UNICODE
 691     if (tok->enc != NULL) {
 692         assert(utf8 == NULL);
 693         utf8 = translate_into_utf8(str, tok->enc);
 694         if (utf8 == NULL)
 695             return error_ret(tok);
 696         str = PyString_AsString(utf8);
 697     }
 698 #endif
 699     assert(tok->decoding_buffer == NULL);
 700     tok->decoding_buffer = utf8; /* CAUTION */
 701     return str;
 702 }
 703 
 704 #endif /* PGEN */
 705 
 706 /* Set up tokenizer for string */
 707 
 708 struct tok_state *
 709 PyTokenizer_FromString(const char *str, int exec_input)
 710 {
 711     struct tok_state *tok = tok_new();
 712     if (tok == NULL)
 713         return NULL;
 714     str = (char *)decode_str(str, exec_input, tok);
 715     if (str == NULL) {
 716         PyTokenizer_Free(tok);
 717         return NULL;
 718     }
 719 
 720     /* XXX: constify members. */
 721     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 722     return tok;
 723 }
 724 
 725 
 726 /* Set up tokenizer for file */
 727 
 728 struct tok_state *
 729 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 730 {
 731     struct tok_state *tok = tok_new();
 732     if (tok == NULL)
 733         return NULL;
 734     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 735         PyTokenizer_Free(tok);
 736         return NULL;
 737     }
 738     tok->cur = tok->inp = tok->buf;
 739     tok->end = tok->buf + BUFSIZ;
 740     tok->fp = fp;
 741     tok->prompt = ps1;
 742     tok->nextprompt = ps2;
 743     return tok;
 744 }
 745 
 746 
 747 /* Free a tok_state structure */
 748 
 749 void
 750 PyTokenizer_Free(struct tok_state *tok)
 751 {
 752     if (tok->encoding != NULL)
 753         PyMem_FREE(tok->encoding);
 754 #ifndef PGEN
 755     Py_XDECREF(tok->decoding_readline);
 756     Py_XDECREF(tok->decoding_buffer);
 757 #endif
 758     if (tok->fp != NULL && tok->buf != NULL)
 759         PyMem_FREE(tok->buf);
 760     if (tok->input)
 761         PyMem_FREE((char *)tok->input);
 762     PyMem_FREE(tok);
 763 }
 764 
 765 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 766 static int
 767 tok_stdin_decode(struct tok_state *tok, char **inp)
 768 {
 769     PyObject *enc, *sysstdin, *decoded, *utf8;
 770     const char *encoding;
 771     char *converted;
 772 
 773     if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 774         return 0;
 775     sysstdin = PySys_GetObject("stdin");
 776     if (sysstdin == NULL || !PyFile_Check(sysstdin))
 777         return 0;
 778 
 779     enc = ((PyFileObject *)sysstdin)->f_encoding;
 780     if (enc == NULL || !PyString_Check(enc))
 781         return 0;
 782     Py_INCREF(enc);
 783 
 784     encoding = PyString_AsString(enc);
 785     decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 786     if (decoded == NULL)
 787         goto error_clear;
 788 
 789     utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 790     Py_DECREF(decoded);
 791     if (utf8 == NULL)
 792         goto error_clear;
 793 
 794     assert(PyString_Check(utf8));
 795     converted = new_string(PyString_AS_STRING(utf8),
 796                            PyString_GET_SIZE(utf8));
 797     Py_DECREF(utf8);
 798     if (converted == NULL)
 799         goto error_nomem;
 800 
 801     PyMem_FREE(*inp);
 802     *inp = converted;
 803     if (tok->encoding != NULL)
 804         PyMem_FREE(tok->encoding);
 805     tok->encoding = new_string(encoding, strlen(encoding));
 806     if (tok->encoding == NULL)
 807         goto error_nomem;
 808 
 809     Py_DECREF(enc);
 810     return 0;
 811 
 812 error_nomem:
 813     Py_DECREF(enc);
 814     tok->done = E_NOMEM;
 815     return -1;
 816 
 817 error_clear:
 818     Py_DECREF(enc);
 819     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
 820         tok->done = E_ERROR;
 821         return -1;
 822     }
 823     /* Fallback to iso-8859-1: for backward compatibility */
 824     PyErr_Clear();
 825     return 0;
 826 }
 827 #endif
 828 
 829 /* Get next char, updating state; error code goes into tok->done */
 830 
 831 static int
 832 tok_nextc(register struct tok_state *tok)
 833 {
 834     for (;;) {
 835         if (tok->cur != tok->inp) {
 836             return Py_CHARMASK(*tok->cur++); /* Fast path */
 837         }
 838         if (tok->done != E_OK)
 839             return EOF;
 840         if (tok->fp == NULL) {
 841             char *end = strchr(tok->inp, '\n');
 842             if (end != NULL)
 843                 end++;
 844             else {
 845                 end = strchr(tok->inp, '\0');
 846                 if (end == tok->inp) {
 847                     tok->done = E_EOF;
 848                     return EOF;
 849                 }
 850             }
 851             if (tok->start == NULL)
 852                 tok->buf = tok->cur;
 853             tok->line_start = tok->cur;
 854             tok->lineno++;
 855             tok->inp = end;
 856             return Py_CHARMASK(*tok->cur++);
 857         }
 858         if (tok->prompt != NULL) {
 859             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 860             if (tok->nextprompt != NULL)
 861                 tok->prompt = tok->nextprompt;
 862             if (newtok == NULL)
 863                 tok->done = E_INTR;
 864             else if (*newtok == '\0') {
 865                 PyMem_FREE(newtok);
 866                 tok->done = E_EOF;
 867             }
 868 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 869             else if (tok_stdin_decode(tok, &newtok) != 0)
 870                 PyMem_FREE(newtok);
 871 #endif
 872             else if (tok->start != NULL) {
 873                 size_t start = tok->start - tok->buf;
 874                 size_t oldlen = tok->cur - tok->buf;
 875                 size_t newlen = oldlen + strlen(newtok);
 876                 char *buf = tok->buf;
 877                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 878                 tok->lineno++;
 879                 if (buf == NULL) {
 880                     PyMem_FREE(tok->buf);
 881                     tok->buf = NULL;
 882                     PyMem_FREE(newtok);
 883                     tok->done = E_NOMEM;
 884                     return EOF;
 885                 }
 886                 tok->buf = buf;
 887                 tok->cur = tok->buf + oldlen;
 888                 tok->line_start = tok->cur;
 889                 strcpy(tok->buf + oldlen, newtok);
 890                 PyMem_FREE(newtok);
 891                 tok->inp = tok->buf + newlen;
 892                 tok->end = tok->inp + 1;
 893                 tok->start = tok->buf + start;
 894             }
 895             else {
 896                 tok->lineno++;
 897                 if (tok->buf != NULL)
 898                     PyMem_FREE(tok->buf);
 899                 tok->buf = newtok;
 900                 tok->line_start = tok->buf;
 901                 tok->cur = tok->buf;
 902                 tok->line_start = tok->buf;
 903                 tok->inp = strchr(tok->buf, '\0');
 904                 tok->end = tok->inp + 1;
 905             }
 906         }
 907         else {
 908             int done = 0;
 909             Py_ssize_t cur = 0;
 910             char *pt;
 911             if (tok->start == NULL) {
 912                 if (tok->buf == NULL) {
 913                     tok->buf = (char *)
 914                         PyMem_MALLOC(BUFSIZ);
 915                     if (tok->buf == NULL) {
 916                         tok->done = E_NOMEM;
 917                         return EOF;
 918                     }
 919                     tok->end = tok->buf + BUFSIZ;
 920                 }
 921                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 922                           tok) == NULL) {
 923                     tok->done = E_EOF;
 924                     done = 1;
 925                 }
 926                 else {
 927                     tok->done = E_OK;
 928                     tok->inp = strchr(tok->buf, '\0');
 929                     done = tok->inp[-1] == '\n';
 930                 }
 931             }
 932             else {
 933                 cur = tok->cur - tok->buf;
 934                 if (decoding_feof(tok)) {
 935                     tok->done = E_EOF;
 936                     done = 1;
 937                 }
 938                 else
 939                     tok->done = E_OK;
 940             }
 941             tok->lineno++;
 942             /* Read until '\n' or EOF */
 943             while (!done) {
 944                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 945                           tok->start - tok->buf;
 946                 Py_ssize_t curvalid = tok->inp - tok->buf;
 947                 Py_ssize_t newsize = curvalid + BUFSIZ;
 948                 char *newbuf = tok->buf;
 949                 newbuf = (char *)PyMem_REALLOC(newbuf,
 950                                                newsize);
 951                 if (newbuf == NULL) {
 952                     tok->done = E_NOMEM;
 953                     tok->cur = tok->inp;
 954                     return EOF;
 955                 }
Potential leak of memory pointed to by 'newbuf'
(emitted by clang-analyzer)

Note: a detailed trace for this finding is available in the analyzer's data model but is not yet rendered in this report.
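
clang-analyzer reports a potential leak at a point where, on some path, the last live pointer to an allocated block appears to go out of scope. In this loop the block returned by PyMem_REALLOC (lines 949-950) is stored into tok->buf right after the NULL check (line 956), and on the failure path the old buffer is still reachable through tok->buf and is released later by PyTokenizer_Free (lines 758-759), so the finding may well be a false positive; without the rendered trace, the exact path the checker has in mind cannot be confirmed here. For reference, a minimal sketch of the temporary-pointer realloc idiom that such checkers generally accept is shown below; it is a generic illustration in plain C with a hypothetical function name, not a proposed patch to tokenizer.c.

    #include <stdlib.h>

    /* Generic illustration only: keep realloc's result in a temporary so
     * that neither the old nor the new block can be lost on any path.
     * 'grow_buffer' and its parameters are hypothetical names. */
    static char *
    grow_buffer(char *buf, size_t newsize)
    {
        char *newbuf = realloc(buf, newsize);
        if (newbuf == NULL)
            return NULL;    /* failure: 'buf' is untouched and still owned
                               by the caller, so nothing leaks here */
        return newbuf;      /* success: realloc has already disposed of the
                               old block; hand back the new one */
    }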

  956                 tok->buf = newbuf;
  957                 tok->inp = tok->buf + curvalid;
  958                 tok->end = tok->buf + newsize;
  959                 tok->start = curstart < 0 ? NULL :
  960                              tok->buf + curstart;
  961                 if (decoding_fgets(tok->inp,
  962                                (int)(tok->end - tok->inp),
  963                                tok) == NULL) {
  964                     /* Break out early on decoding
  965                        errors, as tok->buf will be NULL
  966                     */
  967                     if (tok->decoding_erred)
  968                         return EOF;
  969                     /* Last line does not end in \n,
  970                        fake one */
  971                     strcpy(tok->inp, "\n");
  972                 }
  973                 tok->inp = strchr(tok->inp, '\0');
  974                 done = tok->inp[-1] == '\n';
  975             }
  976             if (tok->buf != NULL) {
  977                 tok->cur = tok->buf + cur;
  978                 tok->line_start = tok->cur;
  979                 /* replace "\r\n" with "\n" */
  980                 /* For Mac leave the \r, giving a syntax error */
  981                 pt = tok->inp - 2;
  982                 if (pt >= tok->buf && *pt == '\r') {
  983                     *pt++ = '\n';
  984                     *pt = '\0';
  985                     tok->inp = pt;
  986                 }
  987             }
  988         }
  989         if (tok->done != E_OK) {
  990             if (tok->prompt != NULL)
  991                 PySys_WriteStderr("\n");
  992             tok->cur = tok->inp;
  993             return EOF;
  994         }
  995     }
  996     /*NOTREACHED*/
  997 }
  998 
  999 
 1000 /* Back-up one character */
 1001 
 1002 static void
 1003 tok_backup(register struct tok_state *tok, register int c)
 1004 {
 1005     if (c != EOF) {
 1006         if (--tok->cur < tok->buf)
 1007             Py_FatalError("tok_backup: beginning of buffer");
 1008         if (*tok->cur != c)
 1009             *tok->cur = c;
 1010     }
 1011 }
 1012 
 1013 
 1014 /* Return the token corresponding to a single character */
 1015 
 1016 int
 1017 PyToken_OneChar(int c)
 1018 {
 1019     switch (c) {
 1020     case '(': return LPAR;
 1021     case ')': return RPAR;
 1022     case '[': return LSQB;
 1023     case ']': return RSQB;
 1024     case ':': return COLON;
 1025     case ',': return COMMA;
 1026     case ';': return SEMI;
 1027     case '+': return PLUS;
 1028     case '-': return MINUS;
 1029     case '*': return STAR;
 1030     case '/': return SLASH;
 1031     case '|': return VBAR;
 1032     case '&': return AMPER;
 1033     case '<': return LESS;
 1034     case '>': return GREATER;
 1035     case '=': return EQUAL;
 1036     case '.': return DOT;
 1037     case '%': return PERCENT;
 1038     case '`': return BACKQUOTE;
 1039     case '{': return LBRACE;
 1040     case '}': return RBRACE;
 1041     case '^': return CIRCUMFLEX;
 1042     case '~': return TILDE;
 1043     case '@': return AT;
 1044     default: return OP;
 1045     }
 1046 }
 1047 
 1048 
 1049 int
 1050 PyToken_TwoChars(int c1, int c2)
 1051 {
 1052     switch (c1) {
 1053     case '=':
 1054         switch (c2) {
 1055         case '=': return EQEQUAL;
 1056         }
 1057         break;
 1058     case '!':
 1059         switch (c2) {
 1060         case '=': return NOTEQUAL;
 1061         }
 1062         break;
 1063     case '<':
 1064         switch (c2) {
 1065         case '>': return NOTEQUAL;
 1066         case '=': return LESSEQUAL;
 1067         case '<': return LEFTSHIFT;
 1068         }
 1069         break;
 1070     case '>':
 1071         switch (c2) {
 1072         case '=': return GREATEREQUAL;
 1073         case '>': return RIGHTSHIFT;
 1074         }
 1075         break;
 1076     case '+':
 1077         switch (c2) {
 1078         case '=': return PLUSEQUAL;
 1079         }
 1080         break;
 1081     case '-':
 1082         switch (c2) {
 1083         case '=': return MINEQUAL;
 1084         }
 1085         break;
 1086     case '*':
 1087         switch (c2) {
 1088         case '*': return DOUBLESTAR;
 1089         case '=': return STAREQUAL;
 1090         }
 1091         break;
 1092     case '/':
 1093         switch (c2) {
 1094         case '/': return DOUBLESLASH;
 1095         case '=': return SLASHEQUAL;
 1096         }
 1097         break;
 1098     case '|':
 1099         switch (c2) {
 1100         case '=': return VBAREQUAL;
 1101         }
 1102         break;
 1103     case '%':
 1104         switch (c2) {
 1105         case '=': return PERCENTEQUAL;
 1106         }
 1107         break;
 1108     case '&':
 1109         switch (c2) {
 1110         case '=': return AMPEREQUAL;
 1111         }
 1112         break;
 1113     case '^':
 1114         switch (c2) {
 1115         case '=': return CIRCUMFLEXEQUAL;
 1116         }
 1117         break;
 1118     }
 1119     return OP;
 1120 }
 1121 
 1122 int
 1123 PyToken_ThreeChars(int c1, int c2, int c3)
 1124 {
 1125     switch (c1) {
 1126     case '<':
 1127         switch (c2) {
 1128         case '<':
 1129             switch (c3) {
 1130             case '=':
 1131                 return LEFTSHIFTEQUAL;
 1132             }
 1133             break;
 1134         }
 1135         break;
 1136     case '>':
 1137         switch (c2) {
 1138         case '>':
 1139             switch (c3) {
 1140             case '=':
 1141                 return RIGHTSHIFTEQUAL;
 1142             }
 1143             break;
 1144         }
 1145         break;
 1146     case '*':
 1147         switch (c2) {
 1148         case '*':
 1149             switch (c3) {
 1150             case '=':
 1151                 return DOUBLESTAREQUAL;
 1152             }
 1153             break;
 1154         }
 1155         break;
 1156     case '/':
 1157         switch (c2) {
 1158         case '/':
 1159             switch (c3) {
 1160             case '=':
 1161                 return DOUBLESLASHEQUAL;
 1162             }
 1163             break;
 1164         }
 1165         break;
 1166     }
 1167     return OP;
 1168 }
 1169 
 1170 static int
 1171 indenterror(struct tok_state *tok)
 1172 {
 1173     if (tok->alterror) {
 1174         tok->done = E_TABSPACE;
 1175         tok->cur = tok->inp;
 1176         return 1;
 1177     }
 1178     if (tok->altwarning) {
 1179         PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
 1180                           "in indentation\n", tok->filename);
 1181         tok->altwarning = 0;
 1182     }
 1183     return 0;
 1184 }
 1185 
 1186 /* Get next token, after space stripping etc. */
 1187 
 1188 static int
 1189 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 1190 {
 1191     register int c;
 1192     int blankline;
 1193 
 1194     *p_start = *p_end = NULL;
 1195   nextline:
 1196     tok->start = NULL;
 1197     blankline = 0;
 1198 
 1199     /* Get indentation level */
 1200     if (tok->atbol) {
 1201         register int col = 0;
 1202         register int altcol = 0;
 1203         tok->atbol = 0;
 1204         for (;;) {
 1205             c = tok_nextc(tok);
 1206             if (c == ' ')
 1207                 col++, altcol++;
 1208             else if (c == '\t') {
 1209                 col = (col/tok->tabsize + 1) * tok->tabsize;
 1210                 altcol = (altcol/tok->alttabsize + 1)
 1211                     * tok->alttabsize;
 1212             }
 1213             else if (c == '\014') /* Control-L (formfeed) */
 1214                 col = altcol = 0; /* For Emacs users */
 1215             else
 1216                 break;
 1217         }
 1218         tok_backup(tok, c);
 1219         if (c == '#' || c == '\n') {
 1220             /* Lines with only whitespace and/or comments
 1221                shouldn't affect the indentation and are
 1222                not passed to the parser as NEWLINE tokens,
 1223                except *totally* empty lines in interactive
 1224                mode, which signal the end of a command group. */
 1225             if (col == 0 && c == '\n' && tok->prompt != NULL)
 1226                 blankline = 0; /* Let it through */
 1227             else
 1228                 blankline = 1; /* Ignore completely */
 1229             /* We can't jump back right here since we still
 1230                may need to skip to the end of a comment */
 1231         }
 1232         if (!blankline && tok->level == 0) {
 1233             if (col == tok->indstack[tok->indent]) {
 1234                 /* No change */
 1235                 if (altcol != tok->altindstack[tok->indent]) {
 1236                     if (indenterror(tok))
 1237                         return ERRORTOKEN;
 1238                 }
 1239             }
 1240             else if (col > tok->indstack[tok->indent]) {
 1241                 /* Indent -- always one */
 1242                 if (tok->indent+1 >= MAXINDENT) {
 1243                     tok->done = E_TOODEEP;
 1244                     tok->cur = tok->inp;
 1245                     return ERRORTOKEN;
 1246                 }
 1247                 if (altcol <= tok->altindstack[tok->indent]) {
 1248                     if (indenterror(tok))
 1249                         return ERRORTOKEN;
 1250                 }
 1251                 tok->pendin++;
 1252                 tok->indstack[++tok->indent] = col;
 1253                 tok->altindstack[tok->indent] = altcol;
 1254             }
 1255             else /* col < tok->indstack[tok->indent] */ {
 1256                 /* Dedent -- any number, must be consistent */
 1257                 while (tok->indent > 0 &&
 1258                     col < tok->indstack[tok->indent]) {
 1259                     tok->pendin--;
 1260                     tok->indent--;
 1261                 }
 1262                 if (col != tok->indstack[tok->indent]) {
 1263                     tok->done = E_DEDENT;
 1264                     tok->cur = tok->inp;
 1265                     return ERRORTOKEN;
 1266                 }
 1267                 if (altcol != tok->altindstack[tok->indent]) {
 1268                     if (indenterror(tok))
 1269                         return ERRORTOKEN;
 1270                 }
 1271             }
 1272         }
 1273     }
 1274 
 1275     tok->start = tok->cur;
 1276 
 1277     /* Return pending indents/dedents */
 1278     if (tok->pendin != 0) {
 1279         if (tok->pendin < 0) {
 1280             tok->pendin++;
 1281             return DEDENT;
 1282         }
 1283         else {
 1284             tok->pendin--;
 1285             return INDENT;
 1286         }
 1287     }
 1288 
 1289   again:
 1290     tok->start = NULL;
 1291     /* Skip spaces */
 1292     do {
 1293         c = tok_nextc(tok);
 1294     } while (c == ' ' || c == '\t' || c == '\014');
 1295 
 1296     /* Set start of current token */
 1297     tok->start = tok->cur - 1;
 1298 
 1299     /* Skip comment, while looking for tab-setting magic */
 1300     if (c == '#') {
 1301         static char *tabforms[] = {
 1302             "tab-width:", /* Emacs */
 1303             ":tabstop=", /* vim, full form */
 1304             ":ts=", /* vim, abbreviated form */
 1305             "set tabsize=", /* will vi never die? */
 1306             /* more templates can be added here to support other editors */
 1307         };
 1308         char cbuf[80];
 1309         char *tp, **cp;
 1310         tp = cbuf;
 1311         do {
 1312             *tp++ = c = tok_nextc(tok);
 1313         } while (c != EOF && c != '\n' &&
 1314                  (size_t)(tp - cbuf + 1) < sizeof(cbuf));
 1315         *tp = '\0';
 1316         for (cp = tabforms;
 1317              cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
 1318              cp++) {
 1319             if ((tp = strstr(cbuf, *cp))) {
 1320                 int newsize = atoi(tp + strlen(*cp));
 1321 
 1322                 if (newsize >= 1 && newsize <= 40) {
 1323                     tok->tabsize = newsize;
 1324                     if (Py_VerboseFlag)
 1325                         PySys_WriteStderr(
 1326                             "Tab size set to %d\n",
 1327                             newsize);
 1328                 }
 1329             }
 1330         }
 1331         while (c != EOF && c != '\n')
 1332             c = tok_nextc(tok);
 1333     }
 1334 
 1335     /* Check for EOF and errors now */
 1336     if (c == EOF) {
 1337         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
 1338     }
 1339 
 1340     /* Identifier (most frequent token!) */
 1341     if (Py_ISALPHA(c) || c == '_') {
 1342         /* Process r"", u"" and ur"" */
 1343         switch (c) {
 1344         case 'b':
 1345         case 'B':
 1346             c = tok_nextc(tok);
 1347             if (c == 'r' || c == 'R')
 1348                 c = tok_nextc(tok);
 1349             if (c == '"' || c == '\'')
 1350                 goto letter_quote;
 1351             break;
 1352         case 'r':
 1353         case 'R':
 1354             c = tok_nextc(tok);
 1355             if (c == '"' || c == '\'')
 1356                 goto letter_quote;
 1357             break;
 1358         case 'u':
 1359         case 'U':
 1360             c = tok_nextc(tok);
 1361             if (c == 'r' || c == 'R')
 1362                 c = tok_nextc(tok);
 1363             if (c == '"' || c == '\'')
 1364                 goto letter_quote;
 1365             break;
 1366         }
 1367         while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
 1368             c = tok_nextc(tok);
 1369         }
 1370         tok_backup(tok, c);
 1371         *p_start = tok->start;
 1372         *p_end = tok->cur;
 1373         return NAME;
 1374     }
 1375 
 1376     /* Newline */
 1377     if (c == '\n') {
 1378         tok->atbol = 1;
 1379         if (blankline || tok->level > 0)
 1380             goto nextline;
 1381         *p_start = tok->start;
 1382         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
 1383         tok->cont_line = 0;
 1384         return NEWLINE;
 1385     }
 1386 
 1387     /* Period or number starting with period? */
 1388     if (c == '.') {
 1389         c = tok_nextc(tok);
 1390         if (isdigit(c)) {
 1391             goto fraction;
 1392         }
 1393         else {
 1394             tok_backup(tok, c);
 1395             *p_start = tok->start;
 1396             *p_end = tok->cur;
 1397             return DOT;
 1398         }
 1399     }
 1400 
 1401     /* Number */
 1402     if (isdigit(c)) {
 1403         if (c == '0') {
 1404             /* Hex, octal or binary -- maybe. */
 1405             c = tok_nextc(tok);
 1406             if (c == '.')
 1407                 goto fraction;
 1408 #ifndef WITHOUT_COMPLEX
 1409             if (c == 'j' || c == 'J')
 1410                 goto imaginary;
 1411 #endif
 1412             if (c == 'x' || c == 'X') {
 1413 
 1414                 /* Hex */
 1415                 c = tok_nextc(tok);
 1416                 if (!isxdigit(c)) {
 1417                     tok->done = E_TOKEN;
 1418                     tok_backup(tok, c);
 1419                     return ERRORTOKEN;
 1420                 }
 1421                 do {
 1422                     c = tok_nextc(tok);
 1423                 } while (isxdigit(c));
 1424             }
 1425             else if (c == 'o' || c == 'O') {
 1426                 /* Octal */
 1427                 c = tok_nextc(tok);
 1428                 if (c < '0' || c >= '8') {
 1429                     tok->done = E_TOKEN;
 1430                     tok_backup(tok, c);
 1431                     return ERRORTOKEN;
 1432                 }
 1433                 do {
 1434                     c = tok_nextc(tok);
 1435                 } while ('0' <= c && c < '8');
 1436             }
 1437             else if (c == 'b' || c == 'B') {
 1438                 /* Binary */
 1439                 c = tok_nextc(tok);
 1440                 if (c != '0' && c != '1') {
 1441                     tok->done = E_TOKEN;
 1442                     tok_backup(tok, c);
 1443                     return ERRORTOKEN;
 1444                 }
 1445                 do {
 1446                     c = tok_nextc(tok);
 1447                 } while (c == '0' || c == '1');
 1448             }
 1449             else {
 1450                 int found_decimal = 0;
 1451                 /* Octal; c is first char of it */
 1452                 /* There's no 'isoctdigit' macro, sigh */
 1453                 while ('0' <= c && c < '8') {
 1454                     c = tok_nextc(tok);
 1455                 }
 1456                 if (isdigit(c)) {
 1457                     found_decimal = 1;
 1458                     do {
 1459                         c = tok_nextc(tok);
 1460                     } while (isdigit(c));
 1461                 }
 1462                 if (c == '.')
 1463                     goto fraction;
 1464                 else if (c == 'e' || c == 'E')
 1465                     goto exponent;
 1466 #ifndef WITHOUT_COMPLEX
 1467                 else if (c == 'j' || c == 'J')
 1468                     goto imaginary;
 1469 #endif
 1470                 else if (found_decimal) {
 1471                     tok->done = E_TOKEN;
 1472                     tok_backup(tok, c);
 1473                     return ERRORTOKEN;
 1474                 }
 1475             }
 1476             if (c == 'l' || c == 'L')
 1477                 c = tok_nextc(tok);
 1478         }
 1479         else {
 1480             /* Decimal */
 1481             do {
 1482                 c = tok_nextc(tok);
 1483             } while (isdigit(c));
 1484             if (c == 'l' || c == 'L')
 1485                 c = tok_nextc(tok);
 1486             else {
 1487                 /* Accept floating point numbers. */
 1488                 if (c == '.') {
 1489         fraction:
 1490                     /* Fraction */
 1491                     do {
 1492                         c = tok_nextc(tok);
 1493                     } while (isdigit(c));
 1494                 }
 1495                 if (c == 'e' || c == 'E') {
 1496         exponent:
 1497                     /* Exponent part */
 1498                     c = tok_nextc(tok);
 1499                     if (c == '+' || c == '-')
 1500                         c = tok_nextc(tok);
 1501                     if (!isdigit(c)) {
 1502                         tok->done = E_TOKEN;
 1503                         tok_backup(tok, c);
 1504                         return ERRORTOKEN;
 1505                     }
 1506                     do {
 1507                         c = tok_nextc(tok);
 1508                     } while (isdigit(c));
 1509                 }
 1510 #ifndef WITHOUT_COMPLEX
 1511                 if (c == 'j' || c == 'J')
 1512                     /* Imaginary part */
 1513         imaginary:
 1514                     c = tok_nextc(tok);
 1515 #endif
 1516             }
 1517         }
 1518         tok_backup(tok, c);
 1519         *p_start = tok->start;
 1520         *p_end = tok->cur;
 1521         return NUMBER;
 1522     }
 1523 
 1524   letter_quote:
 1525     /* String */
 1526     if (c == '\'' || c == '"') {
 1527         Py_ssize_t quote2 = tok->cur - tok->start + 1;
 1528         int quote = c;
 1529         int triple = 0;
 1530         int tripcount = 0;
 1531         for (;;) {
 1532             c = tok_nextc(tok);
 1533             if (c == '\n') {
 1534                 if (!triple) {
 1535                     tok->done = E_EOLS;
 1536                     tok_backup(tok, c);
 1537                     return ERRORTOKEN;
 1538                 }
 1539                 tripcount = 0;
 1540                 tok->cont_line = 1; /* multiline string. */
 1541             }
 1542             else if (c == EOF) {
 1543                 if (triple)
 1544                     tok->done = E_EOFS;
 1545                 else
 1546                     tok->done = E_EOLS;
 1547                 tok->cur = tok->inp;
 1548                 return ERRORTOKEN;
 1549             }
 1550             else if (c == quote) {
 1551                 tripcount++;
 1552                 if (tok->cur - tok->start == quote2) {
 1553                     c = tok_nextc(tok);
 1554                     if (c == quote) {
 1555                         triple = 1;
 1556                         tripcount = 0;
 1557                         continue;
 1558                     }
 1559                     tok_backup(tok, c);
 1560                 }
 1561                 if (!triple || tripcount == 3)
 1562                     break;
 1563             }
 1564             else if (c == '\\') {
 1565                 tripcount = 0;
 1566                 c = tok_nextc(tok);
 1567                 if (c == EOF) {
 1568                     tok->done = E_EOLS;
 1569                     tok->cur = tok->inp;
 1570                     return ERRORTOKEN;
 1571                 }
 1572             }
 1573             else
 1574                 tripcount = 0;
 1575         }
 1576         *p_start = tok->start;
 1577         *p_end = tok->cur;
 1578         return STRING;
 1579     }
 1580 
 1581     /* Line continuation */
 1582     if (c == '\\') {
 1583         c = tok_nextc(tok);
 1584         if (c != '\n') {
 1585             tok->done = E_LINECONT;
 1586             tok->cur = tok->inp;
 1587             return ERRORTOKEN;
 1588         }
 1589         tok->cont_line = 1;
 1590         goto again; /* Read next line */
 1591     }
 1592 
 1593     /* Check for two-character token */
 1594     {
 1595         int c2 = tok_nextc(tok);
 1596         int token = PyToken_TwoChars(c, c2);
 1597 #ifndef PGEN
 1598         if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
 1599             if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
 1600                                    "<> not supported in 3.x; use !=",
 1601                                    tok->filename, tok->lineno,
 1602                                    NULL, NULL)) {
 1603                 return ERRORTOKEN;
 1604             }
 1605         }
 1606 #endif
 1607         if (token != OP) {
 1608             int c3 = tok_nextc(tok);
 1609             int token3 = PyToken_ThreeChars(c, c2, c3);
 1610             if (token3 != OP) {
 1611                 token = token3;
 1612             } else {
 1613                 tok_backup(tok, c3);
 1614             }
 1615             *p_start = tok->start;
 1616             *p_end = tok->cur;
 1617             return token;
 1618         }
 1619         tok_backup(tok, c2);
 1620     }
 1621 
 1622     /* Keep track of parentheses nesting level */
 1623     switch (c) {
 1624     case '(':
 1625     case '[':
 1626     case '{':
 1627         tok->level++;
 1628         break;
 1629     case ')':
 1630     case ']':
 1631     case '}':
 1632         tok->level--;
 1633         break;
 1634     }
 1635 
 1636     /* Punctuation character */
 1637     *p_start = tok->start;
 1638     *p_end = tok->cur;
 1639     return PyToken_OneChar(c);
 1640 }
 1641 
 1642 int
 1643 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 1644 {
 1645     int result = tok_get(tok, p_start, p_end);
 1646     if (tok->decoding_erred) {
 1647         result = ERRORTOKEN;
 1648         tok->done = E_DECODE;
 1649     }
 1650     return result;
 1651 }
 1652 
 1653 /* This function is only called from parsetok. However, it cannot live
 1654    there, as it must be empty for PGEN, and we can check for PGEN only
 1655    in this file. */
 1656 
 1657 #if defined(PGEN) || !defined(Py_USING_UNICODE)
 1658 char*
 1659 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
 1660 {
 1661     return NULL;
 1662 }
 1663 #else
 1664 #ifdef Py_USING_UNICODE
 1665 static PyObject *
 1666 dec_utf8(const char *enc, const char *text, size_t len) {
 1667     PyObject *ret = NULL;
 1668     PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
 1669     if (unicode_text) {
 1670         ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
 1671         Py_DECREF(unicode_text);
 1672     }
 1673     if (!ret) {
 1674         PyErr_Clear();
 1675     }
 1676     return ret;
 1677 }
 1678 char *
 1679 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
 1680 {
 1681     char *text = NULL;
 1682     if (tok->encoding) {
 1683         /* convert source to original encondig */
 1684         PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
 1685         if (lineobj != NULL) {
 1686             int linelen = PyString_Size(lineobj);
 1687             const char *line = PyString_AsString(lineobj);
 1688             text = PyObject_MALLOC(linelen + 1);
 1689             if (text != NULL && line != NULL) {
 1690                 if (linelen)
 1691                     strncpy(text, line, linelen);
 1692                 text[linelen] = '\0';
 1693             }
 1694             Py_DECREF(lineobj);
 1695 
 1696             /* adjust error offset */
 1697             if (*offset > 1) {
 1698                 PyObject *offsetobj = dec_utf8(tok->encoding,
 1699                                                tok->buf, *offset-1);
 1700                 if (offsetobj) {
 1701                     *offset = PyString_Size(offsetobj) + 1;
 1702                     Py_DECREF(offsetobj);
 1703                 }
 1704             }
 1705 
 1706         }
 1707     }
 1708     return text;
 1709 
 1710 }
 1711 #endif /* defined(Py_USING_UNICODE) */
 1712 #endif
 1713 
 1714 
 1715 #ifdef Py_DEBUG
 1716 
 1717 void
 1718 tok_dump(int type, char *start, char *end)
 1719 {
 1720     printf("%s", _PyParser_TokenNames[type]);
 1721     if (type == NAME || type == NUMBER || type == STRING || type == OP)
 1722         printf("(%.*s)", (int)(end - start), start);
 1723 }
 1724 
 1725 #endif