Location | Tool | Test ID | Function | Issue |
---|---|---|---|---|
/builddir/build/BUILD/Python-2.7.3/Parser/tokenizer.c:955:21 | clang-analyzer | | tok_nextc | Potential leak of memory pointed to by 'newbuf' |
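The flagged location falls inside tok_nextc(), in the loop that grows the line buffer with PyMem_REALLOC when a source line does not fit (source lines 943-975 in the listing below); the pointer named in the warning, newbuf, is the value returned by that reallocation. For reference, the usual ownership pattern for such a growable buffer, where the owning pointer is overwritten only after the reallocation succeeds and the block is released exactly once on every exit path, is sketched below in plain C. The sketch is illustrative only: the helper grow_buffer() and its caller are invented for this note and are not part of tokenizer.c.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Grow the caller's buffer by `extra` bytes.  On success the new block is
 * stored back through `bufp`; on failure the old block is left untouched
 * and still owned by the caller, so there is exactly one pointer to free. */
static int grow_buffer(char **bufp, size_t *capp, size_t extra)
{
    size_t newcap = *capp + extra;
    char *newbuf = realloc(*bufp, newcap);  /* old block survives if this returns NULL */
    if (newbuf == NULL)
        return -1;
    *bufp = newbuf;
    *capp = newcap;
    return 0;
}

int main(void)
{
    size_t cap = 8;
    char *buf = malloc(cap);
    if (buf == NULL)
        return 1;
    strcpy(buf, "short");
    if (grow_buffer(&buf, &cap, 64) < 0) {
        free(buf);          /* single cleanup point on the failure path */
        return 1;
    }
    strcat(buf, " line that needed a larger buffer");
    puts(buf);
    free(buf);              /* single cleanup point on the success path */
    return 0;
}
```

The full tokenizer.c listing follows; the analyzer's marker appears after source line 955.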
1 /* Tokenizer implementation */
2
3 #include "Python.h"
4 #include "pgenheaders.h"
5
6 #include <ctype.h>
7 #include <assert.h>
8
9 #include "tokenizer.h"
10 #include "errcode.h"
11
12 #ifndef PGEN
13 #include "unicodeobject.h"
14 #include "stringobject.h"
15 #include "fileobject.h"
16 #include "codecs.h"
17 #include "abstract.h"
18 #include "pydebug.h"
19 #endif /* PGEN */
20
21 extern char *PyOS_Readline(FILE *, FILE *, char *);
22 /* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
25
26 /* Don't ever change this -- it would break the portability of Python code */
27 #define TABSIZE 8
28
29 /* Forward */
30 static struct tok_state *tok_new(void);
31 static int tok_nextc(struct tok_state *tok);
32 static void tok_backup(struct tok_state *tok, int c);
33
34 /* Token names */
35
36 char *_PyParser_TokenNames[] = {
37 "ENDMARKER",
38 "NAME",
39 "NUMBER",
40 "STRING",
41 "NEWLINE",
42 "INDENT",
43 "DEDENT",
44 "LPAR",
45 "RPAR",
46 "LSQB",
47 "RSQB",
48 "COLON",
49 "COMMA",
50 "SEMI",
51 "PLUS",
52 "MINUS",
53 "STAR",
54 "SLASH",
55 "VBAR",
56 "AMPER",
57 "LESS",
58 "GREATER",
59 "EQUAL",
60 "DOT",
61 "PERCENT",
62 "BACKQUOTE",
63 "LBRACE",
64 "RBRACE",
65 "EQEQUAL",
66 "NOTEQUAL",
67 "LESSEQUAL",
68 "GREATEREQUAL",
69 "TILDE",
70 "CIRCUMFLEX",
71 "LEFTSHIFT",
72 "RIGHTSHIFT",
73 "DOUBLESTAR",
74 "PLUSEQUAL",
75 "MINEQUAL",
76 "STAREQUAL",
77 "SLASHEQUAL",
78 "PERCENTEQUAL",
79 "AMPEREQUAL",
80 "VBAREQUAL",
81 "CIRCUMFLEXEQUAL",
82 "LEFTSHIFTEQUAL",
83 "RIGHTSHIFTEQUAL",
84 "DOUBLESTAREQUAL",
85 "DOUBLESLASH",
86 "DOUBLESLASHEQUAL",
87 "AT",
88 /* This table must match the #defines in token.h! */
89 "OP",
90 "<ERRORTOKEN>",
91 "<N_TOKENS>"
92 };
93
94 /* Create and initialize a new tok_state structure */
95
96 static struct tok_state *
97 tok_new(void)
98 {
99 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
100 sizeof(struct tok_state));
101 if (tok == NULL)
102 return NULL;
103 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
104 tok->done = E_OK;
105 tok->fp = NULL;
106 tok->input = NULL;
107 tok->tabsize = TABSIZE;
108 tok->indent = 0;
109 tok->indstack[0] = 0;
110 tok->atbol = 1;
111 tok->pendin = 0;
112 tok->prompt = tok->nextprompt = NULL;
113 tok->lineno = 0;
114 tok->level = 0;
115 tok->filename = NULL;
116 tok->altwarning = 0;
117 tok->alterror = 0;
118 tok->alttabsize = 1;
119 tok->altindstack[0] = 0;
120 tok->decoding_state = 0;
121 tok->decoding_erred = 0;
122 tok->read_coding_spec = 0;
123 tok->encoding = NULL;
124 tok->cont_line = 0;
125 #ifndef PGEN
126 tok->decoding_readline = NULL;
127 tok->decoding_buffer = NULL;
128 #endif
129 return tok;
130 }
131
132 static char *
133 new_string(const char *s, Py_ssize_t len)
134 {
135 char* result = (char *)PyMem_MALLOC(len + 1);
136 if (result != NULL) {
137 memcpy(result, s, len);
138 result[len] = '\0';
139 }
140 return result;
141 }
142
143 #ifdef PGEN
144
145 static char *
146 decoding_fgets(char *s, int size, struct tok_state *tok)
147 {
148 return fgets(s, size, tok->fp);
149 }
150
151 static int
152 decoding_feof(struct tok_state *tok)
153 {
154 return feof(tok->fp);
155 }
156
157 static char *
158 decode_str(const char *str, int exec_input, struct tok_state *tok)
159 {
160 return new_string(str, strlen(str));
161 }
162
163 #else /* PGEN */
164
165 static char *
166 error_ret(struct tok_state *tok) /* XXX */
167 {
168 tok->decoding_erred = 1;
169 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
170 PyMem_FREE(tok->buf);
171 tok->buf = NULL;
172 return NULL; /* as if it were EOF */
173 }
174
175
176 static char *
177 get_normal_name(char *s) /* for utf-8 and latin-1 */
178 {
179 char buf[13];
180 int i;
181 for (i = 0; i < 12; i++) {
182 int c = s[i];
183 if (c == '\0')
184 break;
185 else if (c == '_')
186 buf[i] = '-';
187 else
188 buf[i] = tolower(c);
189 }
190 buf[i] = '\0';
191 if (strcmp(buf, "utf-8") == 0 ||
192 strncmp(buf, "utf-8-", 6) == 0)
193 return "utf-8";
194 else if (strcmp(buf, "latin-1") == 0 ||
195 strcmp(buf, "iso-8859-1") == 0 ||
196 strcmp(buf, "iso-latin-1") == 0 ||
197 strncmp(buf, "latin-1-", 8) == 0 ||
198 strncmp(buf, "iso-8859-1-", 11) == 0 ||
199 strncmp(buf, "iso-latin-1-", 12) == 0)
200 return "iso-8859-1";
201 else
202 return s;
203 }
204
205 /* Return the coding spec in S, or NULL if none is found. */
206
207 static char *
208 get_coding_spec(const char *s, Py_ssize_t size)
209 {
210 Py_ssize_t i;
211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
218 }
219 for (; i < size - 6; i++) { /* XXX inefficient search */
220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
229
230 begin = t;
231 while (Py_ISALNUM(t[0]) ||
232 t[0] == '-' || t[0] == '_' || t[0] == '.')
233 t++;
234
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
239 PyMem_FREE(r);
240 r = new_string(q, strlen(q));
241 }
242 return r;
243 }
244 }
245 }
246 return NULL;
247 }
248
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
253
254 static int
255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
257 {
258 char * cs;
259 int r = 1;
260
261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
264 cs = get_coding_spec(line, size);
265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
273 #ifdef Py_USING_UNICODE
274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
278 }
279 else
280 PyMem_FREE(cs);
281 #else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
286 PyMem_FREE(cs);
287 #endif
288 }
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
291 PyMem_FREE(cs);
292 }
293 }
294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299 }
300 return r;
301 }
302
303 /* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
306
307 static int
308 check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
312 {
313 int ch1, ch2, ch3;
314 ch1 = get_char(tok);
315 tok->decoding_state = 1;
316 if (ch1 == EOF) {
317 return 1;
318 } else if (ch1 == 0xEF) {
319 ch2 = get_char(tok);
320 if (ch2 != 0xBB) {
321 unget_char(ch2, tok);
322 unget_char(ch1, tok);
323 return 1;
324 }
325 ch3 = get_char(tok);
326 if (ch3 != 0xBF) {
327 unget_char(ch3, tok);
328 unget_char(ch2, tok);
329 unget_char(ch1, tok);
330 return 1;
331 }
332 #if 0
333 /* Disable support for UTF-16 BOMs until a decision
334 is made whether this needs to be supported. */
335 } else if (ch1 == 0xFE) {
336 ch2 = get_char(tok);
337 if (ch2 != 0xFF) {
338 unget_char(ch2, tok);
339 unget_char(ch1, tok);
340 return 1;
341 }
342 if (!set_readline(tok, "utf-16-be"))
343 return 0;
344 tok->decoding_state = -1;
345 } else if (ch1 == 0xFF) {
346 ch2 = get_char(tok);
347 if (ch2 != 0xFE) {
348 unget_char(ch2, tok);
349 unget_char(ch1, tok);
350 return 1;
351 }
352 if (!set_readline(tok, "utf-16-le"))
353 return 0;
354 tok->decoding_state = -1;
355 #endif
356 } else {
357 unget_char(ch1, tok);
358 return 1;
359 }
360 if (tok->encoding != NULL)
361 PyMem_FREE(tok->encoding);
362 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
363 return 1;
364 }
365
366 /* Read a line of text from TOK into S, using the stream in TOK.
367 Return NULL on failure, else S.
368
369 On entry, tok->decoding_buffer will be one of:
370 1) NULL: need to call tok->decoding_readline to get a new line
371 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
372 stored the result in tok->decoding_buffer
373 3) PyStringObject *: previous call to fp_readl did not have enough room
374 (in the s buffer) to copy entire contents of the line read
375 by tok->decoding_readline. tok->decoding_buffer has the overflow.
376 In this case, fp_readl is called in a loop (with an expanded buffer)
377 until the buffer ends with a '\n' (or until the end of the file is
378 reached): see tok_nextc and its calls to decoding_fgets.
379 */
380
381 static char *
382 fp_readl(char *s, int size, struct tok_state *tok)
383 {
384 #ifndef Py_USING_UNICODE
385 /* In a non-Unicode build, this should never be called. */
386 Py_FatalError("fp_readl should not be called in this build.");
387 return NULL; /* Keep compiler happy (not reachable) */
388 #else
389 PyObject* utf8 = NULL;
390 PyObject* buf = tok->decoding_buffer;
391 char *str;
392 Py_ssize_t utf8len;
393
394 /* Ask for one less byte so we can terminate it */
395 assert(size > 0);
396 size--;
397
398 if (buf == NULL) {
399 buf = PyObject_CallObject(tok->decoding_readline, NULL);
400 if (buf == NULL)
401 return error_ret(tok);
402 } else {
403 tok->decoding_buffer = NULL;
404 if (PyString_CheckExact(buf))
405 utf8 = buf;
406 }
407 if (utf8 == NULL) {
408 utf8 = PyUnicode_AsUTF8String(buf);
409 Py_DECREF(buf);
410 if (utf8 == NULL)
411 return error_ret(tok);
412 }
413 str = PyString_AsString(utf8);
414 utf8len = PyString_GET_SIZE(utf8);
415 if (utf8len > size) {
416 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
417 if (tok->decoding_buffer == NULL) {
418 Py_DECREF(utf8);
419 return error_ret(tok);
420 }
421 utf8len = size;
422 }
423 memcpy(s, str, utf8len);
424 s[utf8len] = '\0';
425 Py_DECREF(utf8);
426 if (utf8len == 0)
427 return NULL; /* EOF */
428 return s;
429 #endif
430 }
431
432 /* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
434
435 This function is called from check_bom and check_coding_spec.
436
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
439
440 Return 1 on success, 0 on failure. */
441
442 static int
443 fp_setreadl(struct tok_state *tok, const char* enc)
444 {
445 PyObject *reader, *stream, *readline;
446
447 /* XXX: constify filename argument. */
448 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
449 if (stream == NULL)
450 return 0;
451
452 reader = PyCodec_StreamReader(enc, stream, NULL);
453 Py_DECREF(stream);
454 if (reader == NULL)
455 return 0;
456
457 readline = PyObject_GetAttrString(reader, "readline");
458 Py_DECREF(reader);
459 if (readline == NULL)
460 return 0;
461
462 tok->decoding_readline = readline;
463 return 1;
464 }
465
466 /* Fetch the next byte from TOK. */
467
468 static int fp_getc(struct tok_state *tok) {
469 return getc(tok->fp);
470 }
471
472 /* Unfetch the last byte back into TOK. */
473
474 static void fp_ungetc(int c, struct tok_state *tok) {
475 ungetc(c, tok->fp);
476 }
477
478 /* Read a line of input from TOK. Determine encoding
479 if necessary. */
480
481 static char *
482 decoding_fgets(char *s, int size, struct tok_state *tok)
483 {
484 char *line = NULL;
485 int badchar = 0;
486 for (;;) {
487 if (tok->decoding_state < 0) {
488 /* We already have a codec associated with
489 this input. */
490 line = fp_readl(s, size, tok);
491 break;
492 } else if (tok->decoding_state > 0) {
493 /* We want a 'raw' read. */
494 line = Py_UniversalNewlineFgets(s, size,
495 tok->fp, NULL);
496 break;
497 } else {
498 /* We have not yet determined the encoding.
499 If an encoding is found, use the file-pointer
500 reader functions from now on. */
501 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
502 return error_ret(tok);
503 assert(tok->decoding_state != 0);
504 }
505 }
506 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
507 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
508 return error_ret(tok);
509 }
510 }
511 #ifndef PGEN
512 /* The default encoding is ASCII, so make sure we don't have any
513 non-ASCII bytes in it. */
514 if (line && !tok->encoding) {
515 unsigned char *c;
516 for (c = (unsigned char *)line; *c; c++)
517 if (*c > 127) {
518 badchar = *c;
519 break;
520 }
521 }
522 if (badchar) {
523 char buf[500];
524 /* Need to add 1 to the line number, since this line
525 has not been counted, yet. */
526 sprintf(buf,
527 "Non-ASCII character '\\x%.2x' "
528 "in file %.200s on line %i, "
529 "but no encoding declared; "
530 "see http://www.python.org/peps/pep-0263.html for details",
531 badchar, tok->filename, tok->lineno + 1);
532 PyErr_SetString(PyExc_SyntaxError, buf);
533 return error_ret(tok);
534 }
535 #endif
536 return line;
537 }
538
539 static int
540 decoding_feof(struct tok_state *tok)
541 {
542 if (tok->decoding_state >= 0) {
543 return feof(tok->fp);
544 } else {
545 PyObject* buf = tok->decoding_buffer;
546 if (buf == NULL) {
547 buf = PyObject_CallObject(tok->decoding_readline, NULL);
548 if (buf == NULL) {
549 error_ret(tok);
550 return 1;
551 } else {
552 tok->decoding_buffer = buf;
553 }
554 }
555 return PyObject_Length(buf) == 0;
556 }
557 }
558
559 /* Fetch a byte from TOK, using the string buffer. */
560
561 static int
562 buf_getc(struct tok_state *tok) {
563 return Py_CHARMASK(*tok->str++);
564 }
565
566 /* Unfetch a byte from TOK, using the string buffer. */
567
568 static void
569 buf_ungetc(int c, struct tok_state *tok) {
570 tok->str--;
571 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
572 }
573
574 /* Set the readline function for TOK to ENC. For the string-based
575 tokenizer, this means to just record the encoding. */
576
577 static int
578 buf_setreadl(struct tok_state *tok, const char* enc) {
579 tok->enc = enc;
580 return 1;
581 }
582
583 /* Return a UTF-8 encoding Python string object from the
584 C byte string STR, which is encoded with ENC. */
585
586 #ifdef Py_USING_UNICODE
587 static PyObject *
588 translate_into_utf8(const char* str, const char* enc) {
589 PyObject *utf8;
590 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
591 if (buf == NULL)
592 return NULL;
593 utf8 = PyUnicode_AsUTF8String(buf);
594 Py_DECREF(buf);
595 return utf8;
596 }
597 #endif
598
599
600 static char *
601 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
602 int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
603 char *buf, *current;
604 char c = '\0';
605 buf = PyMem_MALLOC(needed_length);
606 if (buf == NULL) {
607 tok->done = E_NOMEM;
608 return NULL;
609 }
610 for (current = buf; *s; s++, current++) {
611 c = *s;
612 if (skip_next_lf) {
613 skip_next_lf = 0;
614 if (c == '\n') {
615 c = *++s;
616 if (!c)
617 break;
618 }
619 }
620 if (c == '\r') {
621 skip_next_lf = 1;
622 c = '\n';
623 }
624 *current = c;
625 }
626 /* If this is exec input, add a newline to the end of the string if
627 there isn't one already. */
628 if (exec_input && c != '\n') {
629 *current = '\n';
630 current++;
631 }
632 *current = '\0';
633 final_length = current - buf + 1;
634 if (final_length < needed_length && final_length)
635 /* should never fail */
636 buf = PyMem_REALLOC(buf, final_length);
637 return buf;
638 }
639
640 /* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
644 static const char *
645 decode_str(const char *input, int single, struct tok_state *tok)
646 {
647 PyObject* utf8 = NULL;
648 const char *str;
649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 #ifdef Py_USING_UNICODE
662 if (tok->enc != NULL) {
663 utf8 = translate_into_utf8(str, tok->enc);
664 if (utf8 == NULL)
665 return error_ret(tok);
666 str = PyString_AsString(utf8);
667 }
668 #endif
669 for (s = str;; s++) {
670 if (*s == '\0') break;
671 else if (*s == '\n') {
672 assert(lineno < 2);
673 newl[lineno] = s;
674 lineno++;
675 if (lineno == 2) break;
676 }
677 }
678 tok->enc = NULL;
679 /* need to check line 1 and 2 separately since check_coding_spec
680 assumes a single line as input */
681 if (newl[0]) {
682 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
683 return error_ret(tok);
684 if (tok->enc == NULL && newl[1]) {
685 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
686 tok, buf_setreadl))
687 return error_ret(tok);
688 }
689 }
690 #ifdef Py_USING_UNICODE
691 if (tok->enc != NULL) {
692 assert(utf8 == NULL);
693 utf8 = translate_into_utf8(str, tok->enc);
694 if (utf8 == NULL)
695 return error_ret(tok);
696 str = PyString_AsString(utf8);
697 }
698 #endif
699 assert(tok->decoding_buffer == NULL);
700 tok->decoding_buffer = utf8; /* CAUTION */
701 return str;
702 }
703
704 #endif /* PGEN */
705
706 /* Set up tokenizer for string */
707
708 struct tok_state *
709 PyTokenizer_FromString(const char *str, int exec_input)
710 {
711 struct tok_state *tok = tok_new();
712 if (tok == NULL)
713 return NULL;
714 str = (char *)decode_str(str, exec_input, tok);
715 if (str == NULL) {
716 PyTokenizer_Free(tok);
717 return NULL;
718 }
719
720 /* XXX: constify members. */
721 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
722 return tok;
723 }
724
725
726 /* Set up tokenizer for file */
727
728 struct tok_state *
729 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
730 {
731 struct tok_state *tok = tok_new();
732 if (tok == NULL)
733 return NULL;
734 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
735 PyTokenizer_Free(tok);
736 return NULL;
737 }
738 tok->cur = tok->inp = tok->buf;
739 tok->end = tok->buf + BUFSIZ;
740 tok->fp = fp;
741 tok->prompt = ps1;
742 tok->nextprompt = ps2;
743 return tok;
744 }
745
746
747 /* Free a tok_state structure */
748
749 void
750 PyTokenizer_Free(struct tok_state *tok)
751 {
752 if (tok->encoding != NULL)
753 PyMem_FREE(tok->encoding);
754 #ifndef PGEN
755 Py_XDECREF(tok->decoding_readline);
756 Py_XDECREF(tok->decoding_buffer);
757 #endif
758 if (tok->fp != NULL && tok->buf != NULL)
759 PyMem_FREE(tok->buf);
760 if (tok->input)
761 PyMem_FREE((char *)tok->input);
762 PyMem_FREE(tok);
763 }
764
765 #if !defined(PGEN) && defined(Py_USING_UNICODE)
766 static int
767 tok_stdin_decode(struct tok_state *tok, char **inp)
768 {
769 PyObject *enc, *sysstdin, *decoded, *utf8;
770 const char *encoding;
771 char *converted;
772
773 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
774 return 0;
775 sysstdin = PySys_GetObject("stdin");
776 if (sysstdin == NULL || !PyFile_Check(sysstdin))
777 return 0;
778
779 enc = ((PyFileObject *)sysstdin)->f_encoding;
780 if (enc == NULL || !PyString_Check(enc))
781 return 0;
782 Py_INCREF(enc);
783
784 encoding = PyString_AsString(enc);
785 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
786 if (decoded == NULL)
787 goto error_clear;
788
789 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
790 Py_DECREF(decoded);
791 if (utf8 == NULL)
792 goto error_clear;
793
794 assert(PyString_Check(utf8));
795 converted = new_string(PyString_AS_STRING(utf8),
796 PyString_GET_SIZE(utf8));
797 Py_DECREF(utf8);
798 if (converted == NULL)
799 goto error_nomem;
800
801 PyMem_FREE(*inp);
802 *inp = converted;
803 if (tok->encoding != NULL)
804 PyMem_FREE(tok->encoding);
805 tok->encoding = new_string(encoding, strlen(encoding));
806 if (tok->encoding == NULL)
807 goto error_nomem;
808
809 Py_DECREF(enc);
810 return 0;
811
812 error_nomem:
813 Py_DECREF(enc);
814 tok->done = E_NOMEM;
815 return -1;
816
817 error_clear:
818 Py_DECREF(enc);
819 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
820 tok->done = E_ERROR;
821 return -1;
822 }
823 /* Fallback to iso-8859-1: for backward compatibility */
824 PyErr_Clear();
825 return 0;
826 }
827 #endif
828
829 /* Get next char, updating state; error code goes into tok->done */
830
831 static int
832 tok_nextc(register struct tok_state *tok)
833 {
834 for (;;) {
835 if (tok->cur != tok->inp) {
836 return Py_CHARMASK(*tok->cur++); /* Fast path */
837 }
838 if (tok->done != E_OK)
839 return EOF;
840 if (tok->fp == NULL) {
841 char *end = strchr(tok->inp, '\n');
842 if (end != NULL)
843 end++;
844 else {
845 end = strchr(tok->inp, '\0');
846 if (end == tok->inp) {
847 tok->done = E_EOF;
848 return EOF;
849 }
850 }
851 if (tok->start == NULL)
852 tok->buf = tok->cur;
853 tok->line_start = tok->cur;
854 tok->lineno++;
855 tok->inp = end;
856 return Py_CHARMASK(*tok->cur++);
857 }
858 if (tok->prompt != NULL) {
859 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
860 if (tok->nextprompt != NULL)
861 tok->prompt = tok->nextprompt;
862 if (newtok == NULL)
863 tok->done = E_INTR;
864 else if (*newtok == '\0') {
865 PyMem_FREE(newtok);
866 tok->done = E_EOF;
867 }
868 #if !defined(PGEN) && defined(Py_USING_UNICODE)
869 else if (tok_stdin_decode(tok, &newtok) != 0)
870 PyMem_FREE(newtok);
871 #endif
872 else if (tok->start != NULL) {
873 size_t start = tok->start - tok->buf;
874 size_t oldlen = tok->cur - tok->buf;
875 size_t newlen = oldlen + strlen(newtok);
876 char *buf = tok->buf;
877 buf = (char *)PyMem_REALLOC(buf, newlen+1);
878 tok->lineno++;
879 if (buf == NULL) {
880 PyMem_FREE(tok->buf);
881 tok->buf = NULL;
882 PyMem_FREE(newtok);
883 tok->done = E_NOMEM;
884 return EOF;
885 }
886 tok->buf = buf;
887 tok->cur = tok->buf + oldlen;
888 tok->line_start = tok->cur;
889 strcpy(tok->buf + oldlen, newtok);
890 PyMem_FREE(newtok);
891 tok->inp = tok->buf + newlen;
892 tok->end = tok->inp + 1;
893 tok->start = tok->buf + start;
894 }
895 else {
896 tok->lineno++;
897 if (tok->buf != NULL)
898 PyMem_FREE(tok->buf);
899 tok->buf = newtok;
900 tok->line_start = tok->buf;
901 tok->cur = tok->buf;
902 tok->line_start = tok->buf;
903 tok->inp = strchr(tok->buf, '\0');
904 tok->end = tok->inp + 1;
905 }
906 }
907 else {
908 int done = 0;
909 Py_ssize_t cur = 0;
910 char *pt;
911 if (tok->start == NULL) {
912 if (tok->buf == NULL) {
913 tok->buf = (char *)
914 PyMem_MALLOC(BUFSIZ);
915 if (tok->buf == NULL) {
916 tok->done = E_NOMEM;
917 return EOF;
918 }
919 tok->end = tok->buf + BUFSIZ;
920 }
921 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
922 tok) == NULL) {
923 tok->done = E_EOF;
924 done = 1;
925 }
926 else {
927 tok->done = E_OK;
928 tok->inp = strchr(tok->buf, '\0');
929 done = tok->inp[-1] == '\n';
930 }
931 }
932 else {
933 cur = tok->cur - tok->buf;
934 if (decoding_feof(tok)) {
935 tok->done = E_EOF;
936 done = 1;
937 }
938 else
939 tok->done = E_OK;
940 }
941 tok->lineno++;
942 /* Read until '\n' or EOF */
943 while (!done) {
944 Py_ssize_t curstart = tok->start == NULL ? -1 :
945 tok->start - tok->buf;
946 Py_ssize_t curvalid = tok->inp - tok->buf;
947 Py_ssize_t newsize = curvalid + BUFSIZ;
948 char *newbuf = tok->buf;
949 newbuf = (char *)PyMem_REALLOC(newbuf,
950 newsize);
951 if (newbuf == NULL) {
952 tok->done = E_NOMEM;
953 tok->cur = tok->inp;
954 return EOF;
955 }
(emitted by clang-analyzer) TODO: a detailed trace is available in the data model (not yet rendered in this report)
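Once execution passes line 955, newbuf owns the block returned by PyMem_REALLOC; it is stored into tok->buf at line 956, and the later early returns (for example lines 967-968) rely on error_ret() inside decoding_fgets(), or on PyTokenizer_Free(), to release that buffer. The specific path behind the warning is only in the unrendered trace.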
956 tok->buf = newbuf;
957 tok->inp = tok->buf + curvalid;
958 tok->end = tok->buf + newsize;
959 tok->start = curstart < 0 ? NULL :
960 tok->buf + curstart;
961 if (decoding_fgets(tok->inp,
962 (int)(tok->end - tok->inp),
963 tok) == NULL) {
964 /* Break out early on decoding
965 errors, as tok->buf will be NULL
966 */
967 if (tok->decoding_erred)
968 return EOF;
969 /* Last line does not end in \n,
970 fake one */
971 strcpy(tok->inp, "\n");
972 }
973 tok->inp = strchr(tok->inp, '\0');
974 done = tok->inp[-1] == '\n';
975 }
976 if (tok->buf != NULL) {
977 tok->cur = tok->buf + cur;
978 tok->line_start = tok->cur;
979 /* replace "\r\n" with "\n" */
980 /* For Mac leave the \r, giving a syntax error */
981 pt = tok->inp - 2;
982 if (pt >= tok->buf && *pt == '\r') {
983 *pt++ = '\n';
984 *pt = '\0';
985 tok->inp = pt;
986 }
987 }
988 }
989 if (tok->done != E_OK) {
990 if (tok->prompt != NULL)
991 PySys_WriteStderr("\n");
992 tok->cur = tok->inp;
993 return EOF;
994 }
995 }
996 /*NOTREACHED*/
997 }
998
999
1000 /* Back-up one character */
1001
1002 static void
1003 tok_backup(register struct tok_state *tok, register int c)
1004 {
1005 if (c != EOF) {
1006 if (--tok->cur < tok->buf)
1007 Py_FatalError("tok_backup: beginning of buffer");
1008 if (*tok->cur != c)
1009 *tok->cur = c;
1010 }
1011 }
1012
1013
1014 /* Return the token corresponding to a single character */
1015
1016 int
1017 PyToken_OneChar(int c)
1018 {
1019 switch (c) {
1020 case '(': return LPAR;
1021 case ')': return RPAR;
1022 case '[': return LSQB;
1023 case ']': return RSQB;
1024 case ':': return COLON;
1025 case ',': return COMMA;
1026 case ';': return SEMI;
1027 case '+': return PLUS;
1028 case '-': return MINUS;
1029 case '*': return STAR;
1030 case '/': return SLASH;
1031 case '|': return VBAR;
1032 case '&': return AMPER;
1033 case '<': return LESS;
1034 case '>': return GREATER;
1035 case '=': return EQUAL;
1036 case '.': return DOT;
1037 case '%': return PERCENT;
1038 case '`': return BACKQUOTE;
1039 case '{': return LBRACE;
1040 case '}': return RBRACE;
1041 case '^': return CIRCUMFLEX;
1042 case '~': return TILDE;
1043 case '@': return AT;
1044 default: return OP;
1045 }
1046 }
1047
1048
1049 int
1050 PyToken_TwoChars(int c1, int c2)
1051 {
1052 switch (c1) {
1053 case '=':
1054 switch (c2) {
1055 case '=': return EQEQUAL;
1056 }
1057 break;
1058 case '!':
1059 switch (c2) {
1060 case '=': return NOTEQUAL;
1061 }
1062 break;
1063 case '<':
1064 switch (c2) {
1065 case '>': return NOTEQUAL;
1066 case '=': return LESSEQUAL;
1067 case '<': return LEFTSHIFT;
1068 }
1069 break;
1070 case '>':
1071 switch (c2) {
1072 case '=': return GREATEREQUAL;
1073 case '>': return RIGHTSHIFT;
1074 }
1075 break;
1076 case '+':
1077 switch (c2) {
1078 case '=': return PLUSEQUAL;
1079 }
1080 break;
1081 case '-':
1082 switch (c2) {
1083 case '=': return MINEQUAL;
1084 }
1085 break;
1086 case '*':
1087 switch (c2) {
1088 case '*': return DOUBLESTAR;
1089 case '=': return STAREQUAL;
1090 }
1091 break;
1092 case '/':
1093 switch (c2) {
1094 case '/': return DOUBLESLASH;
1095 case '=': return SLASHEQUAL;
1096 }
1097 break;
1098 case '|':
1099 switch (c2) {
1100 case '=': return VBAREQUAL;
1101 }
1102 break;
1103 case '%':
1104 switch (c2) {
1105 case '=': return PERCENTEQUAL;
1106 }
1107 break;
1108 case '&':
1109 switch (c2) {
1110 case '=': return AMPEREQUAL;
1111 }
1112 break;
1113 case '^':
1114 switch (c2) {
1115 case '=': return CIRCUMFLEXEQUAL;
1116 }
1117 break;
1118 }
1119 return OP;
1120 }
1121
1122 int
1123 PyToken_ThreeChars(int c1, int c2, int c3)
1124 {
1125 switch (c1) {
1126 case '<':
1127 switch (c2) {
1128 case '<':
1129 switch (c3) {
1130 case '=':
1131 return LEFTSHIFTEQUAL;
1132 }
1133 break;
1134 }
1135 break;
1136 case '>':
1137 switch (c2) {
1138 case '>':
1139 switch (c3) {
1140 case '=':
1141 return RIGHTSHIFTEQUAL;
1142 }
1143 break;
1144 }
1145 break;
1146 case '*':
1147 switch (c2) {
1148 case '*':
1149 switch (c3) {
1150 case '=':
1151 return DOUBLESTAREQUAL;
1152 }
1153 break;
1154 }
1155 break;
1156 case '/':
1157 switch (c2) {
1158 case '/':
1159 switch (c3) {
1160 case '=':
1161 return DOUBLESLASHEQUAL;
1162 }
1163 break;
1164 }
1165 break;
1166 }
1167 return OP;
1168 }
1169
1170 static int
1171 indenterror(struct tok_state *tok)
1172 {
1173 if (tok->alterror) {
1174 tok->done = E_TABSPACE;
1175 tok->cur = tok->inp;
1176 return 1;
1177 }
1178 if (tok->altwarning) {
1179 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1180 "in indentation\n", tok->filename);
1181 tok->altwarning = 0;
1182 }
1183 return 0;
1184 }
1185
1186 /* Get next token, after space stripping etc. */
1187
1188 static int
1189 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1190 {
1191 register int c;
1192 int blankline;
1193
1194 *p_start = *p_end = NULL;
1195 nextline:
1196 tok->start = NULL;
1197 blankline = 0;
1198
1199 /* Get indentation level */
1200 if (tok->atbol) {
1201 register int col = 0;
1202 register int altcol = 0;
1203 tok->atbol = 0;
1204 for (;;) {
1205 c = tok_nextc(tok);
1206 if (c == ' ')
1207 col++, altcol++;
1208 else if (c == '\t') {
1209 col = (col/tok->tabsize + 1) * tok->tabsize;
1210 altcol = (altcol/tok->alttabsize + 1)
1211 * tok->alttabsize;
1212 }
1213 else if (c == '\014') /* Control-L (formfeed) */
1214 col = altcol = 0; /* For Emacs users */
1215 else
1216 break;
1217 }
1218 tok_backup(tok, c);
1219 if (c == '#' || c == '\n') {
1220 /* Lines with only whitespace and/or comments
1221 shouldn't affect the indentation and are
1222 not passed to the parser as NEWLINE tokens,
1223 except *totally* empty lines in interactive
1224 mode, which signal the end of a command group. */
1225 if (col == 0 && c == '\n' && tok->prompt != NULL)
1226 blankline = 0; /* Let it through */
1227 else
1228 blankline = 1; /* Ignore completely */
1229 /* We can't jump back right here since we still
1230 may need to skip to the end of a comment */
1231 }
1232 if (!blankline && tok->level == 0) {
1233 if (col == tok->indstack[tok->indent]) {
1234 /* No change */
1235 if (altcol != tok->altindstack[tok->indent]) {
1236 if (indenterror(tok))
1237 return ERRORTOKEN;
1238 }
1239 }
1240 else if (col > tok->indstack[tok->indent]) {
1241 /* Indent -- always one */
1242 if (tok->indent+1 >= MAXINDENT) {
1243 tok->done = E_TOODEEP;
1244 tok->cur = tok->inp;
1245 return ERRORTOKEN;
1246 }
1247 if (altcol <= tok->altindstack[tok->indent]) {
1248 if (indenterror(tok))
1249 return ERRORTOKEN;
1250 }
1251 tok->pendin++;
1252 tok->indstack[++tok->indent] = col;
1253 tok->altindstack[tok->indent] = altcol;
1254 }
1255 else /* col < tok->indstack[tok->indent] */ {
1256 /* Dedent -- any number, must be consistent */
1257 while (tok->indent > 0 &&
1258 col < tok->indstack[tok->indent]) {
1259 tok->pendin--;
1260 tok->indent--;
1261 }
1262 if (col != tok->indstack[tok->indent]) {
1263 tok->done = E_DEDENT;
1264 tok->cur = tok->inp;
1265 return ERRORTOKEN;
1266 }
1267 if (altcol != tok->altindstack[tok->indent]) {
1268 if (indenterror(tok))
1269 return ERRORTOKEN;
1270 }
1271 }
1272 }
1273 }
1274
1275 tok->start = tok->cur;
1276
1277 /* Return pending indents/dedents */
1278 if (tok->pendin != 0) {
1279 if (tok->pendin < 0) {
1280 tok->pendin++;
1281 return DEDENT;
1282 }
1283 else {
1284 tok->pendin--;
1285 return INDENT;
1286 }
1287 }
1288
1289 again:
1290 tok->start = NULL;
1291 /* Skip spaces */
1292 do {
1293 c = tok_nextc(tok);
1294 } while (c == ' ' || c == '\t' || c == '\014');
1295
1296 /* Set start of current token */
1297 tok->start = tok->cur - 1;
1298
1299 /* Skip comment, while looking for tab-setting magic */
1300 if (c == '#') {
1301 static char *tabforms[] = {
1302 "tab-width:", /* Emacs */
1303 ":tabstop=", /* vim, full form */
1304 ":ts=", /* vim, abbreviated form */
1305 "set tabsize=", /* will vi never die? */
1306 /* more templates can be added here to support other editors */
1307 };
1308 char cbuf[80];
1309 char *tp, **cp;
1310 tp = cbuf;
1311 do {
1312 *tp++ = c = tok_nextc(tok);
1313 } while (c != EOF && c != '\n' &&
1314 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1315 *tp = '\0';
1316 for (cp = tabforms;
1317 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1318 cp++) {
1319 if ((tp = strstr(cbuf, *cp))) {
1320 int newsize = atoi(tp + strlen(*cp));
1321
1322 if (newsize >= 1 && newsize <= 40) {
1323 tok->tabsize = newsize;
1324 if (Py_VerboseFlag)
1325 PySys_WriteStderr(
1326 "Tab size set to %d\n",
1327 newsize);
1328 }
1329 }
1330 }
1331 while (c != EOF && c != '\n')
1332 c = tok_nextc(tok);
1333 }
1334
1335 /* Check for EOF and errors now */
1336 if (c == EOF) {
1337 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1338 }
1339
1340 /* Identifier (most frequent token!) */
1341 if (Py_ISALPHA(c) || c == '_') {
1342 /* Process r"", u"" and ur"" */
1343 switch (c) {
1344 case 'b':
1345 case 'B':
1346 c = tok_nextc(tok);
1347 if (c == 'r' || c == 'R')
1348 c = tok_nextc(tok);
1349 if (c == '"' || c == '\'')
1350 goto letter_quote;
1351 break;
1352 case 'r':
1353 case 'R':
1354 c = tok_nextc(tok);
1355 if (c == '"' || c == '\'')
1356 goto letter_quote;
1357 break;
1358 case 'u':
1359 case 'U':
1360 c = tok_nextc(tok);
1361 if (c == 'r' || c == 'R')
1362 c = tok_nextc(tok);
1363 if (c == '"' || c == '\'')
1364 goto letter_quote;
1365 break;
1366 }
1367 while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
1368 c = tok_nextc(tok);
1369 }
1370 tok_backup(tok, c);
1371 *p_start = tok->start;
1372 *p_end = tok->cur;
1373 return NAME;
1374 }
1375
1376 /* Newline */
1377 if (c == '\n') {
1378 tok->atbol = 1;
1379 if (blankline || tok->level > 0)
1380 goto nextline;
1381 *p_start = tok->start;
1382 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1383 tok->cont_line = 0;
1384 return NEWLINE;
1385 }
1386
1387 /* Period or number starting with period? */
1388 if (c == '.') {
1389 c = tok_nextc(tok);
1390 if (isdigit(c)) {
1391 goto fraction;
1392 }
1393 else {
1394 tok_backup(tok, c);
1395 *p_start = tok->start;
1396 *p_end = tok->cur;
1397 return DOT;
1398 }
1399 }
1400
1401 /* Number */
1402 if (isdigit(c)) {
1403 if (c == '0') {
1404 /* Hex, octal or binary -- maybe. */
1405 c = tok_nextc(tok);
1406 if (c == '.')
1407 goto fraction;
1408 #ifndef WITHOUT_COMPLEX
1409 if (c == 'j' || c == 'J')
1410 goto imaginary;
1411 #endif
1412 if (c == 'x' || c == 'X') {
1413
1414 /* Hex */
1415 c = tok_nextc(tok);
1416 if (!isxdigit(c)) {
1417 tok->done = E_TOKEN;
1418 tok_backup(tok, c);
1419 return ERRORTOKEN;
1420 }
1421 do {
1422 c = tok_nextc(tok);
1423 } while (isxdigit(c));
1424 }
1425 else if (c == 'o' || c == 'O') {
1426 /* Octal */
1427 c = tok_nextc(tok);
1428 if (c < '0' || c >= '8') {
1429 tok->done = E_TOKEN;
1430 tok_backup(tok, c);
1431 return ERRORTOKEN;
1432 }
1433 do {
1434 c = tok_nextc(tok);
1435 } while ('0' <= c && c < '8');
1436 }
1437 else if (c == 'b' || c == 'B') {
1438 /* Binary */
1439 c = tok_nextc(tok);
1440 if (c != '0' && c != '1') {
1441 tok->done = E_TOKEN;
1442 tok_backup(tok, c);
1443 return ERRORTOKEN;
1444 }
1445 do {
1446 c = tok_nextc(tok);
1447 } while (c == '0' || c == '1');
1448 }
1449 else {
1450 int found_decimal = 0;
1451 /* Octal; c is first char of it */
1452 /* There's no 'isoctdigit' macro, sigh */
1453 while ('0' <= c && c < '8') {
1454 c = tok_nextc(tok);
1455 }
1456 if (isdigit(c)) {
1457 found_decimal = 1;
1458 do {
1459 c = tok_nextc(tok);
1460 } while (isdigit(c));
1461 }
1462 if (c == '.')
1463 goto fraction;
1464 else if (c == 'e' || c == 'E')
1465 goto exponent;
1466 #ifndef WITHOUT_COMPLEX
1467 else if (c == 'j' || c == 'J')
1468 goto imaginary;
1469 #endif
1470 else if (found_decimal) {
1471 tok->done = E_TOKEN;
1472 tok_backup(tok, c);
1473 return ERRORTOKEN;
1474 }
1475 }
1476 if (c == 'l' || c == 'L')
1477 c = tok_nextc(tok);
1478 }
1479 else {
1480 /* Decimal */
1481 do {
1482 c = tok_nextc(tok);
1483 } while (isdigit(c));
1484 if (c == 'l' || c == 'L')
1485 c = tok_nextc(tok);
1486 else {
1487 /* Accept floating point numbers. */
1488 if (c == '.') {
1489 fraction:
1490 /* Fraction */
1491 do {
1492 c = tok_nextc(tok);
1493 } while (isdigit(c));
1494 }
1495 if (c == 'e' || c == 'E') {
1496 exponent:
1497 /* Exponent part */
1498 c = tok_nextc(tok);
1499 if (c == '+' || c == '-')
1500 c = tok_nextc(tok);
1501 if (!isdigit(c)) {
1502 tok->done = E_TOKEN;
1503 tok_backup(tok, c);
1504 return ERRORTOKEN;
1505 }
1506 do {
1507 c = tok_nextc(tok);
1508 } while (isdigit(c));
1509 }
1510 #ifndef WITHOUT_COMPLEX
1511 if (c == 'j' || c == 'J')
1512 /* Imaginary part */
1513 imaginary:
1514 c = tok_nextc(tok);
1515 #endif
1516 }
1517 }
1518 tok_backup(tok, c);
1519 *p_start = tok->start;
1520 *p_end = tok->cur;
1521 return NUMBER;
1522 }
1523
1524 letter_quote:
1525 /* String */
1526 if (c == '\'' || c == '"') {
1527 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1528 int quote = c;
1529 int triple = 0;
1530 int tripcount = 0;
1531 for (;;) {
1532 c = tok_nextc(tok);
1533 if (c == '\n') {
1534 if (!triple) {
1535 tok->done = E_EOLS;
1536 tok_backup(tok, c);
1537 return ERRORTOKEN;
1538 }
1539 tripcount = 0;
1540 tok->cont_line = 1; /* multiline string. */
1541 }
1542 else if (c == EOF) {
1543 if (triple)
1544 tok->done = E_EOFS;
1545 else
1546 tok->done = E_EOLS;
1547 tok->cur = tok->inp;
1548 return ERRORTOKEN;
1549 }
1550 else if (c == quote) {
1551 tripcount++;
1552 if (tok->cur - tok->start == quote2) {
1553 c = tok_nextc(tok);
1554 if (c == quote) {
1555 triple = 1;
1556 tripcount = 0;
1557 continue;
1558 }
1559 tok_backup(tok, c);
1560 }
1561 if (!triple || tripcount == 3)
1562 break;
1563 }
1564 else if (c == '\\') {
1565 tripcount = 0;
1566 c = tok_nextc(tok);
1567 if (c == EOF) {
1568 tok->done = E_EOLS;
1569 tok->cur = tok->inp;
1570 return ERRORTOKEN;
1571 }
1572 }
1573 else
1574 tripcount = 0;
1575 }
1576 *p_start = tok->start;
1577 *p_end = tok->cur;
1578 return STRING;
1579 }
1580
1581 /* Line continuation */
1582 if (c == '\\') {
1583 c = tok_nextc(tok);
1584 if (c != '\n') {
1585 tok->done = E_LINECONT;
1586 tok->cur = tok->inp;
1587 return ERRORTOKEN;
1588 }
1589 tok->cont_line = 1;
1590 goto again; /* Read next line */
1591 }
1592
1593 /* Check for two-character token */
1594 {
1595 int c2 = tok_nextc(tok);
1596 int token = PyToken_TwoChars(c, c2);
1597 #ifndef PGEN
1598 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1599 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1600 "<> not supported in 3.x; use !=",
1601 tok->filename, tok->lineno,
1602 NULL, NULL)) {
1603 return ERRORTOKEN;
1604 }
1605 }
1606 #endif
1607 if (token != OP) {
1608 int c3 = tok_nextc(tok);
1609 int token3 = PyToken_ThreeChars(c, c2, c3);
1610 if (token3 != OP) {
1611 token = token3;
1612 } else {
1613 tok_backup(tok, c3);
1614 }
1615 *p_start = tok->start;
1616 *p_end = tok->cur;
1617 return token;
1618 }
1619 tok_backup(tok, c2);
1620 }
1621
1622 /* Keep track of parentheses nesting level */
1623 switch (c) {
1624 case '(':
1625 case '[':
1626 case '{':
1627 tok->level++;
1628 break;
1629 case ')':
1630 case ']':
1631 case '}':
1632 tok->level--;
1633 break;
1634 }
1635
1636 /* Punctuation character */
1637 *p_start = tok->start;
1638 *p_end = tok->cur;
1639 return PyToken_OneChar(c);
1640 }
1641
1642 int
1643 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1644 {
1645 int result = tok_get(tok, p_start, p_end);
1646 if (tok->decoding_erred) {
1647 result = ERRORTOKEN;
1648 tok->done = E_DECODE;
1649 }
1650 return result;
1651 }
1652
1653 /* This function is only called from parsetok. However, it cannot live
1654 there, as it must be empty for PGEN, and we can check for PGEN only
1655 in this file. */
1656
1657 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1658 char*
1659 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1660 {
1661 return NULL;
1662 }
1663 #else
1664 #ifdef Py_USING_UNICODE
1665 static PyObject *
1666 dec_utf8(const char *enc, const char *text, size_t len) {
1667 PyObject *ret = NULL;
1668 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1669 if (unicode_text) {
1670 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1671 Py_DECREF(unicode_text);
1672 }
1673 if (!ret) {
1674 PyErr_Clear();
1675 }
1676 return ret;
1677 }
1678 char *
1679 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1680 {
1681 char *text = NULL;
1682 if (tok->encoding) {
1683 /* convert source to original encoding */
1684 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1685 if (lineobj != NULL) {
1686 int linelen = PyString_Size(lineobj);
1687 const char *line = PyString_AsString(lineobj);
1688 text = PyObject_MALLOC(linelen + 1);
1689 if (text != NULL && line != NULL) {
1690 if (linelen)
1691 strncpy(text, line, linelen);
1692 text[linelen] = '\0';
1693 }
1694 Py_DECREF(lineobj);
1695
1696 /* adjust error offset */
1697 if (*offset > 1) {
1698 PyObject *offsetobj = dec_utf8(tok->encoding,
1699 tok->buf, *offset-1);
1700 if (offsetobj) {
1701 *offset = PyString_Size(offsetobj) + 1;
1702 Py_DECREF(offsetobj);
1703 }
1704 }
1705
1706 }
1707 }
1708 return text;
1709
1710 }
1711 #endif /* defined(Py_USING_UNICODE) */
1712 #endif
1713
1714
1715 #ifdef Py_DEBUG
1716
1717 void
1718 tok_dump(int type, char *start, char *end)
1719 {
1720 printf("%s", _PyParser_TokenNames[type]);
1721 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1722 printf("(%.*s)", (int)(end - start), start);
1723 }
1724
1725 #endif
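The buffer-growth loop flagged above is only reached through the file-based tokenizer (tok->fp != NULL), when decoding_fgets() cannot fit a whole source line into the current buffer. A minimal driver that exercises that code path might look like the sketch below. It is a sketch only: it assumes the CPython 2.7 internal headers (Python.h, token.h, tokenizer.h) and an initialized interpreter, and the function name dump_tokens is invented here.

```c
#include <stdio.h>
#include "Python.h"
#include "token.h"
#include "tokenizer.h"

/* Tokenize `path` and print one token per line.  Source lines longer than
 * BUFSIZ push tok_nextc() through the PyMem_REALLOC growth loop flagged
 * at tokenizer.c:955 above. */
static int dump_tokens(const char *path)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL)
        return -1;
    struct tok_state *tok = PyTokenizer_FromFile(fp, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return -1;
    }
    tok->filename = "<dump_tokens>";   /* normally set by parsetok; used in error messages */
    for (;;) {
        char *start, *end;
        int type = PyTokenizer_Get(tok, &start, &end);
        if (start != NULL && end != NULL)
            printf("%-12s %.*s\n", _PyParser_TokenNames[type],
                   (int)(end - start), start);
        else
            printf("%s\n", _PyParser_TokenNames[type]);
        if (type == ENDMARKER || type == ERRORTOKEN)
            break;
    }
    PyTokenizer_Free(tok);
    fclose(fp);
    return 0;
}

int main(int argc, char **argv)
{
    int rc;
    if (argc != 2) {
        fprintf(stderr, "usage: %s file.py\n", argv[0]);
        return 2;
    }
    Py_Initialize();                   /* codec machinery is needed for encoding declarations */
    rc = dump_tokens(argv[1]);
    Py_Finalize();
    return rc == 0 ? 0 : 1;
}
```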