@@ -213,6 +213,14 @@ def untokenize(self, iterable):
                     self.tokens.append(indent)
                     self.prev_col = len(indent)
                 startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
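The escaping above matters because the C tokenizer reports doubled braces in an f-string's literal text as single characters in FSTRING_MIDDLE, so untokenize() has to double them again (and widen the end column) to emit valid source. A minimal round-trip sketch, illustrative only and assuming an interpreter whose tokenizer emits FSTRING_MIDDLE:

    import io
    import tokenize

    src = "x = f'{{literal}} {value}'\n"
    toks = list(tokenize.generate_tokens(io.StringIO(src).readline))
    # With the re-escaping above, untokenize() should produce source equivalent
    # to `src`, with '{{' and '}}' restored in the literal part.
    print(tokenize.untokenize(toks))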
@@ -255,6 +263,11 @@ def compat(self, token, iterable):
             elif startline and indents:
                 toks_append(indents[-1])
                 startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
             toks_append(tokval)


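The compat() path, used when untokenize() is fed plain (type, string) pairs, needs the same brace re-escaping. A rough sketch, illustrative only:

    import io
    import tokenize

    src = "f'{x} {{y}}'\n"
    pairs = [(t.type, t.string)
             for t in tokenize.generate_tokens(io.StringIO(src).readline)]
    # compat() re-doubles the braces in FSTRING_MIDDLE tokens, so the output
    # should be valid source roughly equivalent to `src` (whitespace may differ).
    print(tokenize.untokenize(pairs))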
@@ -404,36 +417,6 @@ def open(filename):
         buffer.close()
         raise

-def tokenize2(readline):
-    encoding, consumed = detect_encoding(readline)
-    rl_gen = _itertools.chain(consumed, iter(readline, b""))
-    if encoding is not None:
-        if encoding == "utf-8-sig":
-            # BOM will already have been stripped.
-            encoding = "utf-8"
-        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize2(rl_gen, encoding)
-
-def _tokenize2(rl_gen, encoding):
-    source = b"".join(rl_gen)
-    token = None
-    for token in _generate_tokens_from_c_tokenizer(source.decode(encoding), extra_tokens=True):
-        # TODO: Marta -> clean this up
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end + 1))
-
-        yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
-
 def tokenize(readline):
     """
     The tokenize() generator requires one argument, readline, which
@@ -454,194 +437,33 @@ def tokenize(readline):
     which tells you which encoding was used to decode the bytes stream.
     """
     encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                # they're in the single_quoted set. If so, they start
-                # a string.
-                # We're using the first 3, because we're looking for
-                # "rb'" (for example) at the start of the token. If
-                # we switch to longer prefixes, this needs to be
-                # adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                # triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        # token. This is looking for the matching end
-                        # regex for the correct type of quote
-                        # character. So it's really looking for
-                        # endpats["'"] or endpats['"'], by trying to
-                        # skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos + 1), line)
-                pos += 1
+    yield from _tokenize(rl_gen, encoding)
+
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> clean this up
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end + 1))

-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')

-tokenize = tokenize2

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
@@ -658,7 +480,7 @@ def _gen():
             if not line:
                 return
             yield line.encode()
-    return _tokenize2(_gen(), 'utf-8')
+    return _tokenize(_gen(), 'utf-8')

 def main():
     import argparse
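generate_tokens() keeps its str-based interface: the nested _gen() above encodes each line to UTF-8 so that the str and bytes entry points share the new _tokenize() generator. A quick sketch, illustrative only:

    import io
    import tokenize

    readline = io.StringIO("pi = 3.14  # comment\n").readline
    for tok in tokenize.generate_tokens(readline):
        print(tok)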