Skip to content

Commit f4b1a64

Browse files
committed
* Parser/tokenizer.c: back up over illegal newline in string
literal (for "completeness" test)
1 parent bd0389d commit f4b1a64

File tree

1 file changed

+119
-92
lines changed

1 file changed

+119
-92
lines changed

Parser/tokenizer.c

Lines changed: 119 additions & 92 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/***********************************************************
2-
Copyright 1991, 1992, 1993 by Stichting Mathematisch Centrum,
2+
Copyright 1991, 1992, 1993, 1994 by Stichting Mathematisch Centrum,
33
Amsterdam, The Netherlands.
44
55
All Rights Reserved
@@ -24,19 +24,18 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
2424

2525
/* Tokenizer implementation */
2626

27-
/* XXX This is rather old, should be restructured perhaps */
28-
/* XXX Need a better interface to report errors than writing to stderr */
29-
/* XXX Should use editor resource to fetch true tab size on Macintosh */
30-
3127
#include "pgenheaders.h"
3228

3329
#include <ctype.h>
34-
#include "string.h"
3530

36-
#include "fgetsintr.h"
3731
#include "tokenizer.h"
3832
#include "errcode.h"
3933

34+
extern char *my_readline PROTO((char *));
35+
/* Return malloc'ed string including trailing \n;
36+
empty malloc'ed string for EOF;
37+
NULL if interrupted */
38+
4039
/* Don't ever change this -- it would break the portability of Python code */
4140
#define TABSIZE 8
4241

@@ -99,7 +98,7 @@ tok_new()
9998
struct tok_state *tok = NEW(struct tok_state, 1);
10099
if (tok == NULL)
101100
return NULL;
102-
tok->buf = tok->cur = tok->end = tok->inp = NULL;
101+
tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
103102
tok->done = E_OK;
104103
tok->fp = NULL;
105104
tok->tabsize = TABSIZE;
@@ -158,7 +157,6 @@ void
158157
tok_free(tok)
159158
struct tok_state *tok;
160159
{
161-
/* XXX really need a separate flag to say 'my buffer' */
162160
if (tok->fp != NULL && tok->buf != NULL)
163161
DEL(tok->buf);
164162
DEL(tok);
@@ -180,58 +178,78 @@ tok_nextc(tok)
180178
tok->done = E_EOF;
181179
return EOF;
182180
}
183-
#ifdef USE_READLINE
184181
if (tok->prompt != NULL) {
185-
extern char *readline PROTO((char *prompt));
186-
static int been_here;
187-
if (!been_here) {
188-
/* Force rebind of TAB to insert-tab */
189-
extern int rl_insert();
190-
rl_bind_key('\t', rl_insert);
191-
been_here++;
192-
}
193-
if (tok->buf != NULL)
194-
free(tok->buf);
195-
tok->buf = readline(tok->prompt);
196-
(void) intrcheck(); /* Clear pending interrupt */
182+
char *new = my_readline(tok->prompt);
197183
if (tok->nextprompt != NULL)
198184
tok->prompt = tok->nextprompt;
199-
if (tok->buf == NULL) {
185+
if (new == NULL)
186+
tok->done = E_INTR;
187+
else if (*new == '\0') {
188+
free(new);
200189
tok->done = E_EOF;
201190
}
191+
else if (tok->start != NULL) {
192+
int start = tok->start - tok->buf;
193+
int oldlen = tok->cur - tok->buf;
194+
int newlen = oldlen + strlen(new);
195+
char *buf = realloc(tok->buf, newlen+1);
196+
tok->lineno++;
197+
if (buf == NULL) {
198+
free(tok->buf);
199+
free(new);
200+
tok->done = E_NOMEM;
201+
return EOF;
202+
}
203+
tok->buf = buf;
204+
tok->cur = tok->buf + oldlen;
205+
strcpy(tok->buf + oldlen, new);
206+
free(new);
207+
tok->inp = tok->buf + newlen;
208+
tok->end = tok->inp + 1;
209+
tok->start = tok->buf + start;
210+
}
202211
else {
203-
tok->end = strchr(tok->buf, '\0');
204-
if (tok->end > tok->buf)
205-
add_history(tok->buf);
206-
/* Replace trailing '\n' by '\0'
207-
(we don't need a '\0', but the
208-
tokenizer wants a '\n'...) */
209-
*tok->end++ = '\n';
210-
tok->inp = tok->end;
212+
tok->lineno++;
213+
if (tok->buf != NULL)
214+
free(tok->buf);
215+
tok->buf = new;
211216
tok->cur = tok->buf;
217+
tok->inp = strchr(tok->buf, '\0');
218+
tok->end = tok->inp + 1;
212219
}
213220
}
214-
else
215-
#endif
216-
{
217-
if (tok->prompt != NULL) {
218-
fprintf(stderr, "%s", tok->prompt);
219-
if (tok->nextprompt != NULL)
220-
tok->prompt = tok->nextprompt;
221-
}
222-
if (tok->buf == NULL) {
223-
tok->buf = NEW(char, BUFSIZ);
221+
else {
222+
int done = 0;
223+
int cur = 0;
224+
if (tok->start == NULL) {
224225
if (tok->buf == NULL) {
225-
tok->done = E_NOMEM;
226-
return EOF;
226+
tok->buf = NEW(char, BUFSIZ);
227+
if (tok->buf == NULL) {
228+
tok->done = E_NOMEM;
229+
return EOF;
230+
}
231+
tok->end = tok->buf + BUFSIZ;
232+
}
233+
if (fgets(tok->buf, (int)(tok->end - tok->buf),
234+
tok->fp) == NULL) {
235+
tok->done = E_EOF;
236+
done = 1;
237+
}
238+
else {
239+
tok->done = E_OK;
240+
tok->inp = strchr(tok->buf, '\0');
241+
done = tok->inp[-1] == '\n';
227242
}
228-
tok->end = tok->buf + BUFSIZ;
229243
}
230-
tok->done = fgets_intr(tok->buf,
231-
(int)(tok->end - tok->buf), tok->fp);
232-
tok->inp = strchr(tok->buf, '\0');
244+
else {
245+
cur = tok->cur - tok->buf;
246+
tok->done = E_OK;
247+
}
248+
tok->lineno++;
233249
/* Read until '\n' or EOF */
234-
while (tok->inp+1==tok->end && tok->inp[-1]!='\n') {
250+
while (!done) {
251+
int curstart = tok->start == NULL ? -1 :
252+
tok->start - tok->buf;
235253
int curvalid = tok->inp - tok->buf;
236254
int cursize = tok->end - tok->buf;
237255
int newsize = cursize + BUFSIZ;
@@ -245,13 +263,19 @@ tok_nextc(tok)
245263
tok->buf = newbuf;
246264
tok->inp = tok->buf + curvalid;
247265
tok->end = tok->buf + newsize;
248-
if (fgets_intr(tok->inp,
266+
tok->start = curstart < 0 ? NULL :
267+
tok->buf + curstart;
268+
if (fgets(tok->inp,
249269
(int)(tok->end - tok->inp),
250-
tok->fp) != E_OK)
251-
break;
270+
tok->fp) == NULL) {
271+
/* Last line does not end in \n,
272+
fake one */
273+
strcpy(tok->inp, "\n");
274+
}
252275
tok->inp = strchr(tok->inp, '\0');
276+
done = tok->inp[-1] == '\n';
253277
}
254-
tok->cur = tok->buf;
278+
tok->cur = tok->buf + cur;
255279
}
256280
if (tok->done != E_OK) {
257281
if (tok->prompt != NULL)
@@ -360,14 +384,15 @@ tok_get(tok, p_start, p_end)
360384
register int c;
361385
int blankline;
362386

387+
*p_start = *p_end = NULL;
363388
nextline:
389+
tok->start = NULL;
364390
blankline = 0;
365391

366392
/* Get indentation level */
367393
if (tok->atbol) {
368394
register int col = 0;
369395
tok->atbol = 0;
370-
tok->lineno++;
371396
for (;;) {
372397
c = tok_nextc(tok);
373398
if (c == ' ')
@@ -423,7 +448,7 @@ tok_get(tok, p_start, p_end)
423448
}
424449
}
425450

426-
*p_start = *p_end = tok->cur;
451+
tok->start = tok->cur;
427452

428453
/* Return pending indents/dedents */
429454
if (tok->pendin != 0) {
@@ -438,13 +463,14 @@ tok_get(tok, p_start, p_end)
438463
}
439464

440465
again:
466+
tok->start = NULL;
441467
/* Skip spaces */
442468
do {
443469
c = tok_nextc(tok);
444470
} while (c == ' ' || c == '\t');
445471

446472
/* Set start of current token */
447-
*p_start = tok->cur - 1;
473+
tok->start = tok->cur - 1;
448474

449475
/* Skip comment */
450476
if (c == '#') {
@@ -467,7 +493,6 @@ tok_get(tok, p_start, p_end)
467493

468494
/* Check for EOF and errors now */
469495
if (c == EOF) {
470-
*p_start = *p_end = tok->cur;
471496
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
472497
}
473498

@@ -477,6 +502,7 @@ tok_get(tok, p_start, p_end)
477502
c = tok_nextc(tok);
478503
} while (isalnum(c) || c == '_');
479504
tok_backup(tok, c);
505+
*p_start = tok->start;
480506
*p_end = tok->cur;
481507
return NAME;
482508
}
@@ -486,6 +512,7 @@ tok_get(tok, p_start, p_end)
486512
tok->atbol = 1;
487513
if (blankline || tok->level > 0)
488514
goto nextline;
515+
*p_start = tok->start;
489516
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
490517
return NEWLINE;
491518
}
@@ -498,6 +525,7 @@ tok_get(tok, p_start, p_end)
498525
}
499526
else {
500527
tok_backup(tok, c);
528+
*p_start = tok->start;
501529
*p_end = tok->cur;
502530
return DOT;
503531
}
@@ -538,9 +566,7 @@ tok_get(tok, p_start, p_end)
538566
else {
539567
/* Accept floating point numbers.
540568
XXX This accepts incomplete things like
541-
XXX 12e or 1e+; worry run-time.
542-
XXX Doesn't accept numbers
543-
XXX starting with a dot */
569+
XXX 12e or 1e+; worry run-time */
544570
if (c == '.') {
545571
fraction:
546572
/* Fraction */
@@ -560,58 +586,58 @@ tok_get(tok, p_start, p_end)
560586
}
561587
}
562588
tok_backup(tok, c);
589+
*p_start = tok->start;
563590
*p_end = tok->cur;
564591
return NUMBER;
565592
}
566593

567-
/* String (single quotes) */
568-
if (c == '\'') {
594+
/* String */
595+
if (c == '\'' || c == '"') {
596+
int quote = c;
597+
int triple = 0;
598+
int tripcount = 0;
569599
for (;;) {
570600
c = tok_nextc(tok);
571-
if (c == '\n' || c == EOF) {
572-
tok->done = E_TOKEN;
573-
tok->cur = tok->inp;
574-
return ERRORTOKEN;
575-
}
576-
if (c == '\\') {
577-
c = tok_nextc(tok);
578-
*p_end = tok->cur;
579-
if (c == '\n' || c == EOF) {
601+
if (c == '\n') {
602+
if (!triple) {
580603
tok->done = E_TOKEN;
581-
tok->cur = tok->inp;
604+
tok_backup(tok, c);
582605
return ERRORTOKEN;
583606
}
584-
continue;
607+
tripcount = 0;
585608
}
586-
if (c == '\'')
587-
break;
588-
}
589-
*p_end = tok->cur;
590-
return STRING;
591-
}
592-
593-
/* String (double quotes) */
594-
if (c == '\"') {
595-
for (;;) {
596-
c = tok_nextc(tok);
597-
if (c == '\n' || c == EOF) {
609+
else if (c == EOF) {
598610
tok->done = E_TOKEN;
599611
tok->cur = tok->inp;
600612
return ERRORTOKEN;
601613
}
602-
if (c == '\\') {
614+
else if (c == quote) {
615+
tripcount++;
616+
if (tok->cur == tok->start+2) {
617+
c = tok_nextc(tok);
618+
if (c == quote) {
619+
triple = 1;
620+
tripcount = 0;
621+
continue;
622+
}
623+
tok_backup(tok, c);
624+
}
625+
if (!triple || tripcount == 3)
626+
break;
627+
}
628+
else if (c == '\\') {
629+
tripcount = 0;
603630
c = tok_nextc(tok);
604-
*p_end = tok->cur;
605-
if (c == '\n' || c == EOF) {
631+
if (c == EOF) {
606632
tok->done = E_TOKEN;
607633
tok->cur = tok->inp;
608634
return ERRORTOKEN;
609635
}
610-
continue;
611636
}
612-
if (c == '\"')
613-
break;
637+
else
638+
tripcount = 0;
614639
}
640+
*p_start = tok->start;
615641
*p_end = tok->cur;
616642
return STRING;
617643
}
@@ -624,7 +650,6 @@ tok_get(tok, p_start, p_end)
624650
tok->cur = tok->inp;
625651
return ERRORTOKEN;
626652
}
627-
tok->lineno++;
628653
goto again; /* Read next line */
629654
}
630655

@@ -633,13 +658,14 @@ tok_get(tok, p_start, p_end)
633658
int c2 = tok_nextc(tok);
634659
int token = tok_2char(c, c2);
635660
if (token != OP) {
661+
*p_start = tok->start;
636662
*p_end = tok->cur;
637663
return token;
638664
}
639665
tok_backup(tok, c2);
640666
}
641667

642-
/* Keep track of parenteses nesting level */
668+
/* Keep track of parentheses nesting level */
643669
switch (c) {
644670
case '(':
645671
case '[':
@@ -654,6 +680,7 @@ tok_get(tok, p_start, p_end)
654680
}
655681

656682
/* Punctuation character */
683+
*p_start = tok->start;
657684
*p_end = tok->cur;
658685
return tok_1char(c);
659686
}

0 commit comments

Comments
 (0)