Skip to content

Commit 71780a0

Browse files
committed
detect invalid utf8 source
1 parent 144987f commit 71780a0

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

lexer.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
#include "lexer.h"
22

3+
#include <algorithm>
34
#include <iso646.h>
45
#include <sstream>
56

7+
#include <utf8.h>
8+
69
#include "util.h"
710

811
std::string Token::tostring() const
@@ -70,6 +73,17 @@ std::string Token::tostring() const
7073

7174
std::vector<Token> tokenize(const std::string &source)
7275
{
76+
auto inv = utf8::find_invalid(source.begin(), source.end());
77+
if (inv != source.end()) {
78+
int line = 1 + std::count(source.begin(), inv, '\n');
79+
auto bol = source.rfind('\n', inv-source.begin());
80+
if (bol == std::string::npos) {
81+
bol = -1;
82+
}
83+
int column = inv - (source.begin() + bol);
84+
fprintf(stderr, "%d:%d invalid utf8 data in source\n", line, column);
85+
exit(1);
86+
}
7387
int line = 1;
7488
int column = 1;
7589
std::vector<Token> tokens;

run_test.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
def run(fn):
88
print ("Running {}...".format(fn))
99

10-
src = codecs.open(fn, encoding="UTF-8").read().replace("\r\n", "\n")
10+
try:
11+
src = codecs.open(fn, encoding="UTF-8").read().replace("\r\n", "\n")
12+
except UnicodeDecodeError:
13+
# One test has invalid UTF-8 data, so fall back to reading it with the default encoding.
14+
src = open(fn).read().replace("\r\n", "\n")
1115

1216
all_comments = re.findall("^%(.*)$", src, re.MULTILINE)
1317
todo = any("TODO" in x for x in all_comments)

0 commit comments

Comments
 (0)