From e7908c0a6676db2dbf29c46ecf66d61a315e74cf Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Thu, 6 Feb 2014 23:56:52 +1100 Subject: [PATCH 1/4] extra::json: remove the use of `unsafe` char transmutes. Avoid using -1 as a char sentinel, when Option is the perfect thing. --- src/libextra/json.rs | 278 +++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/src/libextra/json.rs b/src/libextra/json.rs index 75bbd3d8a019d..3938f1a89942b 100644 --- a/src/libextra/json.rs +++ b/src/libextra/json.rs @@ -229,7 +229,6 @@ fn main() { */ use std::char; -use std::cast::transmute; use std::f64; use std::hashmap::HashMap; use std::io; @@ -718,7 +717,7 @@ impl Json { pub struct Parser { priv rdr: T, - priv ch: char, + priv ch: Option, priv line: uint, priv col: uint, } @@ -728,7 +727,7 @@ impl> Parser { pub fn new(rdr: T) -> Parser { let mut p = Parser { rdr: rdr, - ch: '\x00', + ch: Some('\x00'), line: 1, col: 0, }; @@ -756,16 +755,12 @@ impl> Parser { } impl> Parser { - // FIXME: #8971: unsound - fn eof(&self) -> bool { self.ch == unsafe { transmute(-1u32) } } - + fn eof(&self) -> bool { self.ch.is_none() } + fn ch_or_null(&self) -> char { self.ch.unwrap_or('\x00') } fn bump(&mut self) { - match self.rdr.next() { - Some(ch) => self.ch = ch, - None() => self.ch = unsafe { transmute(-1u32) }, // FIXME: #8971: unsound - } + self.ch = self.rdr.next(); - if self.ch == '\n' { + if self.ch_is('\n') { self.line += 1u; self.col = 1u; } else { @@ -773,10 +768,13 @@ impl> Parser { } } - fn next_char(&mut self) -> char { + fn next_char(&mut self) -> Option { self.bump(); self.ch } + fn ch_is(&self, c: char) -> bool { + self.ch == Some(c) + } fn error(&self, msg: ~str) -> Result { Err(Error { line: self.line, col: self.col, msg: msg }) @@ -787,31 +785,32 @@ impl> Parser { if self.eof() { return self.error(~"EOF while parsing value"); } - match self.ch { - 'n' => self.parse_ident("ull", Null), - 't' => self.parse_ident("rue", Boolean(true)), - 'f' => self.parse_ident("alse", Boolean(false)), - '0' .. '9' | '-' => self.parse_number(), - '"' => - match self.parse_str() { - Ok(s) => Ok(String(s)), - Err(e) => Err(e), + match self.ch_or_null() { + 'n' => self.parse_ident("ull", Null), + 't' => self.parse_ident("rue", Boolean(true)), + 'f' => self.parse_ident("alse", Boolean(false)), + '0' .. '9' | '-' => self.parse_number(), + '"' => { + match self.parse_str() { + Ok(s) => Ok(String(s)), + Err(e) => Err(e), + } }, - '[' => self.parse_list(), - '{' => self.parse_object(), - _ => self.error(~"invalid syntax") + '[' => self.parse_list(), + '{' => self.parse_object(), + _ => self.error(~"invalid syntax"), } } fn parse_whitespace(&mut self) { - while self.ch == ' ' || - self.ch == '\n' || - self.ch == '\t' || - self.ch == '\r' { self.bump(); } + while self.ch_is(' ') || + self.ch_is('\n') || + self.ch_is('\t') || + self.ch_is('\r') { self.bump(); } } fn parse_ident(&mut self, ident: &str, value: Json) -> Result { - if ident.chars().all(|c| c == self.next_char()) { + if ident.chars().all(|c| Some(c) == self.next_char()) { self.bump(); Ok(value) } else { @@ -822,7 +821,7 @@ impl> Parser { fn parse_number(&mut self) -> Result { let mut neg = 1.0; - if self.ch == '-' { + if self.ch_is('-') { self.bump(); neg = -1.0; } @@ -832,14 +831,14 @@ impl> Parser { Err(e) => return Err(e) }; - if self.ch == '.' { + if self.ch_is('.') { match self.parse_decimal(res) { Ok(r) => res = r, Err(e) => return Err(e) } } - if self.ch == 'e' || self.ch == 'E' { + if self.ch_is('e') || self.ch_is('E') { match self.parse_exponent(res) { Ok(r) => res = r, Err(e) => return Err(e) @@ -852,32 +851,31 @@ impl> Parser { fn parse_integer(&mut self) -> Result { let mut res = 0.0; - match self.ch { - '0' => { - self.bump(); - - // There can be only one leading '0'. - match self.ch { - '0' .. '9' => return self.error(~"invalid number"), - _ => () - } - } - '1' .. '9' => { - while !self.eof() { - match self.ch { - '0' .. '9' => { - res *= 10.0; - res += ((self.ch as int) - ('0' as int)) as f64; + match self.ch_or_null() { + '0' => { + self.bump(); - self.bump(); - } - _ => break + // There can be only one leading '0'. + match self.ch_or_null() { + '0' .. '9' => return self.error(~"invalid number"), + _ => () + } + }, + '1' .. '9' => { + while !self.eof() { + match self.ch_or_null() { + c @ '0' .. '9' => { + res *= 10.0; + res += ((c as int) - ('0' as int)) as f64; + + self.bump(); + } + _ => break, + } } } - } - _ => return self.error(~"invalid number") + _ => return self.error(~"invalid number"), } - Ok(res) } @@ -885,22 +883,22 @@ impl> Parser { self.bump(); // Make sure a digit follows the decimal place. - match self.ch { - '0' .. '9' => (), - _ => return self.error(~"invalid number") + match self.ch_or_null() { + '0' .. '9' => (), + _ => return self.error(~"invalid number") } let mut res = res; let mut dec = 1.0; while !self.eof() { - match self.ch { - '0' .. '9' => { - dec /= 10.0; - res += (((self.ch as int) - ('0' as int)) as f64) * dec; + match self.ch_or_null() { + c @ '0' .. '9' => { + dec /= 10.0; + res += (((c as int) - ('0' as int)) as f64) * dec; - self.bump(); - } - _ => break + self.bump(); + } + _ => break, } } @@ -913,27 +911,27 @@ impl> Parser { let mut exp = 0u; let mut neg_exp = false; - match self.ch { - '+' => self.bump(), - '-' => { self.bump(); neg_exp = true; } - _ => () + if self.ch_is('+') { + self.bump(); + } else if self.ch_is('-') { + self.bump(); + neg_exp = true; } // Make sure a digit follows the exponent place. - match self.ch { - '0' .. '9' => (), - _ => return self.error(~"invalid number") + match self.ch_or_null() { + '0' .. '9' => (), + _ => return self.error(~"invalid number") } - while !self.eof() { - match self.ch { - '0' .. '9' => { - exp *= 10u; - exp += (self.ch as uint) - ('0' as uint); + match self.ch_or_null() { + c @ '0' .. '9' => { + exp *= 10; + exp += (c as uint) - ('0' as uint); - self.bump(); - } - _ => break + self.bump(); + } + _ => break } } @@ -958,56 +956,55 @@ impl> Parser { } if escape { - match self.ch { - '"' => res.push_char('"'), - '\\' => res.push_char('\\'), - '/' => res.push_char('/'), - 'b' => res.push_char('\x08'), - 'f' => res.push_char('\x0c'), - 'n' => res.push_char('\n'), - 'r' => res.push_char('\r'), - 't' => res.push_char('\t'), - 'u' => { - // Parse \u1234. - let mut i = 0u; - let mut n = 0u; - while i < 4u { - match self.next_char() { - '0' .. '9' => { - n = n * 16u + (self.ch as uint) - - ('0' as uint); - }, - 'a' | 'A' => n = n * 16u + 10u, - 'b' | 'B' => n = n * 16u + 11u, - 'c' | 'C' => n = n * 16u + 12u, - 'd' | 'D' => n = n * 16u + 13u, - 'e' | 'E' => n = n * 16u + 14u, - 'f' | 'F' => n = n * 16u + 15u, - _ => return self.error( - ~"invalid \\u escape (unrecognized hex)") - } - i += 1u; - } - - // Error out if we didn't parse 4 digits. - if i != 4u { - return self.error( - ~"invalid \\u escape (not four digits)"); - } - - res.push_char(char::from_u32(n as u32).unwrap()); - } - _ => return self.error(~"invalid escape") + match self.ch_or_null() { + '"' => res.push_char('"'), + '\\' => res.push_char('\\'), + '/' => res.push_char('/'), + 'b' => res.push_char('\x08'), + 'f' => res.push_char('\x0c'), + 'n' => res.push_char('\n'), + 'r' => res.push_char('\r'), + 't' => res.push_char('\t'), + 'u' => { + // Parse \u1234. + let mut i = 0u; + let mut n = 0u; + while i < 4u && !self.eof() { + self.bump(); + n = match self.ch_or_null() { + c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint), + 'a' | 'A' => n * 16u + 10u, + 'b' | 'B' => n * 16u + 11u, + 'c' | 'C' => n * 16u + 12u, + 'd' | 'D' => n * 16u + 13u, + 'e' | 'E' => n * 16u + 14u, + 'f' | 'F' => n * 16u + 15u, + _ => return self.error( + ~"invalid \\u escape (unrecognized hex)") + }; + + i += 1u; + } + + // Error out if we didn't parse 4 digits. + if i != 4u { + return self.error( + ~"invalid \\u escape (not four digits)"); + } + + res.push_char(char::from_u32(n as u32).unwrap()); + } + _ => return self.error(~"invalid escape"), } escape = false; - } else if self.ch == '\\' { + } else if self.ch_is('\\') { escape = true; } else { - if self.ch == '"' { - self.bump(); - return Ok(res); + match self.ch { + Some('"') => { self.bump(); return Ok(res); }, + Some(c) => res.push_char(c), + None => unreachable!() } - res.push_char(self.ch); } } } @@ -1018,7 +1015,7 @@ impl> Parser { let mut values = ~[]; - if self.ch == ']' { + if self.ch_is(']') { self.bump(); return Ok(List(values)); } @@ -1034,10 +1031,13 @@ impl> Parser { return self.error(~"EOF while parsing list"); } - match self.ch { - ',' => self.bump(), - ']' => { self.bump(); return Ok(List(values)); } - _ => return self.error(~"expected `,` or `]`") + if self.ch_is(',') { + self.bump(); + } else if self.ch_is(']') { + self.bump(); + return Ok(List(values)); + } else { + return self.error(~"expected `,` or `]`") } }; } @@ -1048,7 +1048,7 @@ impl> Parser { let mut values = ~TreeMap::new(); - if self.ch == '}' { + if self.ch_is('}') { self.bump(); return Ok(Object(values)); } @@ -1056,7 +1056,7 @@ impl> Parser { while !self.eof() { self.parse_whitespace(); - if self.ch != '"' { + if !self.ch_is('"') { return self.error(~"key must be a string"); } @@ -1067,7 +1067,7 @@ impl> Parser { self.parse_whitespace(); - if self.ch != ':' { + if !self.ch_is(':') { if self.eof() { break; } return self.error(~"expected `:`"); } @@ -1079,13 +1079,13 @@ impl> Parser { } self.parse_whitespace(); - match self.ch { - ',' => self.bump(), - '}' => { self.bump(); return Ok(Object(values)); } - _ => { - if self.eof() { break; } - return self.error(~"expected `,` or `}`"); - } + match self.ch_or_null() { + ',' => self.bump(), + '}' => { self.bump(); return Ok(Object(values)); }, + _ => { + if self.eof() { break; } + return self.error(~"expected `,` or `}`"); + } } } From 1dd18801216b26595a7b374e5dc8ee32eb577954 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Fri, 7 Feb 2014 01:36:59 +1100 Subject: [PATCH 2/4] syntax: convert the lexer to use Option over transmute(-1). The transmute was unsound. There are many instances of .unwrap_or('\x00') for "ignoring" EOF which either do not make the situation worse than it was (well, actually make it better, since it's easy to grep for places that don't handle EOF) or can never ever be read. Fixes #8971. --- src/libsyntax/parse/comments.rs | 44 +++--- src/libsyntax/parse/lexer.rs | 229 +++++++++++++++++++------------- 2 files changed, 157 insertions(+), 116 deletions(-) diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/comments.rs index 77b047a63c386..138f9d7164067 100644 --- a/src/libsyntax/parse/comments.rs +++ b/src/libsyntax/parse/comments.rs @@ -12,7 +12,7 @@ use ast; use codemap::{BytePos, CharPos, CodeMap, Pos}; use diagnostic; use parse::lexer::{is_whitespace, with_str_from, Reader}; -use parse::lexer::{StringReader, bump, is_eof, nextch, TokenAndSpan}; +use parse::lexer::{StringReader, bump, is_eof, nextch_is, TokenAndSpan}; use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment}; use parse::lexer; use parse::token; @@ -136,11 +136,11 @@ pub fn strip_doc_comment_decoration(comment: &str) -> ~str { fn read_to_eol(rdr: &StringReader) -> ~str { let mut val = ~""; - while rdr.curr.get() != '\n' && !is_eof(rdr) { - val.push_char(rdr.curr.get()); + while !rdr.curr_is('\n') && !is_eof(rdr) { + val.push_char(rdr.curr.get().unwrap()); bump(rdr); } - if rdr.curr.get() == '\n' { bump(rdr); } + if rdr.curr_is('\n') { bump(rdr); } return val; } @@ -152,7 +152,7 @@ fn read_one_line_comment(rdr: &StringReader) -> ~str { } fn consume_non_eol_whitespace(rdr: &StringReader) { - while is_whitespace(rdr.curr.get()) && rdr.curr.get() != '\n' && + while is_whitespace(rdr.curr.get()) && !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } @@ -171,7 +171,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut ~[Comment]) { fn consume_whitespace_counting_blank_lines(rdr: &StringReader, comments: &mut ~[Comment]) { while is_whitespace(rdr.curr.get()) && !is_eof(rdr) { - if rdr.col.get() == CharPos(0u) && rdr.curr.get() == '\n' { + if rdr.col.get() == CharPos(0u) && rdr.curr_is('\n') { push_blank_line_comment(rdr, &mut *comments); } bump(rdr); @@ -196,7 +196,7 @@ fn read_line_comments(rdr: &StringReader, code_to_the_left: bool, debug!(">>> line comments"); let p = rdr.last_pos.get(); let mut lines: ~[~str] = ~[]; - while rdr.curr.get() == '/' && nextch(rdr) == '/' { + while rdr.curr_is('/') && nextch_is(rdr, '/') { let line = read_one_line_comment(rdr); debug!("{}", line); if is_doc_comment(line) { // doc-comments are not put in comments @@ -261,9 +261,9 @@ fn read_block_comment(rdr: &StringReader, let mut curr_line = ~"/*"; // doc-comments are not really comments, they are attributes - if rdr.curr.get() == '*' || rdr.curr.get() == '!' { - while !(rdr.curr.get() == '*' && nextch(rdr) == '/') && !is_eof(rdr) { - curr_line.push_char(rdr.curr.get()); + if rdr.curr_is('*') || rdr.curr_is('!') { + while !(rdr.curr_is('*') && nextch_is(rdr, '/')) && !is_eof(rdr) { + curr_line.push_char(rdr.curr.get().unwrap()); bump(rdr); } if !is_eof(rdr) { @@ -281,20 +281,20 @@ fn read_block_comment(rdr: &StringReader, if is_eof(rdr) { rdr.fatal(~"unterminated block comment"); } - if rdr.curr.get() == '\n' { + if rdr.curr_is('\n') { trim_whitespace_prefix_and_push_line(&mut lines, curr_line, col); curr_line = ~""; bump(rdr); } else { - curr_line.push_char(rdr.curr.get()); - if rdr.curr.get() == '/' && nextch(rdr) == '*' { + curr_line.push_char(rdr.curr.get().unwrap()); + if rdr.curr_is('/') && nextch_is(rdr, '*') { bump(rdr); bump(rdr); curr_line.push_char('*'); level += 1; } else { - if rdr.curr.get() == '*' && nextch(rdr) == '/' { + if rdr.curr_is('*') && nextch_is(rdr, '/') { bump(rdr); bump(rdr); curr_line.push_char('/'); @@ -310,7 +310,7 @@ fn read_block_comment(rdr: &StringReader, let mut style = if code_to_the_left { Trailing } else { Isolated }; consume_non_eol_whitespace(rdr); - if !is_eof(rdr) && rdr.curr.get() != '\n' && lines.len() == 1u { + if !is_eof(rdr) && !rdr.curr_is('\n') && lines.len() == 1u { style = Mixed; } debug!("<<< block comment"); @@ -318,20 +318,20 @@ fn read_block_comment(rdr: &StringReader, } fn peeking_at_comment(rdr: &StringReader) -> bool { - return ((rdr.curr.get() == '/' && nextch(rdr) == '/') || - (rdr.curr.get() == '/' && nextch(rdr) == '*')) || - (rdr.curr.get() == '#' && nextch(rdr) == '!'); + return (rdr.curr_is('/') && nextch_is(rdr, '/')) || + (rdr.curr_is('/') && nextch_is(rdr, '*')) || + (rdr.curr_is('#') && nextch_is(rdr, '!')); } fn consume_comment(rdr: &StringReader, code_to_the_left: bool, comments: &mut ~[Comment]) { debug!(">>> consume comment"); - if rdr.curr.get() == '/' && nextch(rdr) == '/' { + if rdr.curr_is('/') && nextch_is(rdr, '/') { read_line_comments(rdr, code_to_the_left, comments); - } else if rdr.curr.get() == '/' && nextch(rdr) == '*' { + } else if rdr.curr_is('/') && nextch_is(rdr, '*') { read_block_comment(rdr, code_to_the_left, comments); - } else if rdr.curr.get() == '#' && nextch(rdr) == '!' { + } else if rdr.curr_is('#') && nextch_is(rdr, '!') { read_shebang_comment(rdr, code_to_the_left, comments); } else { fail!(); } debug!("<<< consume comment"); @@ -363,7 +363,7 @@ pub fn gather_comments_and_literals(span_diagnostic: loop { let mut code_to_the_left = !first_read; consume_non_eol_whitespace(&rdr); - if rdr.curr.get() == '\n' { + if rdr.curr_is('\n') { code_to_the_left = false; consume_whitespace_counting_blank_lines(&rdr, &mut comments); } diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 1bcff324e082f..f6ebfbfcc3b2e 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -16,7 +16,6 @@ use ext::tt::transcribe::{dup_tt_reader, tt_next_token}; use parse::token; use parse::token::{str_to_ident}; -use std::cast::transmute; use std::cell::{Cell, RefCell}; use std::char; use std::num::from_str_radix; @@ -48,13 +47,19 @@ pub struct StringReader { // The column of the next character to read col: Cell, // The last character to be read - curr: Cell, + curr: Cell>, filemap: @codemap::FileMap, /* cached: */ peek_tok: RefCell, peek_span: RefCell, } +impl StringReader { + pub fn curr_is(&self, c: char) -> bool { + self.curr.get() == Some(c) + } +} + pub fn new_string_reader(span_diagnostic: @SpanHandler, filemap: @codemap::FileMap) -> StringReader { @@ -74,7 +79,7 @@ pub fn new_low_level_string_reader(span_diagnostic: @SpanHandler, pos: Cell::new(filemap.start_pos), last_pos: Cell::new(filemap.start_pos), col: Cell::new(CharPos(0)), - curr: Cell::new(initial_char), + curr: Cell::new(Some(initial_char)), filemap: filemap, /* dummy values; not read */ peek_tok: RefCell::new(token::EOF), @@ -246,14 +251,12 @@ pub fn bump(rdr: &StringReader) { rdr.last_pos.set(rdr.pos.get()); let current_byte_offset = byte_offset(rdr, rdr.pos.get()).to_uint(); if current_byte_offset < (rdr.filemap.src).len() { - assert!(rdr.curr.get() != unsafe { - transmute(-1u32) - }); // FIXME: #8971: unsound - let last_char = rdr.curr.get(); + assert!(rdr.curr.get().is_some()); + let last_char = rdr.curr.get().unwrap(); let next = rdr.filemap.src.char_range_at(current_byte_offset); let byte_offset_diff = next.next - current_byte_offset; rdr.pos.set(rdr.pos.get() + Pos::from_uint(byte_offset_diff)); - rdr.curr.set(next.ch); + rdr.curr.set(Some(next.ch)); rdr.col.set(rdr.col.get() + CharPos(1u)); if last_char == '\n' { rdr.filemap.next_line(rdr.last_pos.get()); @@ -265,37 +268,50 @@ pub fn bump(rdr: &StringReader) { Pos::from_uint(current_byte_offset), byte_offset_diff); } } else { - rdr.curr.set(unsafe { transmute(-1u32) }); // FIXME: #8971: unsound + rdr.curr.set(None); } } pub fn is_eof(rdr: &StringReader) -> bool { - rdr.curr.get() == unsafe { transmute(-1u32) } // FIXME: #8971: unsound + rdr.curr.get().is_none() } -pub fn nextch(rdr: &StringReader) -> char { +pub fn nextch(rdr: &StringReader) -> Option { let offset = byte_offset(rdr, rdr.pos.get()).to_uint(); if offset < (rdr.filemap.src).len() { - return rdr.filemap.src.char_at(offset); - } else { return unsafe { transmute(-1u32) }; } // FIXME: #8971: unsound + Some(rdr.filemap.src.char_at(offset)) + } else { + None + } +} +pub fn nextch_is(rdr: &StringReader, c: char) -> bool { + nextch(rdr) == Some(c) } -fn hex_digit_val(c: char) -> int { - if in_range(c, '0', '9') { return (c as int) - ('0' as int); } - if in_range(c, 'a', 'f') { return (c as int) - ('a' as int) + 10; } - if in_range(c, 'A', 'F') { return (c as int) - ('A' as int) + 10; } +fn hex_digit_val(c: Option) -> int { + let d = c.unwrap_or('\x00'); + + if in_range(c, '0', '9') { return (d as int) - ('0' as int); } + if in_range(c, 'a', 'f') { return (d as int) - ('a' as int) + 10; } + if in_range(c, 'A', 'F') { return (d as int) - ('A' as int) + 10; } fail!(); } -pub fn is_whitespace(c: char) -> bool { - return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +pub fn is_whitespace(c: Option) -> bool { + match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace + ' ' | '\n' | '\t' | '\r' => true, + _ => false + } } -fn in_range(c: char, lo: char, hi: char) -> bool { - return lo <= c && c <= hi +fn in_range(c: Option, lo: char, hi: char) -> bool { + match c { + Some(c) => lo <= c && c <= hi, + _ => false + } } -fn is_dec_digit(c: char) -> bool { return in_range(c, '0', '9'); } +fn is_dec_digit(c: Option) -> bool { return in_range(c, '0', '9'); } -fn is_hex_digit(c: char) -> bool { +fn is_hex_digit(c: Option) -> bool { return in_range(c, '0', '9') || in_range(c, 'a', 'f') || in_range(c, 'A', 'F'); } @@ -317,15 +333,15 @@ pub fn is_line_non_doc_comment(s: &str) -> bool { // returns a Some(sugared-doc-attr) if one exists, None otherwise fn consume_any_line_comment(rdr: &StringReader) -> Option { - if rdr.curr.get() == '/' { + if rdr.curr_is('/') { match nextch(rdr) { - '/' => { + Some('/') => { bump(rdr); bump(rdr); // line comments starting with "///" or "//!" are doc-comments - if rdr.curr.get() == '/' || rdr.curr.get() == '!' { + if rdr.curr_is('/') || rdr.curr_is('!') { let start_bpos = rdr.pos.get() - BytePos(3); - while rdr.curr.get() != '\n' && !is_eof(rdr) { + while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } let ret = with_str_from(rdr, start_bpos, |string| { @@ -344,16 +360,16 @@ fn consume_any_line_comment(rdr: &StringReader) return ret; } } else { - while rdr.curr.get() != '\n' && !is_eof(rdr) { bump(rdr); } + while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } } // Restart whitespace munch. return consume_whitespace_and_comments(rdr); } - '*' => { bump(rdr); bump(rdr); return consume_block_comment(rdr); } + Some('*') => { bump(rdr); bump(rdr); return consume_block_comment(rdr); } _ => () } - } else if rdr.curr.get() == '#' { - if nextch(rdr) == '!' { + } else if rdr.curr_is('#') { + if nextch_is(rdr, '!') { // I guess this is the only way to figure out if // we're at the beginning of the file... let cmap = @CodeMap::new(); @@ -363,7 +379,7 @@ fn consume_any_line_comment(rdr: &StringReader) } let loc = cmap.lookup_char_pos_adj(rdr.last_pos.get()); if loc.line == 1u && loc.col == CharPos(0u) { - while rdr.curr.get() != '\n' && !is_eof(rdr) { bump(rdr); } + while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } return consume_whitespace_and_comments(rdr); } } @@ -378,7 +394,7 @@ pub fn is_block_non_doc_comment(s: &str) -> bool { // might return a sugared-doc-attr fn consume_block_comment(rdr: &StringReader) -> Option { // block comments starting with "/**" or "/*!" are doc-comments - let is_doc_comment = rdr.curr.get() == '*' || rdr.curr.get() == '!'; + let is_doc_comment = rdr.curr_is('*') || rdr.curr_is('!'); let start_bpos = rdr.pos.get() - BytePos(if is_doc_comment {3} else {2}); let mut level: int = 1; @@ -390,11 +406,11 @@ fn consume_block_comment(rdr: &StringReader) -> Option { ~"unterminated block comment" }; fatal_span(rdr, start_bpos, rdr.last_pos.get(), msg); - } else if rdr.curr.get() == '/' && nextch(rdr) == '*' { + } else if rdr.curr_is('/') && nextch_is(rdr, '*') { level += 1; bump(rdr); bump(rdr); - } else if rdr.curr.get() == '*' && nextch(rdr) == '/' { + } else if rdr.curr_is('*') && nextch_is(rdr, '/') { level -= 1; bump(rdr); bump(rdr); @@ -424,12 +440,13 @@ fn consume_block_comment(rdr: &StringReader) -> Option { } fn scan_exponent(rdr: &StringReader, start_bpos: BytePos) -> Option<~str> { - let mut c = rdr.curr.get(); + // \x00 hits the `return None` case immediately, so this is fine. + let mut c = rdr.curr.get().unwrap_or('\x00'); let mut rslt = ~""; if c == 'e' || c == 'E' { rslt.push_char(c); bump(rdr); - c = rdr.curr.get(); + c = rdr.curr.get().unwrap_or('\x00'); if c == '-' || c == '+' { rslt.push_char(c); bump(rdr); @@ -448,10 +465,10 @@ fn scan_digits(rdr: &StringReader, radix: uint) -> ~str { let mut rslt = ~""; loop { let c = rdr.curr.get(); - if c == '_' { bump(rdr); continue; } - match char::to_digit(c, radix) { + if c == Some('_') { bump(rdr); continue; } + match c.and_then(|cc| char::to_digit(cc, radix)) { Some(_) => { - rslt.push_char(c); + rslt.push_char(c.unwrap()); bump(rdr); } _ => return rslt @@ -476,7 +493,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token { let mut num_str; let mut base = 10u; let mut c = c; - let mut n = nextch(rdr); + let mut n = nextch(rdr).unwrap_or('\x00'); let start_bpos = rdr.last_pos.get(); if c == '0' && n == 'x' { bump(rdr); @@ -492,7 +509,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token { base = 2u; } num_str = scan_digits(rdr, base); - c = rdr.curr.get(); + c = rdr.curr.get().unwrap_or('\x00'); nextch(rdr); if c == 'u' || c == 'i' { enum Result { Signed(ast::IntTy), Unsigned(ast::UintTy) } @@ -502,13 +519,13 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token { else { Unsigned(ast::TyU) } }; bump(rdr); - c = rdr.curr.get(); + c = rdr.curr.get().unwrap_or('\x00'); if c == '8' { bump(rdr); tp = if signed { Signed(ast::TyI8) } else { Unsigned(ast::TyU8) }; } - n = nextch(rdr); + n = nextch(rdr).unwrap_or('\x00'); if c == '1' && n == '6' { bump(rdr); bump(rdr); @@ -541,8 +558,7 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token { } } let mut is_float = false; - if rdr.curr.get() == '.' && !(ident_start(nextch(rdr)) || nextch(rdr) == - '.') { + if rdr.curr_is('.') && !(ident_start(nextch(rdr)) || nextch_is(rdr, '.')) { is_float = true; bump(rdr); let dec_part = scan_digits(rdr, 10u); @@ -557,10 +573,10 @@ fn scan_number(c: char, rdr: &StringReader) -> token::Token { None => () } - if rdr.curr.get() == 'f' { + if rdr.curr_is('f') { bump(rdr); - c = rdr.curr.get(); - n = nextch(rdr); + c = rdr.curr.get().unwrap_or('\x00'); + n = nextch(rdr).unwrap_or('\x00'); if c == '3' && n == '2' { bump(rdr); bump(rdr); @@ -602,18 +618,23 @@ fn scan_numeric_escape(rdr: &StringReader, n_hex_digits: uint) -> char { let mut accum_int = 0; let mut i = n_hex_digits; let start_bpos = rdr.last_pos.get(); - while i != 0u { + while i != 0u && !is_eof(rdr) { let n = rdr.curr.get(); if !is_hex_digit(n) { fatal_span_char(rdr, rdr.last_pos.get(), rdr.pos.get(), ~"illegal character in numeric character escape", - n); + n.unwrap()); } bump(rdr); accum_int *= 16; accum_int += hex_digit_val(n); i -= 1u; } + if i != 0 && is_eof(rdr) { + fatal_span(rdr, start_bpos, rdr.last_pos.get(), + ~"unterminated numeric character escape"); + } + match char::from_u32(accum_int as u32) { Some(x) => x, None => fatal_span(rdr, start_bpos, rdr.last_pos.get(), @@ -621,14 +642,18 @@ fn scan_numeric_escape(rdr: &StringReader, n_hex_digits: uint) -> char { } } -fn ident_start(c: char) -> bool { +fn ident_start(c: Option) -> bool { + let c = match c { Some(c) => c, None => return false }; + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && char::is_XID_start(c)) } -fn ident_continue(c: char) -> bool { +fn ident_continue(c: Option) -> bool { + let c = match c { Some(c) => c, None => return false }; + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') @@ -641,7 +666,7 @@ fn ident_continue(c: char) -> bool { // EFFECT: updates the interner fn next_token_inner(rdr: &StringReader) -> token::Token { let c = rdr.curr.get(); - if ident_start(c) && nextch(rdr) != '"' && nextch(rdr) != '#' { + if ident_start(c) && !nextch_is(rdr, '"') && !nextch_is(rdr, '#') { // Note: r as in r" or r#" is part of a raw string literal, // not an identifier, and is handled further down. @@ -654,7 +679,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { if string == "_" { token::UNDERSCORE } else { - let is_mod_name = rdr.curr.get() == ':' && nextch(rdr) == ':'; + let is_mod_name = rdr.curr_is(':') && nextch_is(rdr, ':'); // FIXME: perform NFKC normalization here. (Issue #2253) token::IDENT(str_to_ident(string), is_mod_name) @@ -662,16 +687,16 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { }) } if is_dec_digit(c) { - return scan_number(c, rdr); + return scan_number(c.unwrap(), rdr); } fn binop(rdr: &StringReader, op: token::BinOp) -> token::Token { bump(rdr); - if rdr.curr.get() == '=' { + if rdr.curr_is('=') { bump(rdr); return token::BINOPEQ(op); } else { return token::BINOP(op); } } - match c { + match c.expect("next_token_inner called at EOF") { @@ -682,9 +707,9 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { ',' => { bump(rdr); return token::COMMA; } '.' => { bump(rdr); - return if rdr.curr.get() == '.' { + return if rdr.curr_is('.') { bump(rdr); - if rdr.curr.get() == '.' { + if rdr.curr_is('.') { bump(rdr); token::DOTDOTDOT } else { @@ -705,7 +730,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { '~' => { bump(rdr); return token::TILDE; } ':' => { bump(rdr); - if rdr.curr.get() == ':' { + if rdr.curr_is(':') { bump(rdr); return token::MOD_SEP; } else { return token::COLON; } @@ -720,10 +745,10 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { // Multi-byte tokens. '=' => { bump(rdr); - if rdr.curr.get() == '=' { + if rdr.curr_is('=') { bump(rdr); return token::EQEQ; - } else if rdr.curr.get() == '>' { + } else if rdr.curr_is('>') { bump(rdr); return token::FAT_ARROW; } else { @@ -732,19 +757,19 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { } '!' => { bump(rdr); - if rdr.curr.get() == '=' { + if rdr.curr_is('=') { bump(rdr); return token::NE; } else { return token::NOT; } } '<' => { bump(rdr); - match rdr.curr.get() { + match rdr.curr.get().unwrap_or('\x00') { '=' => { bump(rdr); return token::LE; } '<' => { return binop(rdr, token::SHL); } '-' => { bump(rdr); - match rdr.curr.get() { + match rdr.curr.get().unwrap_or('\x00') { '>' => { bump(rdr); return token::DARROW; } _ => { return token::LARROW; } } @@ -754,7 +779,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { } '>' => { bump(rdr); - match rdr.curr.get() { + match rdr.curr.get().unwrap_or('\x00') { '=' => { bump(rdr); return token::GE; } '>' => { return binop(rdr, token::SHR); } _ => { return token::GT; } @@ -764,12 +789,14 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { // Either a character constant 'a' OR a lifetime name 'abc bump(rdr); let start = rdr.last_pos.get(); - let mut c2 = rdr.curr.get(); + + // the eof will be picked up by the final `'` check below + let mut c2 = rdr.curr.get().unwrap_or('\x00'); bump(rdr); // If the character is an ident start not followed by another single // quote, then this is a lifetime name: - if ident_start(c2) && rdr.curr.get() != '\'' { + if ident_start(Some(c2)) && !rdr.curr_is('\'') { while ident_continue(rdr.curr.get()) { bump(rdr); } @@ -798,19 +825,24 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { let escaped_pos = rdr.last_pos.get(); bump(rdr); match escaped { - 'n' => { c2 = '\n'; } - 'r' => { c2 = '\r'; } - 't' => { c2 = '\t'; } - '\\' => { c2 = '\\'; } - '\'' => { c2 = '\''; } - '"' => { c2 = '"'; } - '0' => { c2 = '\x00'; } - 'x' => { c2 = scan_numeric_escape(rdr, 2u); } - 'u' => { c2 = scan_numeric_escape(rdr, 4u); } - 'U' => { c2 = scan_numeric_escape(rdr, 8u); } - c2 => { - fatal_span_char(rdr, escaped_pos, rdr.last_pos.get(), - ~"unknown character escape", c2); + None => {} + Some(e) => { + c2 = match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => scan_numeric_escape(rdr, 2u), + 'u' => scan_numeric_escape(rdr, 4u), + 'U' => scan_numeric_escape(rdr, 8u), + c2 => { + fatal_span_char(rdr, escaped_pos, rdr.last_pos.get(), + ~"unknown character escape", c2) + } + } } } } @@ -820,7 +852,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { } _ => {} } - if rdr.curr.get() != '\'' { + if !rdr.curr_is('\'') { fatal_span_verbose(rdr, // Byte offsetting here is okay because the // character before position `start` is an @@ -836,17 +868,22 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { let mut accum_str = ~""; let start_bpos = rdr.last_pos.get(); bump(rdr); - while rdr.curr.get() != '"' { + while !rdr.curr_is('"') { if is_eof(rdr) { fatal_span(rdr, start_bpos, rdr.last_pos.get(), ~"unterminated double quote string"); } - let ch = rdr.curr.get(); + let ch = rdr.curr.get().unwrap(); bump(rdr); match ch { '\\' => { - let escaped = rdr.curr.get(); + if is_eof(rdr) { + fatal_span(rdr, start_bpos, rdr.last_pos.get(), + ~"unterminated double quote string"); + } + + let escaped = rdr.curr.get().unwrap(); let escaped_pos = rdr.last_pos.get(); bump(rdr); match escaped { @@ -883,15 +920,19 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { let start_bpos = rdr.last_pos.get(); bump(rdr); let mut hash_count = 0u; - while rdr.curr.get() == '#' { + while rdr.curr_is('#') { bump(rdr); hash_count += 1; } - if rdr.curr.get() != '"' { + + if is_eof(rdr) { + fatal_span(rdr, start_bpos, rdr.last_pos.get(), + ~"unterminated raw string"); + } else if !rdr.curr_is('"') { fatal_span_char(rdr, start_bpos, rdr.last_pos.get(), ~"only `#` is allowed in raw string delimitation; \ found illegal character", - rdr.curr.get()); + rdr.curr.get().unwrap()); } bump(rdr); let content_start_bpos = rdr.last_pos.get(); @@ -901,11 +942,11 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { fatal_span(rdr, start_bpos, rdr.last_pos.get(), ~"unterminated raw string"); } - if rdr.curr.get() == '"' { + if rdr.curr_is('"') { content_end_bpos = rdr.last_pos.get(); for _ in range(0, hash_count) { bump(rdr); - if rdr.curr.get() != '#' { + if !rdr.curr_is('#') { continue 'outer; } } @@ -921,14 +962,14 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { return token::LIT_STR_RAW(str_content, hash_count); } '-' => { - if nextch(rdr) == '>' { + if nextch_is(rdr, '>') { bump(rdr); bump(rdr); return token::RARROW; } else { return binop(rdr, token::MINUS); } } '&' => { - if nextch(rdr) == '&' { + if nextch_is(rdr, '&') { bump(rdr); bump(rdr); return token::ANDAND; @@ -936,7 +977,7 @@ fn next_token_inner(rdr: &StringReader) -> token::Token { } '|' => { match nextch(rdr) { - '|' => { bump(rdr); bump(rdr); return token::OROR; } + Some('|') => { bump(rdr); bump(rdr); return token::OROR; } _ => { return binop(rdr, token::OR); } } } From 285c25f7f4fd5e824da6b6d670a141535f948750 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Fri, 7 Feb 2014 01:39:14 +1100 Subject: [PATCH 3/4] rustc: put range asserts on `char` loads. A `char` is a Unicode codepoint, and so ranges from 0--0x10FFFF (with the surrogate gaps): we may as well inform LLVM of this. --- src/librustc/middle/trans/datum.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/librustc/middle/trans/datum.rs b/src/librustc/middle/trans/datum.rs index 2a2421077a8c6..35fe1d781c132 100644 --- a/src/librustc/middle/trans/datum.rs +++ b/src/librustc/middle/trans/datum.rs @@ -546,6 +546,10 @@ fn load<'a>(bcx: &'a Block<'a>, llptr: ValueRef, ty: ty::t) -> ValueRef { C_undef(type_of::type_of(bcx.ccx(), ty)) } else if ty::type_is_bool(ty) { LoadRangeAssert(bcx, llptr, 0, 2, lib::llvm::True) + } else if ty::type_is_char(ty) { + // a char is a unicode codepoint, and so takes values from 0 + // to 0x10FFFF inclusive only. + LoadRangeAssert(bcx, llptr, 0, 0x10FFFF + 1, lib::llvm::False) } else { Load(bcx, llptr) } From 5e2de79b3054a815a45b9f4c641d7f1356f2291b Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Sat, 8 Feb 2014 12:12:47 +1100 Subject: [PATCH 4/4] rustc: load bools as unsigned numbers. Apparently loading them signed will break if/when they become i1. --- src/librustc/middle/trans/datum.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librustc/middle/trans/datum.rs b/src/librustc/middle/trans/datum.rs index 35fe1d781c132..e7ebc2ef526c4 100644 --- a/src/librustc/middle/trans/datum.rs +++ b/src/librustc/middle/trans/datum.rs @@ -545,7 +545,7 @@ fn load<'a>(bcx: &'a Block<'a>, llptr: ValueRef, ty: ty::t) -> ValueRef { if type_is_zero_size(bcx.ccx(), ty) { C_undef(type_of::type_of(bcx.ccx(), ty)) } else if ty::type_is_bool(ty) { - LoadRangeAssert(bcx, llptr, 0, 2, lib::llvm::True) + LoadRangeAssert(bcx, llptr, 0, 2, lib::llvm::False) } else if ty::type_is_char(ty) { // a char is a unicode codepoint, and so takes values from 0 // to 0x10FFFF inclusive only.