From 606f50c46dd9a3852d36456d2015e1ccf832642e Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Tue, 31 Mar 2015 00:27:13 +1100 Subject: [PATCH] Lex binary and octal literals more eagerly. Previously 0b12 was considered two tokens, 0b1 and 2, as 2 isn't a valid base 2 digit. This patch changes that to collapse them into one (and makes `0b12` etc. an error: 2 isn't a valid base 2 digit). This may break some macro invocations of macros with `tt` (or syntax extensions) that rely on adjacent digits being separate tokens and hence is a [breaking-change] The fix is to separate the tokens, e.g. `0b12` -> `0b1 2`. cc https://github.com/rust-lang/rfcs/pull/879 --- src/libsyntax/parse/lexer/mod.rs | 38 ++++++++++++------- src/libsyntax/parse/mod.rs | 15 +++++++- src/test/parse-fail/issue-1802-1.rs | 2 +- src/test/parse-fail/lex-bad-binary-literal.rs | 21 ++++++++++ src/test/parse-fail/lex-bad-octal-literal.rs | 14 +++++++ 5 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 src/test/parse-fail/lex-bad-binary-literal.rs create mode 100644 src/test/parse-fail/lex-bad-octal-literal.rs diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 532b632fac8..ae5c99123a5 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -621,7 +621,7 @@ impl<'a> StringReader<'a> { let base = 10; // find the integer representing the name - self.scan_digits(base); + self.scan_digits(base, base); let encoded_name : u32 = self.with_str_from(start_bpos, |s| { num::from_str_radix(s, 10).unwrap_or_else(|_| { panic!("expected digits representing a name, got {:?}, {}, range [{:?},{:?}]", @@ -639,7 +639,7 @@ impl<'a> StringReader<'a> { // find the integer representing the ctxt let start_bpos = self.last_pos; - self.scan_digits(base); + self.scan_digits(base, base); let encoded_ctxt : ast::SyntaxContext = self.with_str_from(start_bpos, |s| { num::from_str_radix(s, 10).unwrap_or_else(|_| { panic!("expected digits representing a 
ctxt, got {:?}, {}", s, whence); @@ -653,16 +653,28 @@ impl<'a> StringReader<'a> { ctxt: encoded_ctxt, } } - /// Scan through any digits (base `radix`) or underscores, and return how - /// many digits there were. - fn scan_digits(&mut self, radix: u32) -> usize { + /// Scan through any digits (base `scan_radix`) or underscores, + /// and return how many digits there were. + /// + /// `real_radix` represents the true radix of the number we're + /// interested in, and errors will be emitted for any digits + /// between `real_radix` and `scan_radix`. + fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize { + assert!(real_radix <= scan_radix); let mut len = 0; loop { let c = self.curr; if c == Some('_') { debug!("skipping a _"); self.bump(); continue; } - match c.and_then(|cc| cc.to_digit(radix)) { + match c.and_then(|cc| cc.to_digit(scan_radix)) { Some(_) => { debug!("{:?} in scan_digits", c); + // check that the hypothetical digit is actually + // in range for the true radix + if c.unwrap().to_digit(real_radix).is_none() { + self.err_span_(self.last_pos, self.pos, + &format!("invalid digit for a base {} literal", + real_radix)); + } len += 1; self.bump(); } @@ -681,11 +693,11 @@ impl<'a> StringReader<'a> { if c == '0' { match self.curr.unwrap_or('\0') { - 'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2); } - 'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8); } - 'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16); } + 'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2, 10); } + 'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8, 10); } + 'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16, 16); } '0'...'9' | '_' | '.' 
=> { - num_digits = self.scan_digits(10) + 1; + num_digits = self.scan_digits(10, 10) + 1; } _ => { // just a 0 @@ -693,7 +705,7 @@ impl<'a> StringReader<'a> { } } } else if c.is_digit(10) { - num_digits = self.scan_digits(10) + 1; + num_digits = self.scan_digits(10, 10) + 1; } else { num_digits = 0; } @@ -712,7 +724,7 @@ impl<'a> StringReader<'a> { // with a number self.bump(); if self.curr.unwrap_or('\0').is_digit(10) { - self.scan_digits(10); + self.scan_digits(10, 10); self.scan_float_exponent(); } let last_pos = self.last_pos; @@ -935,7 +947,7 @@ impl<'a> StringReader<'a> { if self.curr_is('-') || self.curr_is('+') { self.bump(); } - if self.scan_digits(10) == 0 { + if self.scan_digits(10, 10) == 0 { self.err_span_(self.last_pos, self.pos, "expected at least one digit in exponent") } } diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index bea42a88bf5..f8820999c9d 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -735,7 +735,20 @@ pub fn integer_lit(s: &str, suffix: Option<&str>, sd: &SpanHandler, sp: Span) -> let res: u64 = match ::std::num::from_str_radix(s, base).ok() { Some(r) => r, - None => { sd.span_err(sp, "int literal is too large"); 0 } + None => { + // small bases are lexed as if they were base 10, e.g, the string + // might be `0b10201`. This will cause the conversion above to fail, + // but these cases have errors in the lexer: we don't want to emit + // two errors, and we especially don't want to emit this error since + // it isn't necessarily true. 
+ let already_errored = base < 10 && + s.chars().any(|c| c.to_digit(10).map_or(false, |d| d >= base)); + + if !already_errored { + sd.span_err(sp, "int literal is too large"); + } + 0 + } }; // adjust the sign diff --git a/src/test/parse-fail/issue-1802-1.rs b/src/test/parse-fail/issue-1802-1.rs index 8ce99f517c4..00fb2808faa 100644 --- a/src/test/parse-fail/issue-1802-1.rs +++ b/src/test/parse-fail/issue-1802-1.rs @@ -10,5 +10,5 @@ // error-pattern:no valid digits found for number fn main() { - log(error, 0b42); + log(error, 0b); } diff --git a/src/test/parse-fail/lex-bad-binary-literal.rs b/src/test/parse-fail/lex-bad-binary-literal.rs new file mode 100644 index 00000000000..e92000c54ba --- /dev/null +++ b/src/test/parse-fail/lex-bad-binary-literal.rs @@ -0,0 +1,21 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +fn main() { + 0b121; //~ ERROR invalid digit for a base 2 literal + 0b10_10301; //~ ERROR invalid digit for a base 2 literal + 0b30; //~ ERROR invalid digit for a base 2 literal + 0b41; //~ ERROR invalid digit for a base 2 literal + 0b5; //~ ERROR invalid digit for a base 2 literal + 0b6; //~ ERROR invalid digit for a base 2 literal + 0b7; //~ ERROR invalid digit for a base 2 literal + 0b8; //~ ERROR invalid digit for a base 2 literal + 0b9; //~ ERROR invalid digit for a base 2 literal +} diff --git a/src/test/parse-fail/lex-bad-octal-literal.rs b/src/test/parse-fail/lex-bad-octal-literal.rs new file mode 100644 index 00000000000..bf9880cb6cf --- /dev/null +++ b/src/test/parse-fail/lex-bad-octal-literal.rs @@ -0,0 +1,14 @@ +// Copyright 2014 The Rust Project Developers.
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +fn main() { + 0o18; //~ ERROR invalid digit for a base 8 literal + 0o1234_9_5670; //~ ERROR invalid digit for a base 8 literal +}