From 606f50c46dd9a3852d36456d2015e1ccf832642e Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Tue, 31 Mar 2015 00:27:13 +1100
Subject: [PATCH] Lex binary and octal literals more eagerly.

Previously 0b12 was considered two tokens, 0b1 and 2, as 2 isn't a valid
base 2 digit. This patch changes that to collapse them into one (and
makes `0b12` etc. an error: 2 isn't a valid base 2 digit).

This may break some macro invocations of macros with `tt` (or syntax
extensions) that rely on adjacent digits being separate tokens and hence
is a

[breaking-change]

The fix is to separate the tokens, e.g. `0b12` -> `0b1 2`.

cc https://github.com/rust-lang/rfcs/pull/879
---
 src/libsyntax/parse/lexer/mod.rs              | 38 ++++++++++++-------
 src/libsyntax/parse/mod.rs                    | 15 +++++++-
 src/test/parse-fail/issue-1802-1.rs           |  2 +-
 src/test/parse-fail/lex-bad-binary-literal.rs | 21 ++++++++++
 src/test/parse-fail/lex-bad-octal-literal.rs  | 14 +++++++
 5 files changed, 75 insertions(+), 15 deletions(-)
 create mode 100644 src/test/parse-fail/lex-bad-binary-literal.rs
 create mode 100644 src/test/parse-fail/lex-bad-octal-literal.rs

diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 532b632fac8..ae5c99123a5 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -621,7 +621,7 @@ impl<'a> StringReader<'a> {
         let base = 10;
 
         // find the integer representing the name
-        self.scan_digits(base);
+        self.scan_digits(base, base);
         let encoded_name : u32 = self.with_str_from(start_bpos, |s| {
             num::from_str_radix(s, 10).unwrap_or_else(|_| {
                 panic!("expected digits representing a name, got {:?}, {}, range [{:?},{:?}]",
@@ -639,7 +639,7 @@ impl<'a> StringReader<'a> {
 
         // find the integer representing the ctxt
         let start_bpos = self.last_pos;
-        self.scan_digits(base);
+        self.scan_digits(base, base);
         let encoded_ctxt : ast::SyntaxContext = self.with_str_from(start_bpos, |s| {
             num::from_str_radix(s, 10).unwrap_or_else(|_| {
                 panic!("expected digits representing a ctxt, got {:?}, {}", s, whence);
@@ -653,16 +653,28 @@ impl<'a> StringReader<'a> {
                      ctxt: encoded_ctxt, }
     }
 
-    /// Scan through any digits (base `radix`) or underscores, and return how
-    /// many digits there were.
-    fn scan_digits(&mut self, radix: u32) -> usize {
+    /// Scan through any digits (base `scan_radix`) or underscores,
+    /// and return how many digits there were.
+    ///
+    /// `real_radix` represents the true radix of the number we're
+    /// interested in, and errors will be emitted for any digits
+    /// between `real_radix` and `scan_radix`.
+    fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
+        assert!(real_radix <= scan_radix);
         let mut len = 0;
         loop {
             let c = self.curr;
             if c == Some('_') { debug!("skipping a _"); self.bump(); continue; }
-            match c.and_then(|cc| cc.to_digit(radix)) {
+            match c.and_then(|cc| cc.to_digit(scan_radix)) {
                 Some(_) => {
                     debug!("{:?} in scan_digits", c);
+                    // check that the hypothetical digit is actually
+                    // in range for the true radix
+                    if c.unwrap().to_digit(real_radix).is_none() {
+                        self.err_span_(self.last_pos, self.pos,
+                                       &format!("invalid digit for a base {} literal",
+                                                real_radix));
+                    }
                     len += 1;
                     self.bump();
                 }
@@ -681,11 +693,11 @@ impl<'a> StringReader<'a> {
 
         if c == '0' {
             match self.curr.unwrap_or('\0') {
-                'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2); }
-                'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8); }
-                'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16); }
+                'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2, 10); }
+                'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8, 10); }
+                'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16, 16); }
                 '0'...'9' | '_' | '.' => {
-                    num_digits = self.scan_digits(10) + 1;
+                    num_digits = self.scan_digits(10, 10) + 1;
                 }
                 _ => {
                     // just a 0
@@ -693,7 +705,7 @@ impl<'a> StringReader<'a> {
                 }
             }
         } else if c.is_digit(10) {
-            num_digits = self.scan_digits(10) + 1;
+            num_digits = self.scan_digits(10, 10) + 1;
         } else {
             num_digits = 0;
         }
@@ -712,7 +724,7 @@ impl<'a> StringReader<'a> {
             // with a number
             self.bump();
             if self.curr.unwrap_or('\0').is_digit(10) {
-                self.scan_digits(10);
+                self.scan_digits(10, 10);
                 self.scan_float_exponent();
             }
             let last_pos = self.last_pos;
@@ -935,7 +947,7 @@ impl<'a> StringReader<'a> {
             if self.curr_is('-') || self.curr_is('+') {
                 self.bump();
             }
-            if self.scan_digits(10) == 0 {
+            if self.scan_digits(10, 10) == 0 {
                 self.err_span_(self.last_pos, self.pos, "expected at least one digit in exponent")
             }
         }
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs
index bea42a88bf5..f8820999c9d 100644
--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@@ -735,7 +735,20 @@ pub fn integer_lit(s: &str, suffix: Option<&str>, sd: &SpanHandler, sp: Span) ->
 
     let res: u64 = match ::std::num::from_str_radix(s, base).ok() {
         Some(r) => r,
-        None => { sd.span_err(sp, "int literal is too large"); 0 }
+        None => {
+            // small bases are lexed as if they were base 10, e.g, the string
+            // might be `0b10201`. This will cause the conversion above to fail,
+            // but these cases have errors in the lexer: we don't want to emit
+            // two errors, and we especially don't want to emit this error since
+            // it isn't necessarily true.
+            let already_errored = base < 10 &&
+                s.chars().any(|c| c.to_digit(10).map_or(false, |d| d >= base));
+
+            if !already_errored {
+                sd.span_err(sp, "int literal is too large");
+            }
+            0
+        }
     };
 
     // adjust the sign
diff --git a/src/test/parse-fail/issue-1802-1.rs b/src/test/parse-fail/issue-1802-1.rs
index 8ce99f517c4..00fb2808faa 100644
--- a/src/test/parse-fail/issue-1802-1.rs
+++ b/src/test/parse-fail/issue-1802-1.rs
@@ -10,5 +10,5 @@
 
 // error-pattern:no valid digits found for number
 fn main() {
-    log(error, 0b42);
+    log(error, 0b);
 }
diff --git a/src/test/parse-fail/lex-bad-binary-literal.rs b/src/test/parse-fail/lex-bad-binary-literal.rs
new file mode 100644
index 00000000000..e92000c54ba
--- /dev/null
+++ b/src/test/parse-fail/lex-bad-binary-literal.rs
@@ -0,0 +1,21 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+fn main() {
+    0b121; //~ ERROR invalid digit for a base 2 literal
+    0b10_10301; //~ ERROR invalid digit for a base 2 literal
+    0b30; //~ ERROR invalid digit for a base 2 literal
+    0b41; //~ ERROR invalid digit for a base 2 literal
+    0b5; //~ ERROR invalid digit for a base 2 literal
+    0b6; //~ ERROR invalid digit for a base 2 literal
+    0b7; //~ ERROR invalid digit for a base 2 literal
+    0b8; //~ ERROR invalid digit for a base 2 literal
+    0b9; //~ ERROR invalid digit for a base 2 literal
+}
diff --git a/src/test/parse-fail/lex-bad-octal-literal.rs b/src/test/parse-fail/lex-bad-octal-literal.rs
new file mode 100644
index 00000000000..bf9880cb6cf
--- /dev/null
+++ b/src/test/parse-fail/lex-bad-octal-literal.rs
@@ -0,0 +1,14 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+fn main() {
+    0o18; //~ ERROR invalid digit for a base 8 literal
+    0o1234_9_5670;  //~ ERROR invalid digit for a base 8 literal
+}