rust/src/comp/front/lexer.rs

924 lines
24 KiB
Rust
Raw Normal View History

import std::io;
import std::str;
import std::vec;
import std::int;
import std::map;
import std::map::hashmap;
import std::option;
import std::option::some;
import std::option::none;
import driver::session::session;
import util::common;
import util::common::new_str_hash;
import util::data::interner;
// Character-level cursor over one source file, as consumed by the lexer.
// The only implementation visible here is the object built by `new_reader`.
state type reader = state obj {
    // True once the current character is the end-of-file sentinel.
    fn is_eof() -> bool;
    // The character currently under the cursor.
    fn curr() -> char;
    // One character of lookahead: the character that follows `curr()`.
    fn next() -> char;
    // Loads the first character of the input into the cursor.
    fn init();
    // Advances the cursor by one character.
    fn bump();
    // Remembers the current character position (see `get_mark_chpos`).
    fn mark();
    // The character position recorded by the last call to `mark`.
    fn get_mark_chpos() -> uint;
    // The string interner shared by the tokens produced from this reader.
    fn get_interner() -> @interner::interner[str];
    // The current character position.
    fn get_chpos() -> uint;
    // The current column counter (reset after each newline).
    fn get_col() -> uint;
    // The codemap entry for the file being read.
    fn get_filemap() -> codemap::filemap;
    // Reports the error message `m` at the current position.
    fn err(str m);
};
// Builds a `reader` over the complete contents of `rdr`.
//
// The whole stream is read into memory up front. Character positions start
// at `filemap.start_pos`, the column counter starts at 0, and the cursor is
// primed with the first character (or left on the EOF sentinel when the
// input is empty). `itr` is handed back unchanged by `get_interner`.
fn new_reader(session sess, io::reader rdr,
              codemap::filemap filemap,
              @interner::interner[str] itr) -> reader {
    state obj reader(session sess,
                     str file,
                     uint len,
                     mutable uint col,
                     mutable uint pos,
                     mutable char ch,
                     mutable uint mark_chpos,
                     mutable uint chpos,
                     mutable vec[str] strs,
                     codemap::filemap fm,
                     @interner::interner[str] itr) {
        // `-1 as char` is the end-of-file sentinel stored in `ch`.
        fn is_eof() -> bool {
            ret ch == -1 as char;
        }
        fn mark() { mark_chpos = chpos; }
        fn get_mark_chpos() -> uint { ret mark_chpos; }
        fn get_chpos() -> uint { ret chpos; }
        fn curr() -> char {
            ret ch;
        }
        // `pos` always indexes the byte just past `ch`, so the character
        // at `pos` is the lookahead character.
        fn next() -> char {
            if (pos < len) { ret str::char_at(file, pos); }
            else { ret -1 as char; }
        }
        // Loads the first character without touching `col` or `chpos`.
        fn init() {
            if (pos < len) {
                auto r = str::char_range_at(file, pos);
                pos = r._1;
                ch = r._0;
            }
        }
        fn bump() {
            if (pos < len) {
                col += 1u;
                chpos += 1u;
                // `ch` is still the character being left behind: when it
                // is a newline, the position just reached starts a line.
                if (ch == '\n') {
                    codemap::next_line(fm, chpos);
                    col = 0u;
                }
                auto r = str::char_range_at(file, pos);
                pos = r._1;
                ch = r._0;
            } else {
                ch = -1 as char;
            }
        }
        fn get_interner() -> @interner::interner[str] { ret itr; }
        fn get_col() -> uint {
            ret col;
        }
        fn get_filemap() -> codemap::filemap {
            ret fm;
        }
        // Reports `m` with a zero-width span at the current position.
        fn err(str m) {
            sess.span_err(rec(lo=chpos, hi=chpos), m);
        }
    }
    auto file = str::unsafe_from_bytes(rdr.read_whole_stream());
    let vec[str] strs = [];
    auto rd = reader(sess, file, str::byte_len(file), 0u, 0u,
                     -1 as char,
                     filemap.start_pos, filemap.start_pos,
                     strs, filemap, itr);
    rd.init();
    ret rd;
}
2010-08-20 03:42:17 +02:00
fn in_range(char c, char lo, char hi) -> bool {
ret lo <= c && c <= hi;
2010-08-20 03:42:17 +02:00
}
fn is_alpha(char c) -> bool {
ret in_range(c, 'a', 'z') ||
in_range(c, 'A', 'Z');
}
fn is_dec_digit(char c) -> bool {
ret in_range(c, '0', '9');
}
fn is_alnum(char c) -> bool {
ret is_alpha(c) || is_dec_digit(c);
}
2010-08-20 03:42:17 +02:00
fn is_hex_digit(char c) -> bool {
ret in_range(c, '0', '9') ||
in_range(c, 'a', 'f') ||
in_range(c, 'A', 'F');
}
fn is_bin_digit(char c) -> bool {
ret c == '0' || c == '1';
}
fn dec_digit_val(char c) -> int {
ret (c as int) - ('0' as int);
}
fn hex_digit_val(char c) -> int {
if (in_range(c, '0', '9')) {
ret (c as int) - ('0' as int);
}
if (in_range(c, 'a', 'f')) {
ret ((c as int) - ('a' as int)) + 10;
}
if (in_range(c, 'A', 'F')) {
ret ((c as int) - ('A' as int)) + 10;
}
fail;
}
// Value of a binary digit: 0 for '0' and 1 for anything else. Callers are
// expected to have checked the character with `is_bin_digit` beforehand.
fn bin_digit_value(char c) -> int {
    if (c != '0') { ret 1; }
    ret 0;
}
2010-08-20 03:42:17 +02:00
fn is_whitespace(char c) -> bool {
ret c == ' ' || c == '\t' || c == '\r' || c == '\n';
2010-08-20 03:42:17 +02:00
}
2011-05-30 23:10:54 +02:00
fn consume_whitespace_and_comments(&reader rdr) {
2010-08-27 21:36:57 +02:00
while (is_whitespace(rdr.curr())) {
rdr.bump();
}
2010-08-27 21:36:57 +02:00
be consume_any_line_comment(rdr);
}
fn consume_any_line_comment(&reader rdr) {
2010-08-27 21:36:57 +02:00
if (rdr.curr() == '/') {
alt (rdr.next()) {
case ('/') {
while (rdr.curr() != '\n' && !rdr.is_eof()) {
rdr.bump();
}
// Restart whitespace munch.
2011-05-30 23:10:54 +02:00
be consume_whitespace_and_comments(rdr);
}
case ('*') {
2010-08-27 21:36:57 +02:00
rdr.bump();
rdr.bump();
be consume_block_comment(rdr);
}
case (_) {
ret;
}
}
}
}
// Skips the body of a block comment whose opening "/*" has already been
// consumed. Block comments nest: `level` counts the currently open "/*".
// Reports an error and fails if EOF is reached before the comment closes.
fn consume_block_comment(&reader rdr) {
    let int level = 1;
    while (level > 0) {
        if (rdr.is_eof()) {
            rdr.err("unterminated block comment");
            fail;
        }
        if (rdr.curr() == '/' && rdr.next() == '*') {
            rdr.bump();
            rdr.bump();
            level += 1;
        } else if (rdr.curr() == '*' && rdr.next() == '/') {
            rdr.bump();
            rdr.bump();
            level -= 1;
        } else {
            rdr.bump();
        }
    }
    // restart whitespace munch.
    be consume_whitespace_and_comments(rdr);
}
// Converts a string of decimal digits into its integer value. Despite the
// name, the conversion is string -> int. The input is not validated and
// overflow is not checked; the empty string yields 0.
fn digits_to_string(str s) -> int {
    let int accum_int = 0;
    for (u8 c in s) {
        accum_int *= 10;
        accum_int += dec_digit_val(c as char);
    }
    ret accum_int;
}
// Scans an optional floating-point exponent: 'e' or 'E', an optional sign,
// then at least one decimal digit. Returns the exponent text (underscores
// removed) or `none` when the cursor is not on an exponent marker.
// Reports an error and fails when the marker is not followed by digits.
fn scan_exponent(&reader rdr) -> option::t[str] {
    auto marker = rdr.curr();
    if (marker != 'e' && marker != 'E') {
        ret none[str];
    }
    auto prefix = "";
    prefix += str::from_bytes([marker as u8]);
    rdr.bump();
    auto sign = rdr.curr();
    if (sign == '-' || sign == '+') {
        prefix += str::from_bytes([sign as u8]);
        rdr.bump();
    }
    auto digits = scan_dec_digits(rdr);
    if (str::byte_len(digits) == 0u) {
        rdr.err("scan_exponent: bad fp literal");
        fail;
    }
    ret some(prefix + digits);
}
fn scan_dec_digits(&reader rdr) -> str {
auto c = rdr.curr();
let str res = "";
while (is_dec_digit (c) || c == '_') {
if (c != '_') {
res += str::from_bytes([c as u8]);
}
rdr.bump();
c = rdr.curr();
}
ret res;
}
// Scans a numeric literal. `c` is the current character of `rdr` (a
// decimal digit) and nothing has been consumed yet.
//
// Recognised forms:
//   * 0x... hexadecimal and 0b... binary integers ('_' separators allowed)
//   * decimal integers
//   * an integer suffix: u, i, u8/i8, u16/i16, u32/i32, u64/i64
//   * floats: digits '.' digits [exponent] [f32|f64], or digits exponent
//
// Float text is interned rather than converted, so range problems are only
// caught later. Reports an error and fails on a malformed exponent or on an
// 'f' suffix that is not f32/f64.
fn scan_number(char c, &reader rdr) -> token::token {
    auto accum_int = 0;
    let str dec_str = "";
    let bool is_dec_integer = false;
    auto n = rdr.next();
    if (c == '0' && n == 'x') {
        // Skip the "0x" prefix.
        rdr.bump();
        rdr.bump();
        c = rdr.curr();
        while (is_hex_digit(c) || c == '_') {
            if (c != '_') {
                accum_int *= 16;
                accum_int += hex_digit_val(c);
            }
            rdr.bump();
            c = rdr.curr();
        }
    } else if (c == '0' && n == 'b') {
        // Skip the "0b" prefix.
        rdr.bump();
        rdr.bump();
        c = rdr.curr();
        while (is_bin_digit(c) || c == '_') {
            if (c != '_') {
                accum_int *= 2;
                accum_int += bin_digit_value(c);
            }
            rdr.bump();
            c = rdr.curr();
        }
    } else {
        dec_str = scan_dec_digits(rdr);
        is_dec_integer = true;
    }
    if (is_dec_integer) {
        accum_int = digits_to_string(dec_str);
    }
    c = rdr.curr();
    if (c == 'u' || c == 'i') {
        // Integer suffix: signedness letter, then an optional bit width.
        let bool signed = (c == 'i');
        rdr.bump();
        c = rdr.curr();
        if (c == '8') {
            rdr.bump();
            if (signed) {
                ret token::LIT_MACH_INT(common::ty_i8, accum_int);
            } else {
                ret token::LIT_MACH_INT(common::ty_u8, accum_int);
            }
        }
        n = rdr.next();
        if (c == '1' && n == '6') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token::LIT_MACH_INT(common::ty_i16, accum_int);
            } else {
                ret token::LIT_MACH_INT(common::ty_u16, accum_int);
            }
        }
        if (c == '3' && n == '2') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token::LIT_MACH_INT(common::ty_i32, accum_int);
            } else {
                ret token::LIT_MACH_INT(common::ty_u32, accum_int);
            }
        }
        if (c == '6' && n == '4') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token::LIT_MACH_INT(common::ty_i64, accum_int);
            } else {
                ret token::LIT_MACH_INT(common::ty_u64, accum_int);
            }
        }
        // Bare 'i' / 'u' suffix without a width.
        if (signed) {
            ret token::LIT_INT(accum_int);
        } else {
            // FIXME: should cast in the target bit-width.
            ret token::LIT_UINT(accum_int as uint);
        }
    }
    if (c == '.') {
        // Parse a floating-point number.
        rdr.bump();
        auto dec_part = scan_dec_digits(rdr);
        auto float_str = dec_str + "." + dec_part;
        auto exponent_str = scan_exponent(rdr);
        alt (exponent_str) {
            case (some(?s)) {
                float_str += s;
            }
            case (none) {
            }
        }
        c = rdr.curr();
        if (c == 'f') {
            rdr.bump();
            c = rdr.curr();
            n = rdr.next();
            if (c == '3' && n == '2') {
                rdr.bump(); rdr.bump();
                ret token::LIT_MACH_FLOAT(util::common::ty_f32,
                    interner::intern[str](*rdr.get_interner(), float_str));
            }
            else if (c == '6' && n == '4') {
                rdr.bump(); rdr.bump();
                ret token::LIT_MACH_FLOAT(util::common::ty_f64,
                    interner::intern[str](*rdr.get_interner(), float_str));
                /* FIXME: if this is out of range for either a 32-bit or
                   64-bit float, it won't be noticed till the back-end */
            }
            else {
                // The 'f' is already consumed and cannot be given back.
                // Falling through here used to return an integer token
                // that silently dropped the fractional part.
                rdr.err("expected 32 or 64 after f in floating-point literal");
                fail;
            }
        }
        else {
            ret token::LIT_FLOAT(interner::intern[str](*rdr.get_interner(),
                                                       float_str));
        }
    }
    // No '.', but an exponent alone still makes the literal a float.
    auto maybe_exponent = scan_exponent(rdr);
    alt (maybe_exponent) {
        case (some(?s)) {
            ret token::LIT_FLOAT(interner::intern[str](*rdr.get_interner(),
                                                       dec_str + s));
        }
        case (none) {
            ret token::LIT_INT(accum_int);
        }
    }
}
fn scan_numeric_escape(&reader rdr, uint n_hex_digits) -> char {
auto accum_int = 0;
while (n_hex_digits != 0u) {
auto n = rdr.curr();
rdr.bump();
if (!is_hex_digit(n)) {
rdr.err(#fmt("illegal numeric character escape: %d", n as int));
fail;
}
accum_int *= 16;
accum_int += hex_digit_val(n);
n_hex_digits -= 1u;
}
ret accum_int as char;
}
fn next_token(&reader rdr) -> token::token {
auto accum_str = "";
2010-08-20 03:42:17 +02:00
2011-05-30 23:10:54 +02:00
consume_whitespace_and_comments(rdr);
2010-08-24 17:50:56 +02:00
if (rdr.is_eof()) { ret token::EOF; }
2010-08-20 03:42:17 +02:00
2010-12-10 02:11:05 +01:00
rdr.mark();
2010-08-27 21:36:57 +02:00
auto c = rdr.curr();
if (is_alpha(c) || c == '_') {
while (is_alnum(c) || c == '_') {
str::push_char(accum_str, c);
2010-08-27 21:36:57 +02:00
rdr.bump();
c = rdr.curr();
}
if (str::eq(accum_str, "_")) {
ret token::UNDERSCORE;
}
auto is_mod_name = c == ':' && rdr.next() == ':';
ret token::IDENT(interner::intern[str](*rdr.get_interner(),
accum_str), is_mod_name);
2010-08-20 19:03:31 +02:00
}
if (is_dec_digit(c)) {
ret scan_number(c, rdr);
}
fn binop(&reader rdr, token::binop op) -> token::token {
2010-08-27 21:36:57 +02:00
rdr.bump();
if (rdr.curr() == '=') {
2010-08-27 21:36:57 +02:00
rdr.bump();
ret token::BINOPEQ(op);
} else {
ret token::BINOP(op);
}
}
alt (c) {
// One-byte tokens.
case ('?') { rdr.bump(); ret token::QUES; }
case (';') { rdr.bump(); ret token::SEMI; }
case (',') { rdr.bump(); ret token::COMMA; }
case ('.') { rdr.bump(); ret token::DOT; }
case ('(') { rdr.bump(); ret token::LPAREN; }
case (')') { rdr.bump(); ret token::RPAREN; }
case ('{') { rdr.bump(); ret token::LBRACE; }
case ('}') { rdr.bump(); ret token::RBRACE; }
case ('[') { rdr.bump(); ret token::LBRACKET; }
case (']') { rdr.bump(); ret token::RBRACKET; }
case ('@') { rdr.bump(); ret token::AT; }
case ('#') { rdr.bump(); ret token::POUND; }
case ('~') { rdr.bump(); ret token::TILDE; }
case (':') {
rdr.bump();
if (rdr.curr() == ':') {
rdr.bump();
ret token::MOD_SEP;
}
else {
ret token::COLON;
};
}
// Multi-byte tokens.
case ('=') {
rdr.bump();
if (rdr.curr() == '=') {
2010-08-27 21:36:57 +02:00
rdr.bump();
ret token::EQEQ;
} else {
ret token::EQ;
}
}
case ('!') {
rdr.bump();
if (rdr.curr() == '=') {
2010-08-27 21:36:57 +02:00
rdr.bump();
ret token::NE;
} else {
ret token::NOT;
}
}
case ('<') {
rdr.bump();
alt (rdr.curr()) {
case ('=') {
rdr.bump();
ret token::LE;
}
case ('<') {
ret binop(rdr, token::LSL);
}
case ('-') {
rdr.bump();
ret token::LARROW;
}
case ('|') {
rdr.bump();
ret token::SEND;
}
case (_) {
ret token::LT;
}
}
}
case ('>') {
rdr.bump();
alt (rdr.curr()) {
case ('=') {
rdr.bump();
ret token::GE;
}
case ('>') {
if (rdr.next() == '>') {
rdr.bump();
ret binop(rdr, token::ASR);
} else {
ret binop(rdr, token::LSR);
}
}
case (_) {
ret token::GT;
}
}
}
case ('\'') {
2010-08-27 21:36:57 +02:00
rdr.bump();
auto c2 = rdr.curr();
rdr.bump();
if (c2 == '\\') {
auto escaped = rdr.curr();
rdr.bump();
alt (escaped) {
case ('n') { c2 = '\n'; }
case ('r') { c2 = '\r'; }
case ('t') { c2 = '\t'; }
case ('\\') { c2 = '\\'; }
case ('\'') { c2 = '\''; }
case ('x') { c2 = scan_numeric_escape(rdr, 2u); }
case ('u') { c2 = scan_numeric_escape(rdr, 4u); }
case ('U') { c2 = scan_numeric_escape(rdr, 8u); }
case (?c2) {
rdr.err(#fmt("unknown character escape: %d",
c2 as int));
fail;
}
}
}
2010-08-27 21:36:57 +02:00
if (rdr.curr() != '\'') {
rdr.err("unterminated character constant");
fail;
}
rdr.bump(); // advance curr past token
ret token::LIT_CHAR(c2);
}
case ('"') {
2010-08-27 21:36:57 +02:00
rdr.bump();
while (rdr.curr() != '"') {
auto ch = rdr.curr();
rdr.bump();
alt (ch) {
case ('\\') {
auto escaped = rdr.curr();
rdr.bump();
alt (escaped) {
case ('n') {
str::push_byte(accum_str, '\n' as u8);
}
case ('r') {
str::push_byte(accum_str, '\r' as u8);
}
case ('t') {
str::push_byte(accum_str, '\t' as u8);
}
case ('\\') {
str::push_byte(accum_str, '\\' as u8);
}
case ('"') {
str::push_byte(accum_str, '"' as u8);
}
case ('x') {
str::push_char(accum_str,
scan_numeric_escape(rdr, 2u));
}
case ('u') {
str::push_char(accum_str,
scan_numeric_escape(rdr, 4u));
}
case ('U') {
str::push_char(accum_str,
scan_numeric_escape(rdr, 8u));
}
case (?c2) {
rdr.err(#fmt("unknown string escape: %d",
c2 as int));
fail;
}
}
}
case (_) {
str::push_char(accum_str, ch);
}
}
}
2010-08-27 21:36:57 +02:00
rdr.bump();
ret token::LIT_STR(interner::intern[str](*rdr.get_interner(),
accum_str));
}
case ('-') {
2010-08-27 21:36:57 +02:00
if (rdr.next() == '>') {
rdr.bump();
rdr.bump();
ret token::RARROW;
} else {
ret binop(rdr, token::MINUS);
}
}
case ('&') {
2010-08-27 21:36:57 +02:00
if (rdr.next() == '&') {
rdr.bump();
rdr.bump();
ret token::ANDAND;
} else {
ret binop(rdr, token::AND);
}
}
case ('|') {
2011-05-27 03:16:24 +02:00
alt (rdr.next()) {
case ('|') {
rdr.bump();
rdr.bump();
ret token::OROR;
}
case ('>') {
2011-05-27 20:45:22 +02:00
rdr.bump();
2011-05-27 03:16:24 +02:00
rdr.bump();
ret token::RECV;
}
case (_) {
ret binop(rdr, token::OR);
}
}
}
case ('+') {
ret binop(rdr, token::PLUS);
}
case ('*') {
ret binop(rdr, token::STAR);
}
case ('/') {
ret binop(rdr, token::SLASH);
}
case ('^') {
ret binop(rdr, token::CARET);
}
case ('%') {
ret binop(rdr, token::PERCENT);
}
case (?c) {
rdr.err(#fmt("unkown start of token: %d", c as int));
fail;
}
}
fail;
}
2011-05-30 23:10:54 +02:00
// How a comment sits relative to the code around it. The comment readers
// below assign `isolated` or `trailing` from their `code_to_the_left` flag;
// `read_block_comment` may upgrade a single-line block comment to `mixed`.
tag cmnt_style {
isolated; // No code on either side of each line of the comment
trailing; // Code exists to the left of the comment
mixed; // Code before /* foo */ and after the comment
}
2011-05-30 23:10:54 +02:00
// One comment, or one run of consecutive line comments, gathered from a
// source file: its placement style, its text split into lines, and the
// reader's character position where it starts.
type cmnt = rec(cmnt_style style, vec[str] lines, uint pos);
2011-05-30 23:10:54 +02:00
fn read_to_eol(&reader rdr) -> str {
auto val = "";
while (rdr.curr() != '\n' && !rdr.is_eof()) {
str::push_char(val, rdr.curr());
rdr.bump();
}
if (rdr.curr() == '\n') {
rdr.bump();
2011-05-30 23:10:54 +02:00
} else {
assert rdr.is_eof();
}
2011-05-30 23:10:54 +02:00
ret val;
}
2011-05-30 23:10:54 +02:00
fn read_one_line_comment(&reader rdr) -> str {
auto val = read_to_eol(rdr);
assert val.(0) == ('/' as u8) && val.(1) == ('/' as u8);
ret val;
}
// Skips any run of whitespace (newlines included), stopping at EOF.
fn consume_whitespace(&reader rdr) {
    while (!rdr.is_eof() && is_whitespace(rdr.curr())) {
        rdr.bump();
    }
}
2011-05-30 23:10:54 +02:00
// Skips whitespace on the current line only: stops at the first newline,
// the first non-whitespace character, or EOF.
fn consume_non_eol_whitespace(&reader rdr) {
    while (!rdr.is_eof() &&
           rdr.curr() != '\n' &&
           is_whitespace(rdr.curr())) {
        rdr.bump();
    }
}
// Reads a run of consecutive `//` comments as a single `cmnt`.
//
// Each comment line becomes one entry of `lines`; blanks leading up to the
// next line's `//` are skipped. The run ends at the first line that does
// not start with `//`. The style is `trailing` when `code_to_the_left` is
// set and `isolated` otherwise; `pos` is where the first comment starts.
fn read_line_comments(&reader rdr, bool code_to_the_left) -> cmnt {
    log ">>> line comments";
    auto p = rdr.get_chpos();
    let vec[str] lines = [];
    while (rdr.curr() == '/' && rdr.next() == '/') {
        lines += [read_one_line_comment(rdr)];
        consume_non_eol_whitespace(rdr);
    }
    log "<<< line comments";
    ret rec(style = if (code_to_the_left) { trailing } else { isolated },
            lines = lines,
            pos=p);
}
fn all_whitespace(&str s, uint begin, uint end) -> bool {
let uint i = begin;
while (i != end) {
if (!is_whitespace(s.(i) as char)) {
ret false;
}
i += 1u;
}
ret true;
}
// Appends one line of a block comment to `lines`, removing the indentation
// it shares with the comment's opening "/*".
//
// `col` is the column at which the comment started. If the first `col`
// bytes of `s` are all whitespace they are dropped; a line shorter than
// `col` that is entirely whitespace becomes the empty string. Otherwise
// the line is pushed unchanged.
fn trim_whitespace_prefix_and_push_line(&mutable vec[str] lines,
                                        &str s, uint col) {
    auto len = str::byte_len(s);
    // Only inspect bytes that exist: a short line (for example a blank line
    // inside an indented comment) has fewer than `col` bytes.
    auto prefix_end = col;
    if (len < prefix_end) { prefix_end = len; }
    auto s1;
    if (all_whitespace(s, 0u, prefix_end)) {
        if (col < len) {
            s1 = str::slice(s, col, len);
        } else {
            s1 = "";
        }
    } else {
        s1 = s;
    }
    log "pushing line: " + s1;
    lines += [s1];
}
// Reads a (possibly nested) block comment as a `cmnt`.
//
// The cursor must be on the opening "/*". The text is split into lines and
// each line has the comment's starting column of indentation trimmed (see
// `trim_whitespace_prefix_and_push_line`). The style is `trailing` or
// `isolated` according to `code_to_the_left`, and becomes `mixed` when a
// one-line comment is followed by more text on the same line. Reports an
// error and fails if EOF is reached before the comment closes.
fn read_block_comment(&reader rdr,
                      bool code_to_the_left) -> cmnt {
    log ">>> block comment";
    auto p = rdr.get_chpos();
    let vec[str] lines = [];
    let uint col = rdr.get_col();
    // Consume the opening "/*" and seed the first line with it.
    rdr.bump();
    rdr.bump();
    auto curr_line = "/*";
    let int level = 1;
    while (level > 0) {
        log #fmt("=== block comment level %d", level);
        if (rdr.is_eof()) {
            rdr.err("unterminated block comment");
            fail;
        }
        if (rdr.curr() == '\n') {
            trim_whitespace_prefix_and_push_line(lines, curr_line, col);
            curr_line = "";
            rdr.bump();
        } else {
            str::push_char(curr_line, rdr.curr());
            if (rdr.curr() == '/' && rdr.next() == '*') {
                // Nested opener: '/' was pushed above, add the '*'.
                rdr.bump();
                rdr.bump();
                curr_line += "*";
                level += 1;
            } else if (rdr.curr() == '*' && rdr.next() == '/') {
                // Closer: '*' was pushed above, add the '/'.
                rdr.bump();
                rdr.bump();
                curr_line += "/";
                level -= 1;
            } else {
                rdr.bump();
            }
        }
    }
    if (str::byte_len(curr_line) != 0u) {
        trim_whitespace_prefix_and_push_line(lines, curr_line, col);
    }
    auto style = if (code_to_the_left) { trailing } else { isolated };
    consume_non_eol_whitespace(rdr);
    // Something other than a newline after a one-line comment means there
    // is text to its right on the same line.
    if (!rdr.is_eof() &&
        rdr.curr() != '\n' &&
        vec::len(lines) == 1u) {
        style = mixed;
    }
    log "<<< block comment";
    ret rec(style = style, lines = lines, pos=p);
}
fn peeking_at_comment(&reader rdr) -> bool {
ret (rdr.curr() == '/' && rdr.next() == '/') ||
(rdr.curr() == '/' && rdr.next() == '*');
}
// Reads the comment at the cursor and appends it to `comments`: a run of
// line comments for "//", a block comment for "/*". Fails when the cursor
// is not at the start of a comment.
fn consume_comment(&reader rdr, bool code_to_the_left,
                   &mutable vec[cmnt] comments) {
    log ">>> consume comment";
    if (rdr.curr() != '/') { fail; }
    alt (rdr.next()) {
        case ('/') {
            vec::push[cmnt](comments,
                            read_line_comments(rdr, code_to_the_left));
        }
        case ('*') {
            vec::push[cmnt](comments,
                            read_block_comment(rdr, code_to_the_left));
        }
        case (_) { fail; }
    }
    log "<<< consume comment";
}
// Collects every comment in the file at `path`, in source order.
//
// The file is lexed with a throw-away interner and a filemap starting at
// position 0. Before each token the comments in front of it are read
// explicitly; they would otherwise be skipped silently by `next_token`.
// The tokens themselves are discarded.
fn gather_comments(session sess, str path) -> vec[cmnt] {
    auto srdr = io::file_reader(path);
    auto itr = @interner::mk[str](str::hash, str::eq);
    auto rdr = new_reader(sess, srdr, codemap::new_filemap(path, 0u), itr);
    let vec[cmnt] comments = [];
    while (!rdr.is_eof()) {
        auto code_to_the_left = true;
        consume_non_eol_whitespace(rdr);
        // After skipping blanks the cursor itself sits on the newline when
        // the rest of the line is empty, so test `curr`, not the lookahead.
        // Whatever comes next then starts on a fresh line with no code to
        // its left.
        if (rdr.curr() == '\n') {
            code_to_the_left = false;
            consume_whitespace(rdr);
        }
        while (peeking_at_comment(rdr)) {
            consume_comment(rdr, code_to_the_left, comments);
            consume_whitespace(rdr);
        }
        next_token(rdr);
    }
    ret comments;
}
//
// Local Variables:
// mode: rust
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// compile-command: "make -k -C $RBUILD 2>&1 | sed -e 's/\\/x\\//x:\\//g'";
// End:
//