// rust/src/comp/front/lexer.rs

import std.io;
import std._str;
import std._int;
import std.map;
import std.map.hashmap;
import std.option;
import std.option.some;
import std.option.none;
import util.common;
import util.common.new_str_hash;
// Interface of the character-level reader that drives the lexer. It exposes
// the current character, one character of lookahead, line/column
// bookkeeping, and the keyword / reserved-word tables.
state type reader = state obj {
// True when the current character equals the end-of-input sentinel,
// `(-1) as char`.
fn is_eof() -> bool;
// The character currently under the cursor.
fn curr() -> char;
// The character following curr(); reading it does not advance the cursor.
fn next() -> char;
// Advances the cursor by one character and updates line/column.
impure fn bump();
// Remembers the current line/column so get_mark_pos() can report it later.
fn mark();
// The filename this reader was constructed with.
fn get_filename() -> str;
// The position recorded by the most recent mark().
fn get_mark_pos() -> common.pos;
// The position of the current character.
fn get_curr_pos() -> common.pos;
// Table mapping keyword spellings to their tokens.
fn get_keywords() -> hashmap[str,token.token];
// Set of identifiers that are reserved and rejected by the lexer.
fn get_reserved() -> hashmap[str,()];
};
// Builds a lexer `reader` on top of the I/O reader `rdr`.
//
// `filename` is only stored and handed back by get_filename(). The returned
// object starts with the first two characters of the input already loaded
// into its current / lookahead slots, with both the current position and the
// mark set to line 1, column 0, and with freshly populated keyword and
// reserved-word tables.
impure fn new_reader(io.reader rdr, str filename) -> reader
{
    state obj reader(io.reader rdr,
                     str filename,
                     mutable char c,          // current character
                     mutable char n,          // one character of lookahead
                     mutable uint mark_line,
                     mutable uint mark_col,
                     mutable uint line,
                     mutable uint col,
                     hashmap[str,token.token] keywords,
                     hashmap[str,()] reserved) {

        // End of input is represented by the sentinel `(-1) as char`.
        fn is_eof() -> bool {
            ret c == (-1) as char;
        }

        fn get_curr_pos() -> common.pos {
            ret rec(line=line, col=col);
        }

        fn get_mark_pos() -> common.pos {
            ret rec(line=mark_line, col=mark_col);
        }

        fn get_filename() -> str {
            ret filename;
        }

        fn curr() -> char {
            ret c;
        }

        fn next() -> char {
            ret n;
        }

        // Shifts the lookahead into the current slot and refills the
        // lookahead from the underlying reader.
        impure fn bump() {
            let char prev = c;

            c = n;

            // Once EOF is reached nothing more is read and the position is
            // left untouched, so repeated bumps at EOF are no-ops.
            if (c == (-1) as char) {
                ret;
            }

            // The line counter advances when stepping *past* a newline, so
            // the newline itself is still reported on the line it ends.
            if (prev == '\n') {
                line += 1u;
                col = 0u;
            } else {
                col += 1u;
            }

            n = rdr.read_char() as char;
        }

        fn mark() {
            mark_line = line;
            mark_col = col;
        }

        fn get_keywords() -> hashmap[str,token.token] {
            ret keywords;
        }

        fn get_reserved() -> hashmap[str,()] {
            ret reserved;
        }
    }

    // Keyword spelling -> token table consulted by next_token().
    auto keywords = new_str_hash[token.token]();

    keywords.insert("mod", token.MOD);
    keywords.insert("use", token.USE);
    keywords.insert("meta", token.META);
    keywords.insert("auth", token.AUTH);

    keywords.insert("syntax", token.SYNTAX);

    keywords.insert("if", token.IF);
    keywords.insert("else", token.ELSE);
    keywords.insert("while", token.WHILE);
    keywords.insert("do", token.DO);
    keywords.insert("alt", token.ALT);
    keywords.insert("case", token.CASE);

    keywords.insert("for", token.FOR);
    keywords.insert("each", token.EACH);
    keywords.insert("put", token.PUT);
    keywords.insert("ret", token.RET);
    keywords.insert("be", token.BE);

    keywords.insert("fail", token.FAIL);
    keywords.insert("drop", token.DROP);

    keywords.insert("type", token.TYPE);
    keywords.insert("check", token.CHECK);
    keywords.insert("claim", token.CLAIM);
    keywords.insert("prove", token.PROVE);

    keywords.insert("abs", token.ABS);

    keywords.insert("state", token.STATE);
    keywords.insert("gc", token.GC);

    keywords.insert("impure", token.IMPURE);
    keywords.insert("unsafe", token.UNSAFE);

    keywords.insert("native", token.NATIVE);
    keywords.insert("mutable", token.MUTABLE);
    keywords.insert("auto", token.AUTO);

    keywords.insert("fn", token.FN);
    keywords.insert("iter", token.ITER);

    keywords.insert("import", token.IMPORT);
    keywords.insert("export", token.EXPORT);

    keywords.insert("let", token.LET);
    keywords.insert("const", token.CONST);

    keywords.insert("log", token.LOG);
    keywords.insert("spawn", token.SPAWN);
    keywords.insert("thread", token.THREAD);
    keywords.insert("yield", token.YIELD);
    keywords.insert("join", token.JOIN);

    keywords.insert("bool", token.BOOL);

    keywords.insert("int", token.INT);
    keywords.insert("uint", token.UINT);
    keywords.insert("float", token.FLOAT);

    keywords.insert("char", token.CHAR);
    keywords.insert("str", token.STR);

    keywords.insert("rec", token.REC);
    keywords.insert("tup", token.TUP);
    keywords.insert("tag", token.TAG);
    keywords.insert("vec", token.VEC);
    keywords.insert("any", token.ANY);

    keywords.insert("obj", token.OBJ);

    keywords.insert("port", token.PORT);
    keywords.insert("chan", token.CHAN);

    keywords.insert("task", token.TASK);

    keywords.insert("true", token.LIT_BOOL(true));
    keywords.insert("false", token.LIT_BOOL(false));

    keywords.insert("in", token.IN);

    keywords.insert("as", token.AS);
    keywords.insert("with", token.WITH);

    keywords.insert("bind", token.BIND);

    keywords.insert("u8", token.MACH(common.ty_u8));
    keywords.insert("u16", token.MACH(common.ty_u16));
    keywords.insert("u32", token.MACH(common.ty_u32));
    keywords.insert("u64", token.MACH(common.ty_u64));
    keywords.insert("i8", token.MACH(common.ty_i8));
    keywords.insert("i16", token.MACH(common.ty_i16));
    keywords.insert("i32", token.MACH(common.ty_i32));
    keywords.insert("i64", token.MACH(common.ty_i64));
    keywords.insert("f32", token.MACH(common.ty_f32));
    keywords.insert("f64", token.MACH(common.ty_f64));

    // Identifiers that have no token of their own but are rejected by
    // next_token().
    auto reserved = new_str_hash[()]();

    reserved.insert("f16", ());  // IEEE 754-2008 'binary16' interchange fmt
    reserved.insert("f80", ());  // IEEE 754-1985 'extended'
    reserved.insert("f128", ()); // IEEE 754-2008 'binary128'
    reserved.insert("m32", ());  // IEEE 754-2008 'decimal32'
    reserved.insert("m64", ());  // IEEE 754-2008 'decimal64'
    reserved.insert("m128", ()); // IEEE 754-2008 'decimal128'
    reserved.insert("dec", ());  // One of m32, m64, m128

    // Prime the current and lookahead characters with two reads.
    // NOTE(review): this assumes call arguments are evaluated left to right,
    // so that `c` receives the first character and `n` the second -- confirm.
    ret reader(rdr, filename, rdr.read_char() as char,
               rdr.read_char() as char, 1u, 0u, 1u, 0u, keywords, reserved);
}
// Character classification helpers.
// True when `c` lies in the inclusive range [`lo`, `hi`].
fn in_range(char c, char lo, char hi) -> bool {
    ret lo <= c && c <= hi;
}
fn is_alpha(char c) -> bool {
ret in_range(c, 'a', 'z') ||
in_range(c, 'A', 'Z');
}
fn is_dec_digit(char c) -> bool {
ret in_range(c, '0', '9');
}
fn is_alnum(char c) -> bool {
ret is_alpha(c) || is_dec_digit(c);
}
// Digit predicates and digit-to-value conversions used by the number scanner.
fn is_hex_digit(char c) -> bool {
ret in_range(c, '0', '9') ||
in_range(c, 'a', 'f') ||
in_range(c, 'A', 'F');
}
fn is_bin_digit(char c) -> bool {
ret c == '0' || c == '1';
}
fn dec_digit_val(char c) -> int {
ret (c as int) - ('0' as int);
}
fn hex_digit_val(char c) -> int {
if (in_range(c, '0', '9')) {
ret (c as int) - ('0' as int);
}
if (in_range(c, 'a', 'f')) {
ret ((c as int) - ('a' as int)) + 10;
}
if (in_range(c, 'A', 'F')) {
ret ((c as int) - ('A' as int)) + 10;
}
fail;
}
// Numeric value of a binary digit: 0 for '0', and 1 for every other
// character (the argument is not validated).
fn bin_digit_value(char c) -> int {
    if (c != '0') {
        ret 1;
    }
    ret 0;
}
// Whitespace and comment skipping.
// True for space, tab, carriage return and newline.
fn is_whitespace(char c) -> bool {
    ret c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
// Skips any run of whitespace at the cursor and then hands over to
// consume_any_line_comment(), which calls back here after each comment.
// Together they leave the cursor on the first character that is neither
// whitespace nor part of a comment.
impure fn consume_any_whitespace(reader rdr) {
    while (is_whitespace(rdr.curr())) {
        rdr.bump();
    }
    be consume_any_line_comment(rdr);
}
// If the cursor is at the start of a comment, skips it and resumes
// whitespace skipping; otherwise leaves the reader untouched.
//
// A `//` comment runs up to (but not including) the next newline or the end
// of input. A `/*` opener is consumed here and the rest of the comment is
// handled by consume_block_comment().
impure fn consume_any_line_comment(reader rdr) {
    if (rdr.curr() == '/') {
        alt (rdr.next()) {
            case ('/') {
                // Also stop at EOF: bump() no longer advances once the end
                // of input is reached, so a comment on the last line with no
                // trailing newline would otherwise spin here forever.
                while (rdr.curr() != '\n' && !rdr.is_eof()) {
                    rdr.bump();
                }
                // Restart whitespace munch.
                be consume_any_whitespace(rdr);
            }
            case ('*') {
                // Step over the "/*" opener.
                rdr.bump();
                rdr.bump();
                be consume_block_comment(rdr);
            }
            case (_) {
                // A lone '/' is an operator, not a comment.
                ret;
            }
        }
    }
}
// Skips the remainder of a block comment whose opening "/*" has already
// been consumed, then resumes whitespace skipping. Block comments nest: each
// inner "/*" must be closed by its own "*/".
//
// Fails with a diagnostic if the input ends before the comment is closed.
impure fn consume_block_comment(reader rdr) {
    let int level = 1;
    while (level > 0) {
        // bump() stops advancing at EOF, so without this check an
        // unterminated comment would loop forever.
        if (rdr.is_eof()) {
            log "unterminated block comment";
            fail;
        }
        if (rdr.curr() == '/' && rdr.next() == '*') {
            rdr.bump();
            rdr.bump();
            level += 1;
        } else {
            if (rdr.curr() == '*' && rdr.next() == '/') {
                rdr.bump();
                rdr.bump();
                level -= 1;
            } else {
                rdr.bump();
            }
        }
    }
    // restart whitespace munch.
    be consume_any_whitespace(rdr);
}
// Consumes a run of decimal digits and '_' separators starting at the
// cursor and returns the value of the digits read (0 if there were none).
// Underscores are skipped and contribute nothing to the value.
impure fn scan_dec_digits(reader rdr) -> int {
    let int total = 0;
    auto ch = rdr.curr();
    while (ch == '_' || is_dec_digit(ch)) {
        if (is_dec_digit(ch)) {
            total = (total * 10) + dec_digit_val(ch);
        }
        rdr.bump();
        ch = rdr.curr();
    }
    ret total;
}
// Scans an optional exponent of the form ('e' | 'E') ['+' | '-'] digits.
//
// Returns none when the cursor is not on 'e' or 'E' (nothing is consumed in
// that case); otherwise consumes the exponent and returns its signed value.
impure fn scan_exponent(reader rdr) -> option.t[int] {
    auto marker = rdr.curr();
    if (marker != 'e' && marker != 'E') {
        ret none[int];
    }
    rdr.bump();

    auto sign = 1;
    auto sign_ch = rdr.curr();
    if (sign_ch == '+') {
        rdr.bump();
    } else if (sign_ch == '-') {
        sign = -1;
        rdr.bump();
    }

    auto magnitude = scan_dec_digits(rdr);
    ret some(sign * magnitude);
}
// Scans a numeric literal starting at the cursor and returns its token.
//
// `c` is the current character of `rdr` (a decimal digit) as seen by the
// caller. Recognised forms:
//   - integers in decimal, hexadecimal ("0x...") or binary ("0b..."), with
//     '_' allowed as a digit separator;
//   - an optional integer suffix: 'u' / 'i', optionally followed by a width
//     of 8, 16, 32 or 64;
//   - floating-point literals with a '.', an optional exponent and an
//     optional "f32" / "f64" suffix, returned as text;
//   - an integer followed directly by an exponent, returned as a float.
//
// Fails if a floating-point literal carries an 'f' suffix that is not
// followed by "32" or "64".
impure fn scan_number(mutable char c, reader rdr) -> token.token {
    auto accum_int = 0;
    auto n = rdr.next();

    if (c == '0' && n == 'x') {
        // Hexadecimal: skip the "0x" prefix.
        rdr.bump();
        rdr.bump();
        c = rdr.curr();
        while (is_hex_digit(c) || c == '_') {
            if (c != '_') {
                accum_int *= 16;
                accum_int += hex_digit_val(c);
            }
            rdr.bump();
            c = rdr.curr();
        }
    } else if (c == '0' && n == 'b') {
        // Binary: skip the "0b" prefix.
        rdr.bump();
        rdr.bump();
        c = rdr.curr();
        while (is_bin_digit(c) || c == '_') {
            if (c != '_') {
                accum_int *= 2;
                accum_int += bin_digit_value(c);
            }
            rdr.bump();
            c = rdr.curr();
        }
    } else {
        accum_int = scan_dec_digits(rdr);
    }

    c = rdr.curr();

    if (c == 'u' || c == 'i') {
        let bool signed = (c == 'i');
        rdr.bump();
        c = rdr.curr();
        if (c == '8') {
            rdr.bump();
            if (signed) {
                ret token.LIT_MACH_INT(common.ty_i8, accum_int);
            } else {
                ret token.LIT_MACH_INT(common.ty_u8, accum_int);
            }
        }

        n = rdr.next();
        if (c == '1' && n == '6') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token.LIT_MACH_INT(common.ty_i16, accum_int);
            } else {
                ret token.LIT_MACH_INT(common.ty_u16, accum_int);
            }
        }
        if (c == '3' && n == '2') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token.LIT_MACH_INT(common.ty_i32, accum_int);
            } else {
                ret token.LIT_MACH_INT(common.ty_u32, accum_int);
            }
        }

        if (c == '6' && n == '4') {
            rdr.bump();
            rdr.bump();
            if (signed) {
                ret token.LIT_MACH_INT(common.ty_i64, accum_int);
            } else {
                ret token.LIT_MACH_INT(common.ty_u64, accum_int);
            }
        }

        // Bare 'i' / 'u' suffix with no width.
        if (signed) {
            ret token.LIT_INT(accum_int);
        } else {
            // FIXME: should cast in the target bit-width.
            ret token.LIT_UINT(accum_int as uint);
        }
    }

    c = rdr.curr();
    if (c == '.') {
        // Parse a floating-point number.
        rdr.bump();

        // Collect the fractional digits as text. Going through an int and
        // printing it back would drop leading zeros ("1.05" would become
        // "1.5") and overflow on long fractions.
        auto frac_str = "";
        c = rdr.curr();
        while (is_dec_digit(c) || c == '_') {
            if (c != '_') {
                _str.push_byte(frac_str, (c as u8));
            }
            rdr.bump();
            c = rdr.curr();
        }
        // "1." is spelled "1.0" in the token text.
        if (_str.eq(frac_str, "")) {
            frac_str = "0";
        }

        auto base_str = _int.to_str(accum_int, 10u) + "." + frac_str;

        auto exponent_str = "";
        let option.t[int] maybe_exponent = scan_exponent(rdr);
        alt(maybe_exponent) {
            case(some[int](?i)) {
                exponent_str = "e" + _int.to_str(i, 10u);
            }
            case(none[int]) {
            }
        }

        c = rdr.curr();
        if (c == 'f') {
            rdr.bump();
            c = rdr.curr();
            n = rdr.next();
            if (c == '3' && n == '2') {
                rdr.bump(); rdr.bump();
                ret token.LIT_MACH_FLOAT(util.common.ty_f32,
                                         base_str + exponent_str);
            }
            else if (c == '6' && n == '4') {
                rdr.bump(); rdr.bump();
                ret token.LIT_MACH_FLOAT(util.common.ty_f64,
                                         base_str + exponent_str);
            }
            // The 'f' is already consumed and cannot be pushed back.
            // Falling through from here would return the integer part as a
            // LIT_INT and silently discard the fraction, so reject instead.
            log "malformed floating-point suffix";
            fail;
        }
        else {
            ret token.LIT_FLOAT(base_str + exponent_str);
        }
    }

    // No '.', but an exponent still makes this a float (e.g. "1e6").
    auto maybe_exponent = scan_exponent(rdr);
    alt(maybe_exponent) {
        case(some[int](?i)) {
            ret token.LIT_FLOAT(_int.to_str(accum_int, 10u)
                                + "e" + _int.to_str(i, 10u));
        }
        case(none[int]) {
            ret token.LIT_INT(accum_int);
        }
    }
}
// Scans and returns the next token from `rdr`.
//
// Leading whitespace and comments are skipped first; token.EOF is returned
// at end of input. The reader's mark is set to the first character of the
// token. Identifiers are looked up in the reader's keyword table, and
// identifiers found in the reserved table are rejected.
//
// Fails (after logging a diagnostic) on a reserved word, an unknown
// character or string escape, an unterminated character or string constant,
// or a character that cannot start any token.
impure fn next_token(reader rdr) -> token.token {
    auto accum_str = "";

    consume_any_whitespace(rdr);

    if (rdr.is_eof()) { ret token.EOF; }

    rdr.mark();
    auto c = rdr.curr();

    // Identifiers, keywords and the lone underscore.
    if (is_alpha(c) || c == '_') {
        while (is_alnum(c) || c == '_') {
            _str.push_byte(accum_str, (c as u8));
            rdr.bump();
            c = rdr.curr();
        }

        if (_str.eq(accum_str, "_")) {
            ret token.UNDERSCORE;
        }

        auto kwds = rdr.get_keywords();
        if (kwds.contains_key(accum_str)) {
            ret kwds.get(accum_str);
        }

        auto rsvd = rdr.get_reserved();
        if (rsvd.contains_key(accum_str)) {
            log "reserved keyword";
            fail;
        }

        ret token.IDENT(accum_str);
    }

    if (is_dec_digit(c)) {
        ret scan_number(c, rdr);
    }

    // Consumes the final character of a binary operator and yields either
    // the plain operator or, when an '=' follows, its assignment form.
    impure fn binop(reader rdr, token.binop op) -> token.token {
        rdr.bump();
        if (rdr.curr() == '=') {
            rdr.bump();
            ret token.BINOPEQ(op);
        } else {
            ret token.BINOP(op);
        }
    }

    alt (c) {
        // One-byte tokens.
        case (':') { rdr.bump(); ret token.COLON; }
        case ('?') { rdr.bump(); ret token.QUES; }
        case (';') { rdr.bump(); ret token.SEMI; }
        case (',') { rdr.bump(); ret token.COMMA; }
        case ('.') { rdr.bump(); ret token.DOT; }
        case ('(') { rdr.bump(); ret token.LPAREN; }
        case (')') { rdr.bump(); ret token.RPAREN; }
        case ('{') { rdr.bump(); ret token.LBRACE; }
        case ('}') { rdr.bump(); ret token.RBRACE; }
        case ('[') { rdr.bump(); ret token.LBRACKET; }
        case (']') { rdr.bump(); ret token.RBRACKET; }
        case ('@') { rdr.bump(); ret token.AT; }
        case ('#') { rdr.bump(); ret token.POUND; }
        case ('~') { rdr.bump(); ret token.TILDE; }

        // Multi-byte tokens.
        case ('=') {
            rdr.bump();
            if (rdr.curr() == '=') {
                rdr.bump();
                ret token.EQEQ;
            } else {
                ret token.EQ;
            }
        }

        case ('!') {
            rdr.bump();
            if (rdr.curr() == '=') {
                rdr.bump();
                ret token.NE;
            } else {
                ret token.NOT;
            }
        }

        case ('<') {
            rdr.bump();
            alt (rdr.curr()) {
                case ('=') {
                    rdr.bump();
                    ret token.LE;
                }
                case ('<') {
                    // "<<" or "<<="; binop consumes the second '<'.
                    ret binop(rdr, token.LSL);
                }
                case ('-') {
                    rdr.bump();
                    ret token.LARROW;
                }
                case ('|') {
                    rdr.bump();
                    ret token.SEND;
                }
                case (_) {
                    ret token.LT;
                }
            }
        }

        case ('>') {
            rdr.bump();
            alt (rdr.curr()) {
                case ('=') {
                    rdr.bump();
                    ret token.GE;
                }

                case ('>') {
                    if (rdr.next() == '>') {
                        // ">>>" or ">>>=": skip the second '>', binop
                        // consumes the third.
                        rdr.bump();
                        ret binop(rdr, token.ASR);
                    } else {
                        ret binop(rdr, token.LSR);
                    }
                }

                case (_) {
                    ret token.GT;
                }
            }
        }

        case ('\'') {
            rdr.bump();
            auto c2 = rdr.curr();
            if (c2 == '\\') {
                // The cursor is on the backslash; each arm steps onto the
                // escape character and records the character it denotes.
                alt (rdr.next()) {
                    case ('n') { rdr.bump(); c2 = '\n'; }
                    case ('r') { rdr.bump(); c2 = '\r'; }
                    case ('t') { rdr.bump(); c2 = '\t'; }
                    case ('\\') { rdr.bump(); c2 = '\\'; }
                    case ('\'') { rdr.bump(); c2 = '\''; }
                    // FIXME: unicode numeric escapes.
                    case (?c2) {
                        log "unknown character escape";
                        log c2;
                        fail;
                    }
                }
            }

            if (rdr.next() != '\'') {
                log "unterminated character constant";
                fail;
            }
            // Step over the character and the closing quote.
            rdr.bump();
            rdr.bump();
            ret token.LIT_CHAR(c2);
        }

        case ('"') {
            rdr.bump();
            // FIXME: general utf8-consumption support.
            while (rdr.curr() != '"') {
                // bump() stops advancing at EOF, so without this check an
                // unterminated string would loop forever while growing
                // accum_str.
                if (rdr.is_eof()) {
                    log "unterminated string constant";
                    fail;
                }
                alt (rdr.curr()) {
                    case ('\\') {
                        // Each arm steps onto the escape character; the
                        // bump after this alt then steps past it.
                        alt (rdr.next()) {
                            case ('n') {
                                rdr.bump();
                                _str.push_byte(accum_str, '\n' as u8);
                            }
                            case ('r') {
                                rdr.bump();
                                _str.push_byte(accum_str, '\r' as u8);
                            }
                            case ('t') {
                                rdr.bump();
                                _str.push_byte(accum_str, '\t' as u8);
                            }
                            case ('\\') {
                                rdr.bump();
                                _str.push_byte(accum_str, '\\' as u8);
                            }
                            case ('"') {
                                rdr.bump();
                                _str.push_byte(accum_str, '"' as u8);
                            }
                            // FIXME: unicode numeric escapes.
                            case (?c2) {
                                log "unknown string escape";
                                log c2;
                                fail;
                            }
                        }
                    }
                    case (_) {
                        _str.push_byte(accum_str, rdr.curr() as u8);
                    }
                }
                rdr.bump();
            }
            // Step over the closing quote.
            rdr.bump();
            ret token.LIT_STR(accum_str);
        }

        case ('-') {
            if (rdr.next() == '>') {
                rdr.bump();
                rdr.bump();
                ret token.RARROW;
            } else {
                ret binop(rdr, token.MINUS);
            }
        }

        case ('&') {
            if (rdr.next() == '&') {
                rdr.bump();
                rdr.bump();
                ret token.ANDAND;
            } else {
                ret binop(rdr, token.AND);
            }
        }

        case ('|') {
            if (rdr.next() == '|') {
                rdr.bump();
                rdr.bump();
                ret token.OROR;
            } else {
                ret binop(rdr, token.OR);
            }
        }

        case ('+') {
            ret binop(rdr, token.PLUS);
        }

        case ('*') {
            ret binop(rdr, token.STAR);
        }

        case ('/') {
            ret binop(rdr, token.SLASH);
        }

        case ('^') {
            ret binop(rdr, token.CARET);
        }

        case ('%') {
            ret binop(rdr, token.PERCENT);
        }
    }

    // No token starts with this character.
    log "unknown start of token";
    log c;
    fail;
}
//
// Local Variables:
// mode: rust
// fill-column: 78;
// indent-tabs-mode: nil
// c-basic-offset: 4
// buffer-file-coding-system: utf-8-unix
// compile-command: "make -k -C ../.. 2>&1 | sed -e 's/\\/x\\//x:\\//g'";
// End:
//