From cb362626f326a565aca34c1a11c95dcb7152b798 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 1 Jan 2018 18:58:46 +0300 Subject: [PATCH] Parser: guess what? Groundwork! --- grammar.ron | 10 ++- src/bin/gen.rs | 19 ++++- src/lexer/mod.rs | 3 + src/lexer/ptr.rs | 5 ++ src/parser/event_parser/grammar.rs | 62 +++++++++++++- src/parser/event_parser/parser.rs | 15 +++- src/syntax_kinds.rs | 130 +++++++++++++++++------------ tests/data/lexer/0011_keywords.rs | 1 + tests/data/lexer/0011_keywords.txt | 12 +++ 9 files changed, 199 insertions(+), 58 deletions(-) create mode 100644 tests/data/lexer/0011_keywords.rs create mode 100644 tests/data/lexer/0011_keywords.txt diff --git a/grammar.ron b/grammar.ron index 439c4ef9c81..fb2c6d90ed3 100644 --- a/grammar.ron +++ b/grammar.ron @@ -1,4 +1,12 @@ Grammar( + keywords: [ + "use", + "fn", + "struct", + "enum", + "trait", + "impl", + ], tokens: [ "ERROR", "IDENT", @@ -53,6 +61,6 @@ Grammar( "SHEBANG", ], nodes: [ - "FILE" + "FILE", ] ) \ No newline at end of file diff --git a/src/bin/gen.rs b/src/bin/gen.rs index f5a66d9f254..9d7f7e3893a 100644 --- a/src/bin/gen.rs +++ b/src/bin/gen.rs @@ -17,6 +17,7 @@ fn main() { #[derive(Deserialize)] struct Grammar { + keywords: Vec, tokens: Vec, nodes: Vec, } @@ -33,8 +34,10 @@ impl Grammar { acc.push_str("use tree::{SyntaxKind, SyntaxInfo};\n"); acc.push_str("\n"); - let syntax_kinds: Vec<&String> = - self.tokens.iter().chain(self.nodes.iter()) + let syntax_kinds: Vec = + self.keywords.iter().map(|kw| kw_token(kw)) + .chain(self.tokens.iter().cloned()) + .chain(self.nodes.iter().cloned()) .collect(); for (idx, kind) in syntax_kinds.iter().enumerate() { @@ -60,6 +63,14 @@ impl Grammar { acc.push_str("pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {\n"); acc.push_str(" &INFOS[kind.0 as usize]\n"); + acc.push_str("}\n\n"); + acc.push_str("pub(crate) fn ident_to_keyword(ident: &str) -> Option {\n"); + acc.push_str(" match ident {\n"); + for kw in self.keywords.iter() { + write!(acc, " {:?} => Some({}),\n", kw, kw_token(kw)).unwrap(); + } + acc.push_str(" _ => None,\n"); + acc.push_str(" }\n"); acc.push_str("}\n"); acc } @@ -77,4 +88,8 @@ fn generated_file() -> PathBuf { fn scream(word: &str) -> String { word.chars().map(|c| c.to_ascii_uppercase()).collect() +} + +fn kw_token(keyword: &str) -> String { + format!("{}_KW", scream(keyword)) } \ No newline at end of file diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 7c425976307..bc5344b5ffa 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -187,6 +187,9 @@ fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { return if c == '_' { UNDERSCORE } else { IDENT }; } ptr.bump_while(is_ident_continue); + if let Some(kind) = ident_to_keyword(ptr.current_token_text()) { + return kind; + } IDENT } diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index 2f759119af7..ff6ef11fcd2 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs @@ -59,6 +59,11 @@ impl<'s> Ptr<'s> { } } + pub fn current_token_text(&self) -> &str { + let len: u32 = self.len.into(); + &self.text[..len as usize] + } + fn chars(&self) -> Chars { let len: u32 = self.len.into(); self.text[len as usize ..].chars() diff --git a/src/parser/event_parser/grammar.rs b/src/parser/event_parser/grammar.rs index c3496cccd64..5219ed535de 100644 --- a/src/parser/event_parser/grammar.rs +++ b/src/parser/event_parser/grammar.rs @@ -3,8 +3,68 @@ use super::parser::Parser; use syntax_kinds::*; +// Items // + pub fn file(p: &mut Parser) { p.start(FILE); - //TODO: parse_shebang + shebang(p); + inner_attributes(p); + mod_items(p); + p.finish(); +} + +type Result = ::std::result::Result<(), ()>; +const OK: Result = Ok(()); +const ERR: Result = Err(()); + +fn shebang(_: &mut Parser) { + //TODO +} + +fn inner_attributes(_: &mut Parser) { + //TODO +} + +fn mod_items(p: &mut Parser) { + loop { + skip_until_item(p); + if p.is_eof() { + return; + } + if item(p).is_err() { + skip_one_token(p); + } + } +} + +fn item(p: &mut Parser) -> Result { + outer_attributes(p)?; + visibility(p)?; + ERR +} + + + +// Paths, types, attributes, and stuff // + +fn outer_attributes(_: &mut Parser) -> Result { + OK +} + +fn visibility(_: &mut Parser) -> Result { + OK +} + +// Expressions // + +// Error recovery and high-order utils // + +fn skip_until_item(_: &mut Parser) { + //TODO +} + +fn skip_one_token(p: &mut Parser) { + p.start(ERROR); + p.bump().unwrap(); p.finish(); } \ No newline at end of file diff --git a/src/parser/event_parser/parser.rs b/src/parser/event_parser/parser.rs index 9592b90c9d8..0e4d44b7984 100644 --- a/src/parser/event_parser/parser.rs +++ b/src/parser/event_parser/parser.rs @@ -34,10 +34,14 @@ impl<'t> Parser<'t> { } pub(crate) fn into_events(self) -> Vec { - assert!(self.pos == self.non_ws_tokens.len()); + assert!(self.is_eof()); self.events } + pub(crate) fn is_eof(&self) -> bool { + self.pos == self.non_ws_tokens.len() + } + pub(crate) fn start(&mut self, kind: SyntaxKind) { self.event(Event::Start { kind }); } @@ -46,6 +50,15 @@ impl<'t> Parser<'t> { self.event(Event::Finish); } + pub(crate) fn bump(&mut self) -> Option { + if self.is_eof() { + return None; + } + let idx = self.non_ws_tokens[self.pos].0; + self.pos += 1; + Some(self.raw_tokens[idx].kind) + } + fn event(&mut self, event: Event) { self.events.push(event) } diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index b83f48dd8ce..a1bcad06254 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs @@ -1,60 +1,72 @@ // Generated from grammar.ron use tree::{SyntaxKind, SyntaxInfo}; -pub const ERROR: SyntaxKind = SyntaxKind(0); -pub const IDENT: SyntaxKind = SyntaxKind(1); -pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); -pub const WHITESPACE: SyntaxKind = SyntaxKind(3); -pub const INT_NUMBER: SyntaxKind = SyntaxKind(4); -pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5); -pub const SEMI: SyntaxKind = SyntaxKind(6); -pub const COMMA: SyntaxKind = SyntaxKind(7); -pub const DOT: SyntaxKind = SyntaxKind(8); -pub const DOTDOT: SyntaxKind = SyntaxKind(9); -pub const DOTDOTDOT: SyntaxKind = SyntaxKind(10); -pub const DOTDOTEQ: SyntaxKind = SyntaxKind(11); -pub const L_PAREN: SyntaxKind = SyntaxKind(12); -pub const R_PAREN: SyntaxKind = SyntaxKind(13); -pub const L_CURLY: SyntaxKind = SyntaxKind(14); -pub const R_CURLY: SyntaxKind = SyntaxKind(15); -pub const L_BRACK: SyntaxKind = SyntaxKind(16); -pub const R_BRACK: SyntaxKind = SyntaxKind(17); -pub const L_ANGLE: SyntaxKind = SyntaxKind(18); -pub const R_ANGLE: SyntaxKind = SyntaxKind(19); -pub const AT: SyntaxKind = SyntaxKind(20); -pub const POUND: SyntaxKind = SyntaxKind(21); -pub const TILDE: SyntaxKind = SyntaxKind(22); -pub const QUESTION: SyntaxKind = SyntaxKind(23); -pub const COLON: SyntaxKind = SyntaxKind(24); -pub const COLONCOLON: SyntaxKind = SyntaxKind(25); -pub const DOLLAR: SyntaxKind = SyntaxKind(26); -pub const EQ: SyntaxKind = SyntaxKind(27); -pub const EQEQ: SyntaxKind = SyntaxKind(28); -pub const FAT_ARROW: SyntaxKind = SyntaxKind(29); -pub const NEQ: SyntaxKind = SyntaxKind(30); -pub const NOT: SyntaxKind = SyntaxKind(31); -pub const LIFETIME: SyntaxKind = SyntaxKind(32); -pub const CHAR: SyntaxKind = SyntaxKind(33); -pub const BYTE: SyntaxKind = SyntaxKind(34); -pub const STRING: SyntaxKind = SyntaxKind(35); -pub const RAW_STRING: SyntaxKind = SyntaxKind(36); -pub const BYTE_STRING: SyntaxKind = SyntaxKind(37); -pub const RAW_BYTE_STRING: SyntaxKind = SyntaxKind(38); -pub const PLUS: SyntaxKind = SyntaxKind(39); -pub const MINUS: SyntaxKind = SyntaxKind(40); -pub const STAR: SyntaxKind = SyntaxKind(41); -pub const SLASH: SyntaxKind = SyntaxKind(42); -pub const CARET: SyntaxKind = SyntaxKind(43); -pub const PERCENT: SyntaxKind = SyntaxKind(44); -pub const AMPERSAND: SyntaxKind = SyntaxKind(45); -pub const PIPE: SyntaxKind = SyntaxKind(46); -pub const THIN_ARROW: SyntaxKind = SyntaxKind(47); -pub const COMMENT: SyntaxKind = SyntaxKind(48); -pub const DOC_COMMENT: SyntaxKind = SyntaxKind(49); -pub const SHEBANG: SyntaxKind = SyntaxKind(50); -pub const FILE: SyntaxKind = SyntaxKind(51); +pub const USE_KW: SyntaxKind = SyntaxKind(0); +pub const FN_KW: SyntaxKind = SyntaxKind(1); +pub const STRUCT_KW: SyntaxKind = SyntaxKind(2); +pub const ENUM_KW: SyntaxKind = SyntaxKind(3); +pub const TRAIT_KW: SyntaxKind = SyntaxKind(4); +pub const IMPL_KW: SyntaxKind = SyntaxKind(5); +pub const ERROR: SyntaxKind = SyntaxKind(6); +pub const IDENT: SyntaxKind = SyntaxKind(7); +pub const UNDERSCORE: SyntaxKind = SyntaxKind(8); +pub const WHITESPACE: SyntaxKind = SyntaxKind(9); +pub const INT_NUMBER: SyntaxKind = SyntaxKind(10); +pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(11); +pub const SEMI: SyntaxKind = SyntaxKind(12); +pub const COMMA: SyntaxKind = SyntaxKind(13); +pub const DOT: SyntaxKind = SyntaxKind(14); +pub const DOTDOT: SyntaxKind = SyntaxKind(15); +pub const DOTDOTDOT: SyntaxKind = SyntaxKind(16); +pub const DOTDOTEQ: SyntaxKind = SyntaxKind(17); +pub const L_PAREN: SyntaxKind = SyntaxKind(18); +pub const R_PAREN: SyntaxKind = SyntaxKind(19); +pub const L_CURLY: SyntaxKind = SyntaxKind(20); +pub const R_CURLY: SyntaxKind = SyntaxKind(21); +pub const L_BRACK: SyntaxKind = SyntaxKind(22); +pub const R_BRACK: SyntaxKind = SyntaxKind(23); +pub const L_ANGLE: SyntaxKind = SyntaxKind(24); +pub const R_ANGLE: SyntaxKind = SyntaxKind(25); +pub const AT: SyntaxKind = SyntaxKind(26); +pub const POUND: SyntaxKind = SyntaxKind(27); +pub const TILDE: SyntaxKind = SyntaxKind(28); +pub const QUESTION: SyntaxKind = SyntaxKind(29); +pub const COLON: SyntaxKind = SyntaxKind(30); +pub const COLONCOLON: SyntaxKind = SyntaxKind(31); +pub const DOLLAR: SyntaxKind = SyntaxKind(32); +pub const EQ: SyntaxKind = SyntaxKind(33); +pub const EQEQ: SyntaxKind = SyntaxKind(34); +pub const FAT_ARROW: SyntaxKind = SyntaxKind(35); +pub const NEQ: SyntaxKind = SyntaxKind(36); +pub const NOT: SyntaxKind = SyntaxKind(37); +pub const LIFETIME: SyntaxKind = SyntaxKind(38); +pub const CHAR: SyntaxKind = SyntaxKind(39); +pub const BYTE: SyntaxKind = SyntaxKind(40); +pub const STRING: SyntaxKind = SyntaxKind(41); +pub const RAW_STRING: SyntaxKind = SyntaxKind(42); +pub const BYTE_STRING: SyntaxKind = SyntaxKind(43); +pub const RAW_BYTE_STRING: SyntaxKind = SyntaxKind(44); +pub const PLUS: SyntaxKind = SyntaxKind(45); +pub const MINUS: SyntaxKind = SyntaxKind(46); +pub const STAR: SyntaxKind = SyntaxKind(47); +pub const SLASH: SyntaxKind = SyntaxKind(48); +pub const CARET: SyntaxKind = SyntaxKind(49); +pub const PERCENT: SyntaxKind = SyntaxKind(50); +pub const AMPERSAND: SyntaxKind = SyntaxKind(51); +pub const PIPE: SyntaxKind = SyntaxKind(52); +pub const THIN_ARROW: SyntaxKind = SyntaxKind(53); +pub const COMMENT: SyntaxKind = SyntaxKind(54); +pub const DOC_COMMENT: SyntaxKind = SyntaxKind(55); +pub const SHEBANG: SyntaxKind = SyntaxKind(56); +pub const FILE: SyntaxKind = SyntaxKind(57); -static INFOS: [SyntaxInfo; 52] = [ +static INFOS: [SyntaxInfo; 58] = [ + SyntaxInfo { name: "USE_KW" }, + SyntaxInfo { name: "FN_KW" }, + SyntaxInfo { name: "STRUCT_KW" }, + SyntaxInfo { name: "ENUM_KW" }, + SyntaxInfo { name: "TRAIT_KW" }, + SyntaxInfo { name: "IMPL_KW" }, SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "UNDERSCORE" }, @@ -112,3 +124,15 @@ static INFOS: [SyntaxInfo; 52] = [ pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { &INFOS[kind.0 as usize] } + +pub(crate) fn ident_to_keyword(ident: &str) -> Option { + match ident { + "use" => Some(USE_KW), + "fn" => Some(FN_KW), + "struct" => Some(STRUCT_KW), + "enum" => Some(ENUM_KW), + "trait" => Some(TRAIT_KW), + "impl" => Some(IMPL_KW), + _ => None, + } +} diff --git a/tests/data/lexer/0011_keywords.rs b/tests/data/lexer/0011_keywords.rs new file mode 100644 index 00000000000..aa89d70c511 --- /dev/null +++ b/tests/data/lexer/0011_keywords.rs @@ -0,0 +1 @@ +fn use struct trait enum impl diff --git a/tests/data/lexer/0011_keywords.txt b/tests/data/lexer/0011_keywords.txt new file mode 100644 index 00000000000..d90047d1e8c --- /dev/null +++ b/tests/data/lexer/0011_keywords.txt @@ -0,0 +1,12 @@ +FN_KW 2 "fn" +WHITESPACE 1 " " +USE_KW 3 "use" +WHITESPACE 1 " " +STRUCT_KW 6 "struct" +WHITESPACE 1 " " +TRAIT_KW 5 "trait" +WHITESPACE 1 " " +ENUM_KW 4 "enum" +WHITESPACE 1 " " +IMPL_KW 4 "impl" +WHITESPACE 1 "\n"