Start library

2017-12-28 23:55:16 +03:00 · 2017-12-28 23:55:16 +03:00 · e132280844
parent 268cb2a04e
commit e132280844
5 changed files with 113 additions and 190 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+/target/
+**/*.rs.bk
+Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,6 @@
+[package]
+name = "libsyntax2"
+version = "0.1.0"
+authors = ["Aleksey Kladov <aleksey.kladov@gmail.com>"]
+
+[dependencies]
--- a/minirust.rs
+++ b/minirust.rs
@ -1,152 +0,0 @@
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-pub struct NodeKind(u16);
-
-pub struct File {
-    text: String,
-    nodes: Vec<NodeData>,
-}
-
-struct NodeData {
-    kind: NodeKind,
-    range: (u32, u32),
-    parent: Option<u32>,
-    first_child: Option<u32>,
-    next_sibling: Option<u32>,
-}
-
-#[derive(Clone, Copy)]
-pub struct Node<'f> {
-    file: &'f File,
-    idx: u32,
-}
-
-pub struct Children<'f> {
-    next: Option<Node<'f>>,
-}
-
-impl File {
-    pub fn root<'f>(&'f self) -> Node<'f> {
-        assert!(!self.nodes.is_empty());
-        Node { file: self, idx: 0 }
-    }
-}
-
-impl<'f> Node<'f> {
-    pub fn kind(&self) -> NodeKind {
-        self.data().kind
-    }
-
-    pub fn text(&self) -> &'f str {
-        let (start, end) = self.data().range;
-        &self.file.text[start as usize..end as usize]
-    }
-
-    pub fn parent(&self) -> Option<Node<'f>> {
-        self.as_node(self.data().parent)
-    }
-
-    pub fn children(&self) -> Children<'f> {
-        Children { next: self.as_node(self.data().first_child) }
-    }
-
-    fn data(&self) -> &'f NodeData {
-        &self.file.nodes[self.idx as usize]
-    }
-
-    fn as_node(&self, idx: Option<u32>) -> Option<Node<'f>> {
-        idx.map(|idx| Node { file: self.file, idx })
-    }
-}
-
-impl<'f> Iterator for Children<'f> {
-    type Item = Node<'f>;
-
-    fn next(&mut self) -> Option<Node<'f>> {
-        let next = self.next;
-        self.next = next.and_then(|node| node.as_node(node.data().next_sibling));
-        next
-    }
-}
-
-pub const ERROR: NodeKind = NodeKind(0);
-pub const WHITESPACE: NodeKind = NodeKind(1);
-pub const STRUCT_KW: NodeKind = NodeKind(2);
-pub const IDENT: NodeKind = NodeKind(3);
-pub const L_CURLY: NodeKind = NodeKind(4);
-pub const R_CURLY: NodeKind = NodeKind(5);
-pub const COLON: NodeKind = NodeKind(6);
-pub const COMMA: NodeKind = NodeKind(7);
-pub const AMP: NodeKind = NodeKind(8);
-pub const LINE_COMMENT: NodeKind = NodeKind(9);
-pub const FILE: NodeKind = NodeKind(10);
-pub const STRUCT_DEF: NodeKind = NodeKind(11);
-pub const FIELD_DEF: NodeKind = NodeKind(12);
-pub const TYPE_REF: NodeKind = NodeKind(13);
-
-
-pub trait AstNode<'f>: Copy + 'f {
-    fn new(node: Node<'f>) -> Option<Self>;
-    fn node(&self) -> Node<'f>;
-}
-
-pub fn child_of_kind<'f>(node: Node<'f>, kind: NodeKind) -> Option<Node<'f>> {
-    node.children().find(|child| child.kind() == kind)
-}
-
-pub fn ast_children<'f, A: AstNode<'f>>(node: Node<'f>) -> Box<Iterator<Item=A> + 'f> {
-    Box::new(node.children().filter_map(A::new))
-}
-
-#[derive(Clone, Copy)]
-pub struct StructDef<'f>(Node<'f>);
-
-#[derive(Clone, Copy)]
-pub struct FieldDef<'f>(Node<'f>);
-
-#[derive(Clone, Copy)]
-pub struct TypeRef<'f>(Node<'f>);
-
-pub trait NameOwner<'f>: AstNode<'f> {
-    fn name_ident(&self) -> Node<'f> {
-        child_of_kind(self.node(), IDENT).unwrap()
-    }
-
-    fn name(&self) -> &'f str { self.name_ident().text() }
-}
-
-
-impl<'f> AstNode<'f> for StructDef<'f> {
-    fn new(node: Node<'f>) -> Option<Self> {
-        if node.kind() == STRUCT_DEF { Some(StructDef(node)) } else { None }
-    }
-    fn node(&self) -> Node<'f> { self.0 }
-}
-
-impl<'f> AstNode<'f> for FieldDef<'f> {
-    fn new(node: Node<'f>) -> Option<Self> {
-        if node.kind() == FIELD_DEF { Some(FieldDef(node)) } else { None }
-    }
-    fn node(&self) -> Node<'f> { self.0 }
-}
-
-impl<'f> AstNode<'f> for TypeRef<'f> {
-    fn new(node: Node<'f>) -> Option<Self> {
-        if node.kind() == TYPE_REF { Some(TypeRef(node)) } else { None }
-    }
-    fn node(&self) -> Node<'f> { self.0 }
-}
-
-impl<'f> NameOwner<'f> for StructDef<'f> {}
-impl<'f> NameOwner<'f> for FieldDef<'f> {}
-
-impl<'f> StructDef<'f> {
-    pub fn fields(&self) -> Box<Iterator<Item=FieldDef<'f>> + 'f> {
-        ast_children(self.node())
-    }
-}
-
-impl<'f> FieldDef<'f> {
-    pub fn type_ref(&self) -> Option<TypeRef<'f>> {
-        ast_children(self.node()).next()
-    }
-}
--- a/rfc.md
+++ b/rfc.md
@ -30,12 +30,66 @@ other tools, and eventual libsyntax removal.

 Note that this RFC does not propose to stabilize any API for working
 with rust syntax: the semver version of the hypothetical library would
-be `0.1.0`.
+be `0.1.0`. It is intended to be used by tools, which are currently
+closely related to the compiler: `rustc`, `rustfmt`, `clippy`, `rls`
+and hypothetical `rustfix`. While it would be possible to create
+third-party tools on top of the new libsyntax, the burden of adapting
+to breaking changes would be on authors of such tools.


 # Motivation
 [motivation]: #motivation

+There are two main drawbacks with the current version of libsyntax:
+
+* It is tightly integrated with the compiler and hard to use
+  independently
+
+* The AST representation is not well-suited for use inside IDEs
+
+
+## IDE support
+
+There are several differences in how IDEs and compilers typically
+treat source code.
+
+In the compiler, it is convenient to transform the source
+code into Abstract Syntax Tree form, which is independent of the
+surface syntax. For example, it's convenient to discard comments,
+whitespaces and desugar some syntactic constructs in terms of the
+simpler ones.
+
+In contrast, IDEs work much closer to the source code, so it is
+crucial to preserve full information about the original text. For
+example, an IDE may adjust indentation after typing a `}` which closes a
+block, and to do this correctly, the IDE must be aware of syntax (that is,
+that `}` indeed closes some block, and is not a syntax error) and of
+all whitespace and comments. So, an IDE-suitable AST should explicitly
+account for syntactic elements, not considered important by the
+compiler.
+
+Another difference is that IDEs typically work with incomplete and
+syntactically invalid code. This boils down to two parser properties.
+First, the parser must produce syntax tree even if some required input
+is missing. For example, for input `fn foo` the function node should
+be present in the parse, despite the fact that there are no parameters
+or body. Second, the parser must be able to skip over parts of input
+it can't recognize and aggressively recover from errors. That is, the
+syntax tree data structure should be able to handle both missing and
+extra nodes.
+
+IDEs also need the ability to incrementally reparse and relex source
+code after the user types. A smart IDE would use syntax tree structure
+to handle editing commands (for example, to add/remove trailing commas
+after join/split lines actions), so parsing time can be very
+noticeable.
+
+
+Currently rustc uses the classical AST approach, and preserves some of
+the source code information in the form of spans in the AST. It is not
+clear if this structure can fulfill all IDE requirements.
+
+
 ## Reusability

 In theory, the parser can be a pure function, which takes a `&str` as
@ -67,29 +121,6 @@ files. As a data point, it turned out to be easier to move `rustfmt`
 into the main `rustc` repository than to move libsyntax outside!


-## IDE support
-
-There is one big difference in how IDEs and compilers typically treat
-source code.
-
-In the compiler, it is convenient to transform the source
-code into Abstract Syntax Tree form, which is independent of the
-surface syntax. For example, it's convenient to discard comments,
-whitespaces and desugar some syntactic constructs in terms of the
-simpler ones.
-
-In contrast, for IDEs it is crucial to have a lossless view of the
-source code because, for example, it's important to preserve comments
-during refactorings. Ideally, IDEs should be able to incrementally
-relex and reparse the file as the user types, because syntax tree is
-necessary to correctly handle certain code-editing actions like
-autoindentation or joining lines. IDE also must be able to produce
-partial parse trees when some input is missing or invalid.
-
-Currently rustc uses the AST approach, and preserves some of the
-source code information in the form of spans in the AST.
-
-
 # Guide-level explanation
 [guide-level-explanation]: #guide-level-explanation

@ -99,11 +130,33 @@ Not applicable.
 # Reference-level explanation
 [reference-level-explanation]: #reference-level-explanation

-This section proposes a new syntax tree data structure, which should
-be suitable for both compiler and IDE. It is heavily inspired by [PSI]
-data structure which used in [IntelliJ] based IDEs and in the [Kotlin]
-compiler.
+It is not clear if a single parser can accommodate the needs of the
+compiler and the IDE, but there is hope that it is possible. The RFC
+proposes to develop libsyntax2.0 as an experimental crates.io crate. If
+the experiment turns out to be a success, the second RFC will propose
+to integrate it with all existing tools and `rustc`.

+Next, a syntax tree data structure is proposed for libsyntax2.0. It
+seems to have the following important properties:
+
+* It is lossless and faithfully represents the original source code,
+  including explicit nodes for comments and whitespace.
+
+* It is flexible and allows encoding arbitrary node structure,
+  even for invalid syntax.
+
+* It is minimal: it stores a small amount of data and has no
+  dependencies. For instance, it does not need compiler's string
+  interner or literal data representation.
+
+* While the tree itself is minimal, it is extensible in a sense that
+  it is possible to associate arbitrary data with certain nodes in a
+  type-safe way.
+
+
+It is not clear if this representation is the best one. It is heavily
+inspired by [PSI] data structure which is used in [IntelliJ] based IDEs
+and in the [Kotlin] compiler.

 [PSI]: http://www.jetbrains.org/intellij/sdk/docs/reference_guide/custom_language_support/implementing_parser_and_psi.html
 [IntelliJ]: https://github.com/JetBrains/intellij-community/
@ -351,6 +404,11 @@ impl<'f> AstNode<'f> for TypeRef<'f> {
 }
 ```

+Note that although AST wrappers provide type-safe access to the
+tree, they are still represented as indexes, so clients of the syntax
+tree can easily associate additional data with AST nodes by storing
+it in a side-table.
+

 ## Missing Source Code

@ -374,7 +432,8 @@ This RFC proposes huge changes to the internals of the compiler, so
 it's important to proceed carefully and incrementally. The following
 plan is suggested:

-* RFC discussion about the theoretical feasibility of the proposal.
+* RFC discussion about the theoretical feasibility of the proposal,
+  and the best representation for the syntax tree.

 * Implementation of the proposal as a completely separate crates.io
  crate, by refactoring existing libsyntax source code to produce a
@ -394,10 +453,10 @@ plan is suggested:
  experiemt on crates.io. However, actually using it in the compiler
  and other tools would require massive refactorings.

- Proposed syntax tree requires to keep the original source code
-  available, which might increase memory usage of the
-  compiler. However, it should be possible to throw the original tree
-  and source code away after conversion to HIR.
+- It's difficult to know upfront if the proposed syntax tree would
+  actually work well in both the compiler and IDE. It may be possible
+  that some drawbacks will be discovered during implementation.
+

 # Rationale and alternatives
 [alternatives]: #alternatives
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,7 @@
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn it_works() {
+        assert_eq!(2 + 2, 4);
+    }
+}