From 4f9e1ba80b10a61c02fdcd842117f0234f06b439 Mon Sep 17 00:00:00 2001 From: Graydon Hoare Date: Fri, 13 Jan 2012 15:05:12 -0800 Subject: [PATCH] More doc porting. --- doc/rust.md | 409 +++++++++++++++++++++++++++++++++---- src/etc/extract_grammar.py | 4 + 2 files changed, 377 insertions(+), 36 deletions(-) diff --git a/doc/rust.md b/doc/rust.md index e026b2ef860..b498751e880 100644 --- a/doc/rust.md +++ b/doc/rust.md @@ -74,9 +74,9 @@ Where: Unicode codepoint `U+00QQ`. - `IDENTIFIER` is a nonempty string of ASCII letters and underscores. - The `repeat` forms apply to the adjacent `element`, and are as follows: - - `'?'` means zero or one repetition - - `'*'` means zero or more repetitions - - `'+'` means one or more repetitions + - `?` means zero or one repetition + - `*` means zero or more repetitions + - `+` means one or more repetitions - NUMBER trailing a repeat symbol gives a maximum repetition count - NUMBER on its own gives an exact repetition count @@ -107,7 +107,7 @@ the [token](#tokens) rule, and are assumed to be the result of a lexical-analysis phase feeding the parser, driven by a DFA, operating over the disjunction of all such string table entries. -When such a string enclosed in double-quotes (`'"'`) occurs inside the +When such a string enclosed in double-quotes (`"`) occurs inside the grammar, it is an implicit reference to a single member of such a string table production. See [tokens](#tokens) for more information. 
@@ -150,10 +150,10 @@ Some productions are defined by exclusion of particular Unicode characters: - `non_null` is any single Unicode character aside from `U+0000` (null) - `non_eol` is `non_null` restricted to exclude `U+000A` (`'\n'`) - - `non_star` is `non_null` restricted to exclude `U+002A` (`'*'`) - - `non_slash` is `non_null` restricted to exclude `U+002F` (`'/'`) - - `non_single_quote` is `non_null` restricted to exclude `U+0027` (`'\''`) - - `non_double_quote` is `non_null` restricted to exclude `U+0022` (`'\"'`) + - `non_star` is `non_null` restricted to exclude `U+002A` (`*`) + - `non_slash` is `non_null` restricted to exclude `U+002F` (`/`) + - `non_single_quote` is `non_null` restricted to exclude `U+0027` (`'`) + - `non_double_quote` is `non_null` restricted to exclude `U+0022` (`"`) ## Comments @@ -165,7 +165,7 @@ line_comment : "//" non_eol * ; ~~~~~~~~ Comments in Rust code follow the general C++ style of line and block-comment -forms, with proper nesting of block-comment delimeters. Comments are +forms, with proper nesting of block-comment delimiters. Comments are interpreted as a form of whitespace. ## Whitespace @@ -284,29 +284,29 @@ nonzero_dec: '1' | '2' | '3' | '4' A _character literal_ is a single Unicode character enclosed within two `U+0027` (single-quote) characters, with the exception of `U+0027` itself, -which must be _escaped_ by a preceding U+005C character (`'\'`). +which must be _escaped_ by a preceding U+005C character (`\`). A _string literal_ is a sequence of any Unicode characters enclosed within two `U+0022` (double-quote) characters, with the exception of `U+0022` -itself, which must be _escaped_ by a preceding `U+005C` character (`'\'`). +itself, which must be _escaped_ by a preceding `U+005C` character (`\`). Some additional _escapes_ are available in either character or string -literals. An escape starts with a `U+005C` (`'\'`) and continues with one of +literals. 
An escape starts with a `U+005C` (`\`) and continues with one of the following forms: - * An _8-bit codepoint escape_ escape starts with `U+0078` (`'x'`) and is + * An _8-bit codepoint escape_ starts with `U+0078` (`x`) and is followed by exactly two _hex digits_. It denotes the Unicode codepoint equal to the provided hex value. - * A _16-bit codepoint escape_ starts with `U+0075` (`'u'`) and is followed + * A _16-bit codepoint escape_ starts with `U+0075` (`u`) and is followed by exactly four _hex digits_. It denotes the Unicode codepoint equal to the provided hex value. - * A _32-bit codepoint escape_ starts with `U+0055` (`'U'`) and is followed + * A _32-bit codepoint escape_ starts with `U+0055` (`U`) and is followed by exactly eight _hex digits_. It denotes the Unicode codepoint equal to the provided hex value. - * A _whitespace escape_ is one of the characters `U+006E` (`'n'`), `U+0072` - (`'r'`), or `U+0074` (`'t'`), denoting the unicode values `U+000A` (LF), + * A _whitespace escape_ is one of the characters `U+006E` (`n`), `U+0072` + (`r`), or `U+0074` (`t`), denoting the Unicode values `U+000A` (LF), `U+000D` (CR) or `U+0009` (HT) respectively. - * The _backslash escape_ is the character U+005C (`'\'`) which must be + * The _backslash escape_ is the character U+005C (`\`) which must be escaped in order to denote *itself*. #### Number literals @@ -341,9 +341,9 @@ An _integer literal_ has one of three forms: * A _decimal literal_ starts with a *decimal digit* and continues with any mixture of *decimal digits* and _underscores_. * A _hex literal_ starts with the character sequence `U+0030` `U+0078` - (`"0x"`) and continues as any mixture hex digits and underscores. + (`0x`) and continues as any mixture of hex digits and underscores. * A _binary literal_ starts with the character sequence `U+0030` `U+0062` - (`"0b"`) and continues as any mixture binary digits and underscores. + (`0b`) and continues as any mixture of binary digits and underscores.
By default, an integer literal is of type `int`. An integer literal may be followed (immediately, without any spaces) by an _integer suffix_, which @@ -372,7 +372,7 @@ Examples of integer literals of various forms: A _floating-point literal_ has one of two forms: * Two _decimal literals_ separated by a period - character `U+002E` (`'.'`), with an optional _exponent_ trailing after the + character `U+002E` (`.`), with an optional _exponent_ trailing after the second decimal literal. * A single _decimal literal_ followed by an _exponent_. @@ -425,7 +425,7 @@ type_path_tail : '<' type_expr [ ',' type_expr ] + '>' ~~~~~~~~ A _path_ is a sequence of one or more path components _logically_ separated by -a namespace qualifier (`"::"`). If a path consists of only one component, it +a namespace qualifier (`::`). If a path consists of only one component, it may refer to either an [item](#items) or a (variable)[#variables) in a local control scope. If a path has multiple components, it refers to an item. @@ -444,9 +444,9 @@ x::y::z; Path components are usually [identifiers](#identifiers), but the trailing component of a path may be an angle-bracket enclosed list of [type arguments](type-arguments). In [expression](#expressions) context, the type -argument list is given after a final (`"::"`) namespace qualifier in order to +argument list is given after a final (`::`) namespace qualifier in order to disambiguate it from a relational expression involving the less-than symbol -(`'<'`). In [type expression](#type-expressions) context, the final namespace +(`<`). In [type expression](#type-expressions) context, the final namespace qualifier is omitted. Two examples of paths with type arguments: @@ -471,7 +471,11 @@ is directed towards a single crate in source form, and if successful produces a single crate in binary form, either an executable or a library. A _crate_ is a unit of compilation and linking, as well as versioning, -distribution and runtime loading. 
+distribution and runtime loading. A crate contains a _tree_ of nested +[module](#modules) scopes. The top-level of this tree is a module that is +anonymous -- from the point of view of paths within the module -- and any item +within a crate has a canonical [module path](#paths) denoting its location +within the crate's module tree. Crates are provided to the Rust compiler through two kinds of file: @@ -494,8 +498,9 @@ from the source file name, with the `.rs` extension removed. ## Crate files ~~~~~~~~ {.ebnf .gram} -crate : [ attribute * directive ] * ; -directive : view_directive | dir_directive | source_directive ; +crate : attribute [ ';' | attribute* directive ] + | directive ; +directive : view_item | dir_directive | source_directive ; ~~~~~~~~ A crate file contains a crate definition, for which the production above @@ -505,17 +510,37 @@ analogous to an *assembly* in the ECMA-335 CLI model, a *library* in the SML/NJ Compilation Manager, a *unit* in the Owens and Flatt module system, or a *configuration* in Mesa.] A crate file describes: -* Metadata about the crate, such as author, name, version, and copyright. -* The source file and directory modules that make up the crate. -* Any external crates or native modules that the crate imports to its top level. -* The organization of the crate's internal namespace. -* The set of names exported from the crate. +* [Attributes](#attributes) about the crate, such as author, name, version, + and copyright. These are used for linking, versioning and distributing + crates. +* The source-file and directory modules that make up the crate. +* Any `use`, `import` or `export` [view items](#view-items) that apply to the + anonymous module at the top-level of the crate's module tree. -### View directives +An example of a crate file: -A `view_directive` contains a single `view_item` and arranges the top-level -namespace of the crate, the same way a `view_item` would in a module. See -[view items](#view-items). 
+~~~~~~~~ +// Linkage attributes +#[ link(name = "projx", + vers = "2.5", + uuid = "9cccc5d5-aceb-4af5-8285-811211826b82") ]; + +// Additional metadata attributes +#[ desc = "Project X", + license = "BSD", + author = "Jane Doe" ]; + +// Import a module. +use std (ver = "1.0"); + +// Define some modules. +#[path = "foo.rs"] +mod foo; +mod bar { + #[path = "quux.rs"] + mod quux; +} +~~~~~~~~ ### Dir directives @@ -541,8 +566,78 @@ a referencing crate file, or by the filename of the source file itself. # Items and attributes + +### Attributes + +~~~~~~~~{.ebnf .gram} +attribute : '#' '[' attr_list ']' ; +attr_list : attr [ ',' attr_list ]* ; +attr : ident [ '=' literal + | '(' attr_list ')' ] ? ; +~~~~~~~~ + +Static entities in Rust -- crates, modules and items -- may have _attributes_ +applied to them. ^[Attributes in Rust are modeled on Attributes in ECMA-335, +C#] An attribute is a general, free-form piece of metadata that is interpreted +according to name, convention, and language and compiler version. Attributes +may appear as any of: + +* A single identifier, the attribute name +* An identifier followed by the equals sign '=' and a literal, providing a key/value pair +* An identifier followed by a parenthesized list of sub-attribute arguments + +Attributes are applied to an entity by placing them within a hash-list +(`#[...]`) as either a prefix to the entity or as a semicolon-delimited +declaration within the entity body. + +An example of attributes: + +~~~~~~~~ +// A function marked as a unit test +#[test] +fn test_foo() { + ... +} + +// General metadata applied to the enclosing module or crate. +#[license = "BSD"]; + +// A conditionally-compiled module +#[cfg(target_os="linux")] +mod bar { + ... +} + +// A documentation attribute +#[doc = "Add two numbers together."] +fn add(x: int, y: int) { x + y } +~~~~~~~~ + +In future versions of Rust, user-provided extensions to the compiler will be +able to interpret attributes.
When this facility is provided, a distinction +will be made between language-reserved and user-available attributes. + +At present, only the Rust compiler interprets attributes, so all attribute +names are effectively reserved. Some significant attributes include: + +* The `doc` attribute, for documenting code where it's written. +* The `cfg` attribute, for conditional-compilation by build-configuration. +* The `link` attribute, for describing linkage metadata for a crate. +* The `test` attribute, for marking functions as unit tests. + +Other attributes may be added or removed during development of the language. + + # Statements and expressions +## Call expressions + +~~~~~~~~ {.abnf .gram} +expr_list : [ expr [ ',' expr ]* ] ? ; +paren_expr_list : '(' expr_list ')' ; +call_expr : expr paren_expr_list ; +~~~~~~~~ + ## Operators ### Unary operators @@ -563,8 +658,248 @@ a referencing crate file, or by the filename of the source file itself. <- <-> = += -= *= /= %= &= |= ^= <<= >>= >>>= ~~~~~~~~ + +## Syntax extensions + +~~~~~~~~ {.abnf .gram} +syntax_ext_expr : '#' ident paren_expr_list ? brace_match ? ; +~~~~~~~~ + +Rust provides a notation for _syntax extension_. The notation for invoking +a syntax extension is a marked syntactic form that can appear as an expression +in the body of a Rust program. + +After parsing, a syntax-extension invocation is expanded into a Rust +expression. The name of the extension determines the translation performed. In +future versions of Rust, user-provided syntax extensions aside from macros +will be provided via external crates. + +At present, only a set of built-in syntax extensions, as well as macros +introduced inline in source code using the `macro` extension, may be used. The +current built-in syntax extensions are: + + +* `fmt` expands into code to produce a formatted string, similar to + `printf` from C. +* `env` expands into a string literal containing the value of that + environment variable at compile-time. 
+* `concat_idents` expands into an identifier which is the + concatenation of its arguments. +* `ident_to_str` expands into a string literal containing the name of + its argument (which must be a literal). +* `log_syntax` causes the compiler to pretty-print its arguments. + + +Finally, `macro` is used to define a new macro. A macro can abstract over +second-class Rust concepts that are present in syntax. The arguments to +`macro` are pairs (two-element vectors). The pairs consist of an invocation +and the syntax to expand into. An example: + +~~~~~~~~ +#macro([#apply[fn, [args, ...]], fn(args, ...)]); +~~~~~~~~ + +In this case, the invocation `#apply[sum, 5, 8, 6]` expands to +`sum(5,8,6)`. If `...` follows an expression (which need not be as +simple as a single identifier) in the input syntax, the matcher will expect an +arbitrary number of occurrences of the thing preceding it, and bind syntax to +the identifiers it contains. If it follows an expression in the output syntax, +it will transcribe that expression repeatedly, according to the identifiers +(bound to syntax) that it contains. + +The behaviour of `...` is known as Macro By Example. It allows you to +write a macro with arbitrary repetition by specifying only one case of that +repetition, and following it by `...`, both where the repeated input is +matched, and where the repeated output must be transcribed. A more +sophisticated example: + + +~~~~~~~~ +#macro([#zip_literals[[x, ...], [y, ...]], [[x, y], ...]]); +#macro([#unzip_literals[[x, y], ...], [[x, ...], [y, ...]]]); +~~~~~~~~ + +In this case, `#zip_literals[[1,2,3], [1,2,3]]` expands to +`[[1,1],[2,2],[3,3]]`, and `#unzip_literals[[1,1], [2,2], [3,3]]` +expands to `[[1,2,3],[1,2,3]]`. + +Macro expansion takes place outside-in: that is, +`#unzip_literals[#zip_literals[[1,2,3],[1,2,3]]]` will fail because +`unzip_literals` expects a list, not a macro invocation, as an argument. + +The macro system currently has some limitations.
It's not possible to +destructure anything other than vector literals (therefore, the arguments to +complicated macros will tend to be an ocean of square brackets). Macro +invocations and `...` can only appear in expression positions. Finally, +macro expansion is currently unhygienic. That is, name collisions between +macro-generated and user-written code can cause unintentional capture. + +Future versions of Rust will address these issues. + # Memory and concurrency model +## Memory model + +A Rust program's memory consists of a static set of *items*, a set of tasks +each with its own *stack*, and a *heap*. Immutable portions of the +heap may be shared between tasks, mutable portions may not. + +Allocations in the stack consist of *slots*, and allocations in the heap +consist of *boxes*. + + +### Memory allocation and lifetime + +The _items_ of a program are those functions, objects, modules and types +that have their value calculated at compile-time and stored uniquely in the +memory image of the Rust process. Items are neither dynamically allocated nor +freed. + +A task's _stack_ consists of activation frames automatically allocated on +entry to each function as the task executes. A stack allocation is reclaimed +when control leaves the frame containing it. + +The _heap_ is a general term that describes two separate sets of boxes: +shared boxes -- which may be subject to garbage collection -- and unique +boxes. The lifetime of an allocation in the heap depends on the lifetime of +the box values pointing to it. Since box values may themselves be passed in +and out of frames, or stored in the heap, heap allocations may outlive the +frame they are allocated within. + + +### Memory ownership + +A task owns all memory it can *safely* reach through local variables, +shared or unique boxes, and/or references. Sharing memory between tasks can +only be accomplished using *unsafe* constructs, such as raw pointer +operations or calling C code.
+ +When a task sends a value of *unique* kind over a channel, it loses +ownership of the value sent and can no longer refer to it. This is statically +guaranteed by the combined use of "move semantics" and unique kinds, within +the communication system. + +When a stack frame is exited, its local allocations are all released, and its +references to boxes (both shared and owned) are dropped. + +A shared box may (in the case of a recursive, mutable shared type) be cyclic; +in this case the release of memory inside the shared structure may be deferred +until task-local garbage collection can reclaim it. Code can ensure no such +delayed deallocation occurs by restricting itself to unique boxes and similar +unshared kinds of data. + +When a task finishes, its stack is necessarily empty and it therefore has no +references to any boxes; the remainder of its heap is immediately freed. + + +### Memory slots + +A task's stack contains slots. + +A _slot_ is a component of a stack frame. A slot is either *local* or +a *reference*. + +A _local_ slot (or *stack-local* allocation) holds a value directly, +allocated within the stack's memory. The value is a part of the stack frame. + +A _reference_ references a value outside the frame. It may refer to a +value allocated in another frame *or* a boxed value in the heap. The +reference-formation rules ensure that the referent will outlive the reference. + +Local slots are always implicitly mutable. + +Local slots are not initialized when allocated; the entire frame worth of +local slots are allocated at once, on frame-entry, in an uninitialized +state. Subsequent statements within a function may or may not initialize the +local slots. Local slots can be used only after they have been initialized; +this condition is guaranteed by the typestate system. + +References are created for function arguments. 
If the compiler can not prove +that the referred-to value will outlive the reference, it will try to set +aside a copy of that value to refer to. If this is not semantically safe (for +example, if the referred-to value contains mutable fields), it will reject the +program. If the compiler deems copying the value expensive, it will warn. + +A function can be declared to take an argument by mutable reference. This +allows the function to write to the slot that the reference refers to. + +An example function that accepts a value by mutable reference: + +~~~~~~~~ +fn incr(&i: int) { + i = i + 1; +} +~~~~~~~~ + + +### Memory boxes + +A _box_ is a reference to a heap allocation holding another value. There +are two kinds of boxes: *shared boxes* and *unique boxes*. + +A _shared box_ type or value is constructed by the prefix *at* sigil `@`. + +A _unique box_ type or value is constructed by the prefix *tilde* sigil `~`. + +Multiple shared box values can point to the same heap allocation; copying a +shared box value makes a shallow copy of the pointer (optionally incrementing +a reference count, if the shared box is implemented through +reference-counting). + +Unique box values exist in 1:1 correspondence with their heap allocation; +copying a unique box value makes a deep copy of the heap allocation and +produces a pointer to the new allocation. + +An example of constructing one shared box type and value, and one unique box +type and value: + +~~~~~~~~ +let x: @int = @10; +let x: ~int = ~10; +~~~~~~~~ + +Some operations implicitly dereference boxes. Examples of such _implicit +dereference_ operations are: + +* arithmetic operators (`x + y - z`) +* field selection (`x.y.z`) + + +An example of an implicit-dereference operation performed on box values: + +~~~~~~~~ +let x: @int = @10; +let y: @int = @12; +assert (x + y == 22); +~~~~~~~~ + +Other operations act on box values as single-word-sized address values.
For +these operations, to access the value held in the box requires an explicit +dereference of the box value. Explicitly dereferencing a box is indicated with +the unary *star* operator `*`. Examples of such _explicit +dereference_ operations are: + +* copying box values (`x = y`) +* passing box values to functions (`f(x,y)`) + + +An example of an explicit-dereference operation performed on box values: + +~~~~~~~~ +fn takes_boxed(b: @int) { +} + +fn takes_unboxed(b: int) { +} + +fn main() { + let x: @int = @10; + takes_boxed(x); + takes_unboxed(*x); +} +~~~~~~~~ + + # Runtime services, linkage and debugging # Appendix: Rationales and design tradeoffs @@ -643,3 +978,5 @@ Additional specific influences can be seen from the following languages: * The lexical identifier rule of Python. * The block syntax of Ruby. + +LocalWords: codepoint diff --git a/src/etc/extract_grammar.py b/src/etc/extract_grammar.py index 6066a1ebc9c..63e89563464 100755 --- a/src/etc/extract_grammar.py +++ b/src/etc/extract_grammar.py @@ -86,6 +86,10 @@ symnames = { "::": "coloncolon", +"->": "rightarrow", +"<-": "leftarrow", +"<->": "swaparrow", + "//": "linecomment", "/*": "openblockcomment", "*/": "closeblockcomment"