rust/crates/ra_syntax/src/string_lexing.rs

use self::CharComponentKind::*;
use rowan::{TextRange, TextUnit};

/// Creates a lazy iterator over the components of the string literal `src`.
///
/// `src` is expected to include the opening `"`; the returned iterator
/// asserts this on its first call to `next`. Nothing is parsed until the
/// iterator is advanced.
pub fn parse_string_literal(src: &str) -> StringComponentIterator {
    let parser = Parser::new(src);
    StringComponentIterator {
        parser,
        has_closing_quote: false,
    }
}

/// One piece of a string literal's content, as produced by
/// `StringComponentIterator`.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct StringComponent {
    /// Span of the component, expressed as byte offsets into the literal's
    /// source text (the opening quote sits at offset 0).
    pub range: TextRange,
    /// What the component is: an ignored line continuation or a char-like piece.
    pub kind: StringComponentKind,
}

impl StringComponent {
    fn new(range: TextRange, kind: StringComponentKind) -> StringComponent {
        StringComponent { range, kind }
    }
}

/// Classification of a `StringComponent`.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum StringComponentKind {
    /// A `\` directly followed by a newline (`\n` or `\r`), together with all
    /// whitespace that follows it.
    IgnoreNewline,
    /// Anything else: a plain code point or an escape, classified the same
    /// way as the components of a char literal.
    Char(CharComponentKind),
}

/// Iterator over the components of a string literal.
///
/// Created by `parse_string_literal`.
pub struct StringComponentIterator<'a> {
    // Cursor over the literal's source text.
    parser: Parser<'a>,
    /// Set to `true` when the iterator, having run out of components, finds
    /// and consumes the closing `"`. Stays `false` until then, so it is only
    /// meaningful once `next` has returned `None`.
    pub has_closing_quote: bool,
}

impl<'a> Iterator for StringComponentIterator<'a> {
    type Item = StringComponent;

    /// Yields the next component of the string literal, or `None` once the
    /// content is exhausted.
    ///
    /// When no component is left, a closing `"` (if present) is consumed and
    /// recorded in `has_closing_quote`.
    ///
    /// # Panics
    ///
    /// Panics if the source is empty or does not start with `"`, or if any
    /// input remains after the closing quote.
    fn next(&mut self) -> Option<StringComponent> {
        // A position of zero means nothing has been consumed yet, so the
        // opening quote is still ahead of us.
        if self.parser.pos == 0 {
            let opening = self.parser.advance();
            assert!(
                opening == '"',
                "string literal should start with double quotes"
            );
        }

        let component = self.parser.parse_string_component();
        if component.is_some() {
            return component;
        }

        // Out of components: either the closing quote or the end of input is next.
        if let Some('"') = self.parser.peek() {
            self.parser.advance();
            self.has_closing_quote = true;
        }

        assert!(
            self.parser.peek().is_none(),
            "string literal should leave no unparsed input: src = {}, pos = {}, length = {}",
            self.parser.src,
            self.parser.pos,
            self.parser.src.len()
        );

        None
    }
}

/// Creates a lazy iterator over the components of the char literal `src`.
///
/// `src` is expected to include the opening `'`; the returned iterator
/// asserts this on its first call to `next`. Nothing is parsed until the
/// iterator is advanced.
pub fn parse_char_literal(src: &str) -> CharComponentIterator {
    let parser = Parser::new(src);
    CharComponentIterator {
        parser,
        has_closing_quote: false,
    }
}

/// One piece of a char literal's content: a plain code point or an escape
/// sequence.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct CharComponent {
    /// Span of the component, expressed as byte offsets into the literal's
    /// source text (the opening quote sits at offset 0).
    pub range: TextRange,
    /// Which sort of component this is.
    pub kind: CharComponentKind,
}

impl CharComponent {
    fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {
        CharComponent { range, kind }
    }
}

/// Classification of a `CharComponent`.
///
/// The lexer only classifies by shape; it does not check that an escape is
/// well-formed.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum CharComponentKind {
    /// A single character that is not a backslash.
    CodePoint,
    /// A backslash followed by one character other than `x` or `u`
    /// (also used for a backslash at the very end of the input).
    AsciiEscape,
    /// `\x` followed by whatever code characters the lexer accepted.
    AsciiCodeEscape,
    /// `\u`, optionally followed by a `{...}` group.
    UnicodeEscape,
}

/// Iterator over the components of a char literal.
///
/// Created by `parse_char_literal`.
pub struct CharComponentIterator<'a> {
    // Cursor over the literal's source text.
    parser: Parser<'a>,
    /// Set to `true` when the iterator, having run out of components, finds
    /// and consumes the closing `'`. Stays `false` until then, so it is only
    /// meaningful once `next` has returned `None`.
    pub has_closing_quote: bool,
}

impl<'a> Iterator for CharComponentIterator<'a> {
    type Item = CharComponent;

    /// Yields the next component of the char literal, or `None` once the
    /// content is exhausted.
    ///
    /// When no component is left, a closing `'` (if present) is consumed and
    /// recorded in `has_closing_quote`.
    ///
    /// # Panics
    ///
    /// Panics if the source is empty or does not start with `'`, or if any
    /// input remains after the closing quote.
    fn next(&mut self) -> Option<CharComponent> {
        // A position of zero means nothing has been consumed yet, so the
        // opening quote is still ahead of us.
        if self.parser.pos == 0 {
            let opening = self.parser.advance();
            assert!(
                opening == '\'',
                "char literal should start with a quote"
            );
        }

        let component = self.parser.parse_char_component();
        if component.is_some() {
            return component;
        }

        // Out of components: either the closing quote or the end of input is next.
        if let Some('\'') = self.parser.peek() {
            self.parser.advance();
            self.has_closing_quote = true;
        }

        assert!(
            self.parser.peek().is_none(),
            "char literal should leave no unparsed input: src = {}, pos = {}, length = {}",
            self.parser.src,
            self.parser.pos,
            self.parser.src.len()
        );

        None
    }
}

/// Cursor over the source text of a char or string literal.
pub struct Parser<'a> {
    // Full text of the literal being scanned.
    src: &'a str,
    // Byte offset into `src` of the next unread character.
    pos: usize,
}

impl<'a> Parser<'a> {
    pub fn new(src: &'a str) -> Parser<'a> {
        Parser { src, pos: 0 }
    }

    // Utility methods

    pub fn peek(&self) -> Option<char> {
        if self.pos == self.src.len() {
            return None;
        }

        self.src[self.pos..].chars().next()
    }

    pub fn advance(&mut self) -> char {
        let next = self
            .peek()
            .expect("cannot advance if end of input is reached");
        self.pos += next.len_utf8();
        next
    }

    pub fn skip_whitespace(&mut self) {
        while self.peek().map(|c| c.is_whitespace()) == Some(true) {
            self.advance();
        }
    }

    pub fn get_pos(&self) -> TextUnit {
        (self.pos as u32).into()
    }

    // Char parsing methods

    fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {
        match self.peek() {
            Some('{') => {
                self.advance();

                // Parse anything until we reach `}`
                while let Some(next) = self.peek() {
                    self.advance();
                    if next == '}' {
                        break;
                    }
                }

                let end = self.get_pos();
                CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
            }
            Some(_) | None => {
                let end = self.get_pos();
                CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
            }
        }
    }

    fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {
        let code_start = self.get_pos();
        while let Some(next) = self.peek() {
            if next == '\'' || (self.get_pos() - code_start == 2.into()) {
                break;
            }

            self.advance();
        }

        let end = self.get_pos();
        CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
    }

    fn parse_escape(&mut self, start: TextUnit) -> CharComponent {
        if self.peek().is_none() {
            return CharComponent::new(TextRange::from_to(start, start), AsciiEscape);
        }

        let next = self.advance();
        let end = self.get_pos();
        let range = TextRange::from_to(start, end);
        match next {
            'x' => self.parse_ascii_code_escape(start),
            'u' => self.parse_unicode_escape(start),
            _ => CharComponent::new(range, AsciiEscape),
        }
    }

    pub fn parse_char_component(&mut self) -> Option<CharComponent> {
        let next = self.peek()?;

        // Ignore character close
        if next == '\'' {
            return None;
        }

        let start = self.get_pos();
        self.advance();

        if next == '\\' {
            Some(self.parse_escape(start))
        } else {
            let end = self.get_pos();
            Some(CharComponent::new(
                TextRange::from_to(start, end),
                CodePoint,
            ))
        }
    }

    pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
        // In string literals, when a `\` occurs immediately before the newline, the `\`,
        // the newline, and all whitespace at the beginning of the next line are ignored
        match self.peek() {
            Some('\n') | Some('\r') => {
                self.skip_whitespace();
                Some(StringComponent::new(
                    TextRange::from_to(start, self.get_pos()),
                    StringComponentKind::IgnoreNewline,
                ))
            }
            _ => None,
        }
    }

    pub fn parse_string_component(&mut self) -> Option<StringComponent> {
        let next = self.peek()?;

        // Ignore string close
        if next == '"' {
            return None;
        }

        let start = self.get_pos();
        self.advance();

        if next == '\\' {
            // Strings can use `\` to ignore newlines, so we first try to parse one of those
            // before falling back to parsing char escapes
            self.parse_ignore_newline(start).or_else(|| {
                let char_component = self.parse_escape(start);
                Some(StringComponent::new(
                    char_component.range,
                    StringComponentKind::Char(char_component.kind),
                ))
            })
        } else {
            let end = self.get_pos();
            Some(StringComponent::new(
                TextRange::from_to(start, end),
                StringComponentKind::Char(CodePoint),
            ))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Drains the char-literal iterator for `src`, returning whether a closing
    /// quote was seen together with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<CharComponent> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Parses `src`, expecting exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.remove(0)
    }

    /// Parses `src`, expecting exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(
            components.len() == 1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.remove(0)
    }

    /// Parses `src`, expecting a closing quote, and returns all components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range of everything after the opening quote of `src`, up to its end.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in [r"{DEAD}", "{BEEF}", "{FF}", "{}", ""].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        // Without a `}` the escape swallows the trailing quote as well.
        for escape in ["{DEAD", "{BEEF", "{FF"].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.len() == 0);
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        let expected_range = TextRange::from_to(1.into(), 2.into());
        assert!(component.kind == CodePoint);
        assert!(component.range == expected_range);
    }

    #[test]
    fn test_digit_escapes() {
        for digits in [r"", r"5", r"55"].iter() {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::AsciiCodeEscape);
            assert!(component.range == range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert!(components.len() == 2);
        assert!(components[1].kind == CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = [
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes.iter() {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::AsciiEscape);
            assert!(component.range == range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        // The same characters that form escapes after a `\` are ordinary
        // code points on their own.
        let plain_chars = ['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &ch in plain_chars.iter() {
            let literal = format!("'{}'", ch);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::CodePoint);
            assert!(component.range == range_closed(&literal));
        }
    }
}
Add character literal parsing and validation 2018-11-04 15:06:38 +01:00			`use self::CharComponentKind::*;`
			`use rowan::{TextRange, TextUnit};`

Validate string literals 2018-11-08 15:42:00 +01:00			`pub fn parse_string_literal(src: &str) -> StringComponentIterator {`
			`StringComponentIterator {`
			`parser: Parser::new(src),`
			`has_closing_quote: false,`
			`}`
			`}`

			`#[derive(Debug, Eq, PartialEq, Clone)]`
			`pub struct StringComponent {`
			`pub range: TextRange,`
			`pub kind: StringComponentKind,`
			`}`

			`impl StringComponent {`
			`fn new(range: TextRange, kind: StringComponentKind) -> StringComponent {`
			`StringComponent { range, kind }`
			`}`
			`}`

			`#[derive(Debug, Eq, PartialEq, Clone)]`
			`pub enum StringComponentKind {`
			`IgnoreNewline,`
			`Char(CharComponentKind),`
			`}`

			`pub struct StringComponentIterator<'a> {`
			`parser: Parser<'a>,`
			`pub has_closing_quote: bool,`
			`}`

			`impl<'a> Iterator for StringComponentIterator<'a> {`
			`type Item = StringComponent;`
			`fn next(&mut self) -> Option<StringComponent> {`
			`if self.parser.pos == 0 {`
			`assert!(`
			`self.parser.advance() == '"',`
			`"string literal should start with double quotes"`
			`);`
			`}`

			`if let Some(component) = self.parser.parse_string_component() {`
			`return Some(component);`
			`}`

			`// We get here when there are no char components left to parse`
			`if self.parser.peek() == Some('"') {`
			`self.parser.advance();`
			`self.has_closing_quote = true;`
			`}`

			`assert!(`
			`self.parser.peek() == None,`
			`"string literal should leave no unparsed input: src = {}, pos = {}, length = {}",`
			`self.parser.src,`
			`self.parser.pos,`
			`self.parser.src.len()`
			`);`

			`None`
			`}`
			`}`

Add character literal parsing and validation 2018-11-04 15:06:38 +01:00			`pub fn parse_char_literal(src: &str) -> CharComponentIterator {`
			`CharComponentIterator {`
			`parser: Parser::new(src),`
			`has_closing_quote: false,`
			`}`
			`}`

			`#[derive(Debug, Eq, PartialEq, Clone)]`
			`pub struct CharComponent {`
			`pub range: TextRange,`
			`pub kind: CharComponentKind,`
			`}`

			`impl CharComponent {`
			`fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {`
			`CharComponent { range, kind }`
			`}`
			`}`

			`#[derive(Debug, Eq, PartialEq, Clone)]`
			`pub enum CharComponentKind {`
			`CodePoint,`
			`AsciiEscape,`
			`AsciiCodeEscape,`
			`UnicodeEscape,`
			`}`

			`pub struct CharComponentIterator<'a> {`
			`parser: Parser<'a>,`
			`pub has_closing_quote: bool,`
			`}`

			`impl<'a> Iterator for CharComponentIterator<'a> {`
			`type Item = CharComponent;`
			`fn next(&mut self) -> Option<CharComponent> {`
			`if self.parser.pos == 0 {`
			`assert!(`
			`self.parser.advance() == '\'',`
			`"char literal should start with a quote"`
			`);`
			`}`

			`if let Some(component) = self.parser.parse_char_component() {`
			`return Some(component);`
			`}`

			`// We get here when there are no char components left to parse`
			`if self.parser.peek() == Some('\'') {`
			`self.parser.advance();`
			`self.has_closing_quote = true;`
			`}`

			`assert!(`
			`self.parser.peek() == None,`
			`"char literal should leave no unparsed input: src = {}, pos = {}, length = {}",`
			`self.parser.src,`
			`self.parser.pos,`
			`self.parser.src.len()`
			`);`

			`None`
			`}`
			`}`

			`pub struct Parser<'a> {`
			`src: &'a str,`
			`pos: usize,`
			`}`

			`impl<'a> Parser<'a> {`
			`pub fn new(src: &'a str) -> Parser<'a> {`
			`Parser { src, pos: 0 }`
			`}`

			`// Utility methods`

			`pub fn peek(&self) -> Option<char> {`
			`if self.pos == self.src.len() {`
			`return None;`
			`}`

			`self.src[self.pos..].chars().next()`
			`}`

			`pub fn advance(&mut self) -> char {`
			`let next = self`
			`.peek()`
			`.expect("cannot advance if end of input is reached");`
			`self.pos += next.len_utf8();`
			`next`
			`}`

Validate string literals 2018-11-08 15:42:00 +01:00			`pub fn skip_whitespace(&mut self) {`
			`while self.peek().map(\|c\| c.is_whitespace()) == Some(true) {`
			`self.advance();`
			`}`
			`}`

Add character literal parsing and validation 2018-11-04 15:06:38 +01:00			`pub fn get_pos(&self) -> TextUnit {`
			`(self.pos as u32).into()`
			`}`

			`// Char parsing methods`

			`fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {`
			`match self.peek() {`
			`Some('{') => {`
			`self.advance();`

			// Parse anything until we reach `}`
			`while let Some(next) = self.peek() {`
			`self.advance();`
			`if next == '}' {`
			`break;`
			`}`
			`}`

			`let end = self.get_pos();`
			`CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)`
			`}`
			`Some(_) \| None => {`
			`let end = self.get_pos();`
			`CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)`
			`}`
			`}`
			`}`

			`fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {`
			`let code_start = self.get_pos();`
			`while let Some(next) = self.peek() {`
			`if next == '\'' \|\| (self.get_pos() - code_start == 2.into()) {`
			`break;`
			`}`

			`self.advance();`
			`}`

			`let end = self.get_pos();`
			`CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)`
			`}`

			`fn parse_escape(&mut self, start: TextUnit) -> CharComponent {`
			`if self.peek().is_none() {`
			`return CharComponent::new(TextRange::from_to(start, start), AsciiEscape);`
			`}`

			`let next = self.advance();`
			`let end = self.get_pos();`
			`let range = TextRange::from_to(start, end);`
			`match next {`
			`'x' => self.parse_ascii_code_escape(start),`
			`'u' => self.parse_unicode_escape(start),`
			`_ => CharComponent::new(range, AsciiEscape),`
			`}`
			`}`

			`pub fn parse_char_component(&mut self) -> Option<CharComponent> {`
			`let next = self.peek()?;`

			`// Ignore character close`
			`if next == '\'' {`
			`return None;`
			`}`

			`let start = self.get_pos();`
			`self.advance();`

			`if next == '\\' {`
			`Some(self.parse_escape(start))`
			`} else {`
			`let end = self.get_pos();`
			`Some(CharComponent::new(`
			`TextRange::from_to(start, end),`
			`CodePoint,`
			`))`
			`}`
			`}`
Validate string literals 2018-11-08 15:42:00 +01:00
			`pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {`
			// In string literals, when a `\` occurs immediately before the newline, the `\`,
			`// the newline, and all whitespace at the beginning of the next line are ignored`
			`match self.peek() {`
			`Some('\n') \| Some('\r') => {`
			`self.skip_whitespace();`
			`Some(StringComponent::new(`
			`TextRange::from_to(start, self.get_pos()),`
			`StringComponentKind::IgnoreNewline,`
			`))`
			`}`
			`_ => None,`
			`}`
			`}`

			`pub fn parse_string_component(&mut self) -> Option<StringComponent> {`
			`let next = self.peek()?;`

			`// Ignore string close`
			`if next == '"' {`
			`return None;`
			`}`

			`let start = self.get_pos();`
			`self.advance();`

			`if next == '\\' {`
			// Strings can use `\` to ignore newlines, so we first try to parse one of those
			`// before falling back to parsing char escapes`
			`self.parse_ignore_newline(start).or_else(\|\| {`
			`let char_component = self.parse_escape(start);`
			`Some(StringComponent::new(`
			`char_component.range,`
			`StringComponentKind::Char(char_component.kind),`
			`))`
			`})`
			`} else {`
			`let end = self.get_pos();`
			`Some(StringComponent::new(`
			`TextRange::from_to(start, end),`
			`StringComponentKind::Char(CodePoint),`
			`))`
			`}`
			`}`
Add character literal parsing and validation 2018-11-04 15:06:38 +01:00			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`fn parse(src: &str) -> (bool, Vec<CharComponent>) {`
			`let component_iterator = &mut super::parse_char_literal(src);`
			`let components: Vec<_> = component_iterator.collect();`
			`(component_iterator.has_closing_quote, components)`
			`}`

			`fn unclosed_char_component(src: &str) -> CharComponent {`
			`let (has_closing_quote, components) = parse(src);`
			`assert!(!has_closing_quote, "char should not have closing quote");`
			`assert!(components.len() == 1);`
			`components[0].clone()`
			`}`

			`fn closed_char_component(src: &str) -> CharComponent {`
			`let (has_closing_quote, components) = parse(src);`
			`assert!(has_closing_quote, "char should have closing quote");`
			`assert!(`
			`components.len() == 1,`
			`"Literal: {}\nComponents: {:#?}",`
			`src,`
			`components`
			`);`
			`components[0].clone()`
			`}`

			`fn closed_char_components(src: &str) -> Vec<CharComponent> {`
			`let (has_closing_quote, components) = parse(src);`
			`assert!(has_closing_quote, "char should have closing quote");`
			`components`
			`}`

			`fn range_closed(src: &str) -> TextRange {`
			`TextRange::from_to(1.into(), (src.len() as u32 - 1).into())`
			`}`

			`fn range_unclosed(src: &str) -> TextRange {`
			`TextRange::from_to(1.into(), (src.len() as u32).into())`
			`}`

			`#[test]`
			`fn test_unicode_escapes() {`
Finish implementing char validation 2018-11-06 17:05:06 +01:00			`let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];`
Add character literal parsing and validation 2018-11-04 15:06:38 +01:00			`for escape in unicode_escapes {`
			`let escape_sequence = format!(r"'\u{}'", escape);`
			`let component = closed_char_component(&escape_sequence);`
			`let expected_range = range_closed(&escape_sequence);`
			`assert_eq!(component.kind, CharComponentKind::UnicodeEscape);`
			`assert_eq!(component.range, expected_range);`
			`}`
			`}`

			`#[test]`
			`fn test_unicode_escapes_unclosed() {`
			`let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];`
			`for escape in unicode_escapes {`
			`let escape_sequence = format!(r"'\u{}'", escape);`
			`let component = unclosed_char_component(&escape_sequence);`
			`let expected_range = range_unclosed(&escape_sequence);`
			`assert_eq!(component.kind, CharComponentKind::UnicodeEscape);`
			`assert_eq!(component.range, expected_range);`
			`}`
			`}`

			`#[test]`
			`fn test_empty_char() {`
			`let (has_closing_quote, components) = parse("''");`
			`assert!(has_closing_quote, "char should have closing quote");`
			`assert!(components.len() == 0);`
			`}`

			`#[test]`
			`fn test_unclosed_char() {`
			`let component = unclosed_char_component("'a");`
			`assert!(component.kind == CodePoint);`
			`assert!(component.range == TextRange::from_to(1.into(), 2.into()));`
			`}`

			`#[test]`
			`fn test_digit_escapes() {`
			`let literals = &[r"", r"5", r"55"];`

			`for literal in literals {`
			`let lit_text = format!(r"'\x{}'", literal);`
			`let component = closed_char_component(&lit_text);`
			`assert!(component.kind == CharComponentKind::AsciiCodeEscape);`
			`assert!(component.range == range_closed(&lit_text));`
			`}`

			`// More than 2 digits starts a new codepoint`
			`let components = closed_char_components(r"'\x555'");`
			`assert!(components.len() == 2);`
			`assert!(components[1].kind == CharComponentKind::CodePoint);`
			`}`

			`#[test]`
			`fn test_ascii_escapes() {`
			`let literals = &[`
			`r"\'", "\\\"", // equivalent to \"`
			`r"\n", r"\r", r"\t", r"\\", r"\0",`
			`];`

			`for literal in literals {`
			`let lit_text = format!("'{}'", literal);`
			`let component = closed_char_component(&lit_text);`
			`assert!(component.kind == CharComponentKind::AsciiEscape);`
			`assert!(component.range == range_closed(&lit_text));`
			`}`
			`}`

			`#[test]`
			`fn test_no_escapes() {`
			`let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];`

			`for &literal in literals {`
			`let lit_text = format!("'{}'", literal);`
			`let component = closed_char_component(&lit_text);`
			`assert!(component.kind == CharComponentKind::CodePoint);`
			`assert!(component.range == range_closed(&lit_text));`
			`}`
			`}`
			`}`