Refactor parsing code into struct

This also splits the giant state machine match expression into separate
methods.
This commit is contained in:
David Lukes 2018-02-26 15:41:38 +01:00
parent 310c1146f2
commit bbd6d9cd55
4 changed files with 215 additions and 175 deletions

View file

@ -1,174 +0,0 @@
use regex;
/// Translate a license template into the source text of a regular expression.
///
/// Templates are literal by default, so that authors do not have to escape every regex
/// metacharacter by hand. Regex syntax is only honoured inside placeholders delimited by `{` and
/// `}`. Two extra rules apply:
///
/// - a literal `{`, `}` or `\` is written by prefixing it with `\`
/// - the empty placeholder `{}` is an abbreviation for `{.*?}`
///
/// The result is anchored with a leading `^`. Literal parts are escaped, placeholder contents are
/// copied through verbatim. The returned *string* is therefore **not** guaranteed to be a
/// syntactically valid regular expression.
///
/// # Errors
///
/// Returns a message naming the offending line when the template contains an unescaped closing
/// brace outside of a placeholder, a placeholder that is never closed, or a trailing `\`.
///
/// # Examples
///
/// ```
/// # use rustfmt_config::license;
/// assert_eq!(
///     license::parse_template(
///         r"
/// // Copyright {\d+} The \} Rust \\ Project \{ Developers. See the {([A-Z]+)}
/// // file at the top-level directory of this distribution and at
/// // {}.
/// //
/// // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
/// // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
/// // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
/// // option. This file may not be copied, modified, or distributed
/// // except according to those terms.
/// "
///     ).unwrap(),
///     r"^
/// // Copyright \d+ The \} Rust \\ Project \{ Developers\. See the ([A-Z]+)
/// // file at the top\-level directory of this distribution and at
/// // .*?\.
/// //
/// // Licensed under the Apache License, Version 2\.0 <LICENSE\-APACHE or
/// // http://www\.apache\.org/licenses/LICENSE\-2\.0> or the MIT license
/// // <LICENSE\-MIT or http://opensource\.org/licenses/MIT>, at your
/// // option\. This file may not be copied, modified, or distributed
/// // except according to those terms\.
/// "
/// );
/// ```
pub fn parse_template(template: &str) -> Result<String, String> {
    // States of the parsing automaton. The `Esc` variants are entered right after a backslash;
    // the `u32` carried by the placeholder states is the current brace depth.
    enum State {
        Lit,
        LitEsc,
        Re(u32),
        ReEsc(u32),
    }

    let mut regex_src = String::from("^");
    // text of the segment (literal run or placeholder body) currently being collected
    let mut pending = String::new();
    let mut state = State::Lit;
    let mut line_no = 1;
    // line on which the most recent placeholder was opened, for error reporting
    let mut placeholder_line = 0;

    for c in template.chars() {
        if c == '\n' {
            line_no += 1;
        }
        state = match (state, c) {
            // an opening brace ends the literal run and starts a placeholder
            (State::Lit, '{') => {
                regex_src.push_str(&regex::escape(&pending));
                pending.clear();
                placeholder_line = line_no;
                State::Re(1)
            }
            (State::Lit, '}') => {
                return Err(format!("escape or balance closing brace on l. {}", line_no));
            }
            (State::Lit, '\\') => State::LitEsc,
            // ordinary literal characters, and whatever follows a literal backslash
            (State::Lit, _) | (State::LitEsc, _) => {
                pending.push(c);
                State::Lit
            }
            // the brace closing the outermost level ends the placeholder
            (State::Re(1), '}') => {
                if pending.is_empty() {
                    // default regex for empty placeholder {}
                    regex_src.push_str(".*?");
                } else {
                    regex_src.push_str(&pending);
                }
                pending.clear();
                State::Lit
            }
            (State::Re(depth), '{') => {
                pending.push(c);
                State::Re(depth + 1)
            }
            (State::Re(depth), '}') => {
                pending.push(c);
                State::Re(depth - 1)
            }
            (State::Re(depth), '\\') => {
                pending.push(c);
                State::ReEsc(depth)
            }
            // ordinary placeholder characters, and whatever follows a placeholder backslash
            (State::Re(depth), _) | (State::ReEsc(depth), _) => {
                pending.push(c);
                State::Re(depth)
            }
        };
    }

    // the input is only valid if it ends in plain literal mode
    match state {
        State::Re(_) | State::ReEsc(_) => Err(format!(
            "escape or balance opening brace on l. {}",
            placeholder_line
        )),
        State::LitEsc => Err(format!("incomplete escape sequence on l. {}", line_no)),
        State::Lit => {
            regex_src.push_str(&regex::escape(&pending));
            Ok(regex_src)
        }
    }
}
#[cfg(test)]
mod test {
    use super::parse_template;

    /// Exercises the template parser on valid templates and on each kind of parse error.
    #[test]
    fn test_parse_license_template() {
        // (template, expected regex source)
        let accepted = [
            ("literal (.*)", r"^literal \(\.\*\)"),
            (r"escaping \}", r"^escaping \}"),
            (r"{\d+} place{-?}holder{s?}", r"^\d+ place-?holders?"),
            ("default {}", "^default .*?"),
            (
                r"unbalanced nested braces {\{{3}}",
                r"^unbalanced nested braces \{{3}",
            ),
        ];
        for &(template, expected) in accepted.iter() {
            assert_eq!(parse_template(template).unwrap(), expected);
        }

        assert!(parse_template("unbalanced } without escape").is_err());

        // (template, expected error message)
        let rejected = [
            ("parsing error }", "escape or balance closing brace on l. 1"),
            (
                "parsing error {\nsecond line",
                "escape or balance opening brace on l. 1",
            ),
            (r"parsing error \", "incomplete escape sequence on l. 1"),
        ];
        for &(template, message) in rejected.iter() {
            assert_eq!(parse_template(template).unwrap_err(), message);
        }
    }
}

View file

@ -408,7 +408,7 @@ macro_rules! create_config {
license_template_path, e);
return;
};
let license_template_parsed = match license::parse_template(&license_template_str) {
let license_template_parsed = match TemplateParser::parse(&license_template_str) {
Ok(string) => string,
Err(e) => {
eprintln!("Warning: unable to parse license template file {:?}: {}",

213
src/config/license.rs Normal file
View file

@ -0,0 +1,213 @@
use regex;
/// State of the license template parser, which is implemented as a state machine.
///
/// `TemplateParser::parse` feeds the template one character at a time to the transition method
/// matching the current state; each transition returns the next state.
enum ParsingState {
/// Collecting literal text (the initial state).
Lit,
/// Inside literal text, right after a `\`; the next character is taken literally.
LitEsc,
/// Inside a `{...}` regex placeholder; the u32 keeps track of brace nesting (1 right after the
/// opening brace).
Re(u32),
/// Inside a regex placeholder, right after a `\`; the u32 is the brace nesting to return to.
ReEsc(u32),
/// Parsing has failed; carries the error message that `parse` returns as `Err`.
Abort(String),
}
use self::ParsingState::*;
/// Parser which converts a license template into the string form of a regular expression.
///
/// All fields are private; the entry point is the associated function `TemplateParser::parse`.
pub struct TemplateParser {
// regex source built so far; starts out as "^"
parsed: String,
// characters of the segment currently being read (a literal run or a placeholder body); it is
// appended to `parsed` and cleared whenever a segment ends
buffer: String,
// current state of the state machine
state: ParsingState,
// current line number in the template, starting at 1 and bumped on every '\n'
linum: u32,
// line on which the most recent regex placeholder was opened (0 until one is seen); reported
// when a placeholder is never closed
open_brace_line: u32,
}
impl TemplateParser {
fn new() -> Self {
Self {
parsed: "^".to_owned(),
buffer: String::new(),
state: Lit,
linum: 1,
// keeps track of last line on which a regex placeholder was started
open_brace_line: 0,
}
}
/// Convert a license template into a string which can be turned into a regex.
///
/// The license template could use regex syntax directly, but that would require a lot of manual
/// escaping, which is inconvenient. It is therefore literal by default, with optional regex
/// subparts delimited by `{` and `}`. Additionally:
///
/// - to insert literal `{`, `}` or `\`, escape it with `\`
/// - an empty regex placeholder (`{}`) is shorthand for `{.*?}`
///
/// This function parses this input format and builds a properly escaped *string* representation
/// of the equivalent regular expression. It **does not** however guarantee that the returned
/// string is a syntactically valid regular expression.
///
/// # Examples
///
/// ```
/// # use rustfmt_config::license::TemplateParser;
/// assert_eq!(
/// TemplateParser::parse(
/// r"
/// // Copyright {\d+} The \} Rust \\ Project \{ Developers. See the {([A-Z]+)}
/// // file at the top-level directory of this distribution and at
/// // {}.
/// //
/// // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
/// // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
/// // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
/// // option. This file may not be copied, modified, or distributed
/// // except according to those terms.
/// "
/// ).unwrap(),
/// r"^
/// // Copyright \d+ The \} Rust \\ Project \{ Developers\. See the ([A-Z]+)
/// // file at the top\-level directory of this distribution and at
/// // .*?\.
/// //
/// // Licensed under the Apache License, Version 2\.0 <LICENSE\-APACHE or
/// // http://www\.apache\.org/licenses/LICENSE\-2\.0> or the MIT license
/// // <LICENSE\-MIT or http://opensource\.org/licenses/MIT>, at your
/// // option\. This file may not be copied, modified, or distributed
/// // except according to those terms\.
/// "
/// );
/// ```
pub fn parse(template: &str) -> Result<String, String> {
let mut parser = Self::new();
for chr in template.chars() {
if chr == '\n' {
parser.linum += 1;
}
parser.state = match parser.state {
Lit => parser.trans_from_lit(chr),
LitEsc => parser.trans_from_litesc(chr),
Re(brace_nesting) => parser.trans_from_re(chr, brace_nesting),
ReEsc(brace_nesting) => parser.trans_from_reesc(chr, brace_nesting),
Abort(msg) => return Err(msg),
};
}
// check if we've ended parsing in a valid state
match parser.state {
Abort(msg) => return Err(msg),
Re(_) | ReEsc(_) => {
return Err(format!(
"escape or balance opening brace on l. {}",
parser.open_brace_line
));
}
LitEsc => return Err(format!("incomplete escape sequence on l. {}", parser.linum)),
_ => (),
}
parser.parsed.push_str(&regex::escape(&parser.buffer));
Ok(parser.parsed)
}
fn trans_from_lit(&mut self, chr: char) -> ParsingState {
match chr {
'{' => {
self.parsed.push_str(&regex::escape(&self.buffer));
self.buffer.clear();
self.open_brace_line = self.linum;
Re(1)
}
'}' => Abort(format!(
"escape or balance closing brace on l. {}",
self.linum
)),
'\\' => LitEsc,
_ => {
self.buffer.push(chr);
Lit
}
}
}
fn trans_from_litesc(&mut self, chr: char) -> ParsingState {
self.buffer.push(chr);
Lit
}
fn trans_from_re(&mut self, chr: char, brace_nesting: u32) -> ParsingState {
match chr {
'{' => {
self.buffer.push(chr);
Re(brace_nesting + 1)
}
'}' => {
match brace_nesting {
1 => {
// default regex for empty placeholder {}
if self.buffer.is_empty() {
self.parsed.push_str(".*?");
} else {
self.parsed.push_str(&self.buffer);
}
self.buffer.clear();
Lit
}
_ => {
self.buffer.push(chr);
Re(brace_nesting - 1)
}
}
}
'\\' => {
self.buffer.push(chr);
ReEsc(brace_nesting)
}
_ => {
self.buffer.push(chr);
Re(brace_nesting)
}
}
}
fn trans_from_reesc(&mut self, chr: char, brace_nesting: u32) -> ParsingState {
self.buffer.push(chr);
Re(brace_nesting)
}
}
#[cfg(test)]
mod test {
    use super::TemplateParser;

    /// Exercises the template parser on valid templates and on each kind of parse error.
    #[test]
    fn test_parse_license_template() {
        // (template, expected regex source)
        let accepted = [
            ("literal (.*)", r"^literal \(\.\*\)"),
            (r"escaping \}", r"^escaping \}"),
            (r"{\d+} place{-?}holder{s?}", r"^\d+ place-?holders?"),
            ("default {}", "^default .*?"),
            (
                r"unbalanced nested braces {\{{3}}",
                r"^unbalanced nested braces \{{3}",
            ),
        ];
        for &(template, expected) in accepted.iter() {
            assert_eq!(TemplateParser::parse(template).unwrap(), expected);
        }

        assert!(TemplateParser::parse("unbalanced } without escape").is_err());

        // (template, expected error message)
        let rejected = [
            ("parsing error }", "escape or balance closing brace on l. 1"),
            (
                "parsing error {\nsecond line",
                "escape or balance opening brace on l. 1",
            ),
            (r"parsing error \", "incomplete escape sequence on l. 1"),
        ];
        for &(template, message) in rejected.iter() {
            assert_eq!(TemplateParser::parse(template).unwrap_err(), message);
        }
    }
}

View file

@ -29,6 +29,7 @@ pub mod license;
use config::config_type::ConfigType;
use config::file_lines::FileLines;
use config::license::TemplateParser;
pub use config::lists::*;
pub use config::options::*;
use config::summary::Summary;