Add Utf8Error::resume_from, to help incremental and/or lossy decoding.
Without this, code outside of the standard library needs to reimplement most of the logic of `from_utf8` to interpret the bytes after `valid_up_to()`.
This commit is contained in:
parent
fd182c4010
commit
182044248c
3 changed files with 87 additions and 22 deletions
|
@ -28,6 +28,7 @@
|
||||||
#![feature(test)]
|
#![feature(test)]
|
||||||
#![feature(unboxed_closures)]
|
#![feature(unboxed_closures)]
|
||||||
#![feature(unicode)]
|
#![feature(unicode)]
|
||||||
|
#![feature(utf8_error_resume_from)]
|
||||||
|
|
||||||
extern crate collections;
|
extern crate collections;
|
||||||
extern crate test;
|
extern crate test;
|
||||||
|
|
|
@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn from_utf8_error() {
|
||||||
|
macro_rules! test {
|
||||||
|
($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => {
|
||||||
|
let error = from_utf8($input).unwrap_err();
|
||||||
|
assert_eq!(error.valid_up_to(), $expected_valid_up_to);
|
||||||
|
assert_eq!(error.resume_from(), $expected_resume_from);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
test!(b"A\xC3\xA9 \xFF ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \x80 ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xC1 ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xC1", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xC2", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xC2 ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xE0", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xE0\xA0", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6));
|
||||||
|
test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6));
|
||||||
|
test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xF1", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xF1\x80", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
|
||||||
|
test!(b"A\xC3\xA9 \xF1 ", 4, Some(5));
|
||||||
|
test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6));
|
||||||
|
test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_as_bytes() {
|
fn test_as_bytes() {
|
||||||
// no null
|
// no null
|
||||||
|
|
|
@ -125,13 +125,14 @@ Section: Creating a string
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub struct Utf8Error {
|
pub struct Utf8Error {
|
||||||
valid_up_to: usize,
|
valid_up_to: usize,
|
||||||
|
invalid_length: Option<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Utf8Error {
|
impl Utf8Error {
|
||||||
/// Returns the index in the given string up to which valid UTF-8 was
|
/// Returns the index in the given string up to which valid UTF-8 was
|
||||||
/// verified.
|
/// verified.
|
||||||
///
|
///
|
||||||
/// It is the maximum index such that `from_utf8(input[..index])`
|
/// It is the maximum index such that `from_utf8(&input[..index])`
|
||||||
/// would return `Ok(_)`.
|
/// would return `Ok(_)`.
|
||||||
///
|
///
|
||||||
/// # Examples
|
/// # Examples
|
||||||
|
@ -152,6 +153,21 @@ impl Utf8Error {
|
||||||
/// ```
|
/// ```
|
||||||
#[stable(feature = "utf8_error", since = "1.5.0")]
|
#[stable(feature = "utf8_error", since = "1.5.0")]
|
||||||
pub fn valid_up_to(&self) -> usize { self.valid_up_to }
|
pub fn valid_up_to(&self) -> usize { self.valid_up_to }
|
||||||
|
|
||||||
|
/// Provide more information about the failure:
|
||||||
|
///
|
||||||
|
/// * `None`: the end of the input was reached unexpectedly.
|
||||||
|
/// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
|
||||||
|
/// If a byte stream (such as a file or a network socket) is being decoded incrementally,
|
||||||
|
/// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
|
||||||
|
///
|
||||||
|
/// * `Some(index)`: an unexpected byte was encountered.
|
||||||
|
/// The index provided is where decoding should resume
|
||||||
|
/// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
|
||||||
|
#[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")]
|
||||||
|
pub fn resume_from(&self) -> Option<usize> {
|
||||||
|
self.invalid_length.map(|l| self.valid_up_to + l as usize)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts a slice of bytes to a string slice.
|
/// Converts a slice of bytes to a string slice.
|
||||||
|
@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
impl fmt::Display for Utf8Error {
|
impl fmt::Display for Utf8Error {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
|
if let Some(invalid_length) = self.invalid_length {
|
||||||
|
write!(f, "invalid utf-8 sequence of {} bytes from index {}",
|
||||||
|
invalid_length, self.valid_up_to)
|
||||||
|
} else {
|
||||||
|
write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1241,17 +1262,20 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||||
|
|
||||||
while index < len {
|
while index < len {
|
||||||
let old_offset = index;
|
let old_offset = index;
|
||||||
macro_rules! err { () => {{
|
macro_rules! err {
|
||||||
return Err(Utf8Error {
|
($invalid_length: expr) => {
|
||||||
valid_up_to: old_offset
|
return Err(Utf8Error {
|
||||||
})
|
valid_up_to: old_offset,
|
||||||
}}}
|
invalid_length: $invalid_length,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
macro_rules! next { () => {{
|
macro_rules! next { () => {{
|
||||||
index += 1;
|
index += 1;
|
||||||
// we needed data, but there was none: error!
|
// we needed data, but there was none: error!
|
||||||
if index >= len {
|
if index >= len {
|
||||||
err!()
|
err!(None)
|
||||||
}
|
}
|
||||||
v[index]
|
v[index]
|
||||||
}}}
|
}}}
|
||||||
|
@ -1259,7 +1283,6 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||||
let first = v[index];
|
let first = v[index];
|
||||||
if first >= 128 {
|
if first >= 128 {
|
||||||
let w = UTF8_CHAR_WIDTH[first as usize];
|
let w = UTF8_CHAR_WIDTH[first as usize];
|
||||||
let second = next!();
|
|
||||||
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
||||||
// first C2 80 last DF BF
|
// first C2 80 last DF BF
|
||||||
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
||||||
|
@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||||
// %xF4 %x80-8F 2( UTF8-tail )
|
// %xF4 %x80-8F 2( UTF8-tail )
|
||||||
match w {
|
match w {
|
||||||
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
|
2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||||
|
err!(Some(1))
|
||||||
|
},
|
||||||
3 => {
|
3 => {
|
||||||
match (first, second, next!() & !CONT_MASK) {
|
match (first, next!()) {
|
||||||
(0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) |
|
(0xE0 , 0xA0 ... 0xBF) |
|
||||||
(0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
|
(0xE1 ... 0xEC, 0x80 ... 0xBF) |
|
||||||
(0xED , 0x80 ... 0x9F, TAG_CONT_U8) |
|
(0xED , 0x80 ... 0x9F) |
|
||||||
(0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
|
(0xEE ... 0xEF, 0x80 ... 0xBF) => {}
|
||||||
_ => err!()
|
_ => err!(Some(1))
|
||||||
|
}
|
||||||
|
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||||
|
err!(Some(2))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
4 => {
|
4 => {
|
||||||
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
|
match (first, next!()) {
|
||||||
(0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
|
(0xF0 , 0x90 ... 0xBF) |
|
||||||
(0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
|
(0xF1 ... 0xF3, 0x80 ... 0xBF) |
|
||||||
(0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
|
(0xF4 , 0x80 ... 0x8F) => {}
|
||||||
_ => err!()
|
_ => err!(Some(1))
|
||||||
|
}
|
||||||
|
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||||
|
err!(Some(2))
|
||||||
|
}
|
||||||
|
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||||
|
err!(Some(3))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => err!()
|
_ => err!(Some(1))
|
||||||
}
|
}
|
||||||
index += 1;
|
index += 1;
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in a new issue