diff --git a/src/libcollections/lib.rs b/src/libcollections/lib.rs index 2639e6dce46..d1e91b28c46 100644 --- a/src/libcollections/lib.rs +++ b/src/libcollections/lib.rs @@ -56,6 +56,7 @@ #![feature(unicode)] #![feature(unique)] #![feature(unsafe_no_drop_flag, filling_drop)] +#![feature(decode_utf16)] #![feature(utf8_error)] #![cfg_attr(test, feature(rand, test))] diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs index 5c5f6cace6a..e7b2423c7c4 100644 --- a/src/libcollections/string.rs +++ b/src/libcollections/string.rs @@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index}; use core::ptr; use core::slice; use core::str::pattern::Pattern; +use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER}; use rustc_unicode::str as unicode_str; -use rustc_unicode::str::Utf16Item; use borrow::{Cow, IntoCow}; use range::RangeArgument; @@ -267,14 +267,7 @@ impl String { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf16(v: &[u16]) -> Result { - let mut s = String::with_capacity(v.len()); - for c in unicode_str::utf16_items(v) { - match c { - Utf16Item::ScalarValue(c) => s.push(c), - Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())), - } - } - Ok(s) + decode_utf16(v.iter().cloned()).collect::>().map_err(|_| FromUtf16Error(())) } /// Decode a UTF-16 encoded vector `v` into a string, replacing @@ -294,7 +287,7 @@ impl String { #[inline] #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf16_lossy(v: &[u16]) -> String { - unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect() + decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect() } /// Creates a new `String` from a length, capacity, and pointer. diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs index e077a0d8fe7..9512b35267c 100644 --- a/src/libcoretest/char.rs +++ b/src/libcoretest/char.rs @@ -211,3 +211,12 @@ fn test_len_utf16() { assert!('\u{a66e}'.len_utf16() == 1); assert!('\u{1f4a9}'.len_utf16() == 2); } + +#[test] +fn test_decode_utf16() { + fn check(s: &[u16], expected: &[Result]) { + assert_eq!(::std::char::decode_utf16(s.iter().cloned()).collect::>(), expected); + } + check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]); + check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]); +} diff --git a/src/libcoretest/lib.rs b/src/libcoretest/lib.rs index 6313e42e0ed..dda1b096e88 100644 --- a/src/libcoretest/lib.rs +++ b/src/libcoretest/lib.rs @@ -19,6 +19,7 @@ #![feature(float_from_str_radix)] #![feature(flt2dec)] #![feature(dec2flt)] +#![feature(decode_utf16)] #![feature(fmt_radix)] #![feature(iter_arith)] #![feature(iter_arith)] diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs index 780f8aa5be9..e08b3244109 100644 --- a/src/librustc_unicode/char.rs +++ b/src/librustc_unicode/char.rs @@ -503,3 +503,116 @@ impl char { ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) } } + +/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s. +#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")] +#[derive(Clone)] +pub struct DecodeUtf16 where I: Iterator { + iter: I, + buf: Option, +} + +/// Create an iterator over the UTF-16 encoded codepoints in `iterable`, +/// returning unpaired surrogates as `Err`s. +/// +/// # Examples +/// +/// ``` +/// #![feature(decode_utf16)] +/// +/// use std::char::decode_utf16; +/// +/// fn main() { +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(decode_utf16(v.iter().cloned()).collect::>(), +/// vec![Ok('𝄞'), +/// Ok('m'), Ok('u'), Ok('s'), +/// Err(0xDD1E), +/// Ok('i'), Ok('c'), +/// Err(0xD834)]); +/// } +/// ``` +/// +/// A lossy decoder can be obtained by replacing `Err` results with the replacement character: +/// +/// ``` +/// #![feature(decode_utf16)] +/// +/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; +/// +/// fn main() { +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(decode_utf16(v.iter().cloned()) +/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) +/// .collect::(), +/// "𝄞mus�ic�"); +/// } +/// ``` +#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")] +#[inline] +pub fn decode_utf16>(iterable: I) -> DecodeUtf16 { + DecodeUtf16 { + iter: iterable.into_iter(), + buf: None, + } +} + +#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")] +impl> Iterator for DecodeUtf16 { + type Item = Result; + + fn next(&mut self) -> Option> { + let u = match self.buf.take() { + Some(buf) => buf, + None => match self.iter.next() { + Some(u) => u, + None => return None + } + }; + + if u < 0xD800 || 0xDFFF < u { + // not a surrogate + Some(Ok(unsafe { from_u32_unchecked(u as u32) })) + } else if u >= 0xDC00 { + // a trailing surrogate + Some(Err(u)) + } else { + let u2 = match self.iter.next() { + Some(u2) => u2, + // eof + None => return Some(Err(u)) + }; + if u2 < 0xDC00 || u2 > 0xDFFF { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.buf = Some(u2); + return Some(Err(u)) + } + + // all ok, so lets decode it. + let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; + Some(Ok(unsafe { from_u32_unchecked(c) })) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) + (low / 2, high) + } +} + +/// U+FFFD REPLACEMENT CHARACTER (�) is used in Unicode to represent a decoding error. +/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`. +#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")] +pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; diff --git a/src/librustc_unicode/lib.rs b/src/librustc_unicode/lib.rs index d046393cdeb..4f0aa69d771 100644 --- a/src/librustc_unicode/lib.rs +++ b/src/librustc_unicode/lib.rs @@ -46,6 +46,7 @@ mod tables; mod u_str; pub mod char; +#[allow(deprecated)] pub mod str { pub use u_str::{UnicodeStr, SplitWhitespace}; pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item}; diff --git a/src/librustc_unicode/u_str.rs b/src/librustc_unicode/u_str.rs index f6e6ac508a7..67333c98fcf 100644 --- a/src/librustc_unicode/u_str.rs +++ b/src/librustc_unicode/u_str.rs @@ -13,8 +13,9 @@ //! This module provides functionality to `str` that requires the Unicode methods provided by the //! unicode parts of the CharExt trait. +use char::{DecodeUtf16, decode_utf16}; use core::char; -use core::iter::Filter; +use core::iter::{Cloned, Filter}; use core::slice; use core::str::Split; @@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool { /// An iterator that decodes UTF-16 encoded codepoints from a vector /// of `u16`s. +#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")] +#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] +#[allow(deprecated)] #[derive(Clone)] pub struct Utf16Items<'a> { - iter: slice::Iter<'a, u16> + decoder: DecodeUtf16>> } + /// The possibilities for values decoded from a `u16` stream. +#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result` instead")] +#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] +#[allow(deprecated)] #[derive(Copy, PartialEq, Eq, Clone, Debug)] pub enum Utf16Item { /// A valid codepoint. @@ -132,6 +140,7 @@ pub enum Utf16Item { LoneSurrogate(u16) } +#[allow(deprecated)] impl Utf16Item { /// Convert `self` to a `char`, taking `LoneSurrogate`s to the /// replacement character (U+FFFD). @@ -144,49 +153,22 @@ impl Utf16Item { } } +#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")] +#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] +#[allow(deprecated)] impl<'a> Iterator for Utf16Items<'a> { type Item = Utf16Item; fn next(&mut self) -> Option { - let u = match self.iter.next() { - Some(u) => *u, - None => return None - }; - - if u < 0xD800 || 0xDFFF < u { - // not a surrogate - Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) })) - } else if u >= 0xDC00 { - // a trailing surrogate - Some(Utf16Item::LoneSurrogate(u)) - } else { - // preserve state for rewinding. - let old = self.iter.clone(); - - let u2 = match self.iter.next() { - Some(u2) => *u2, - // eof - None => return Some(Utf16Item::LoneSurrogate(u)) - }; - if u2 < 0xDC00 || u2 > 0xDFFF { - // not a trailing surrogate so we're not a valid - // surrogate pair, so rewind to redecode u2 next time. - self.iter = old.clone(); - return Some(Utf16Item::LoneSurrogate(u)) - } - - // all ok, so lets decode it. - let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; - Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) })) - } + self.decoder.next().map(|result| match result { + Ok(c) => Utf16Item::ScalarValue(c), + Err(s) => Utf16Item::LoneSurrogate(s), + }) } #[inline] fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.iter.size_hint(); - // we could be entirely valid surrogates (2 elements per - // char), or entirely non-surrogates (1 element per char) - (low / 2, high) + self.decoder.size_hint() } } @@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> { /// # Examples /// /// ``` -/// #![feature(unicode)] +/// #![feature(unicode, decode_utf16)] /// /// extern crate rustc_unicode; /// @@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> { /// LoneSurrogate(0xD834)]); /// } /// ``` +#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")] +#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] +#[allow(deprecated)] pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> { - Utf16Items { iter : v.iter() } + Utf16Items { decoder: decode_utf16(v.iter().cloned()) } } /// Iterator adaptor for encoding `char`s to UTF-16. diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs index e474f47a1b5..09f98978653 100644 --- a/src/libserialize/json.rs +++ b/src/libserialize/json.rs @@ -209,8 +209,6 @@ use std::str::FromStr; use std::string; use std::{char, f64, fmt, str}; use std; -use rustc_unicode::str as unicode_str; -use rustc_unicode::str::Utf16Item; use Encodable; @@ -1712,11 +1710,13 @@ impl> Parser { _ => return self.error(UnexpectedEndOfHexEscape), } - let buf = [n1, try!(self.decode_hex_escape())]; - match unicode_str::utf16_items(&buf).next() { - Some(Utf16Item::ScalarValue(c)) => res.push(c), - _ => return self.error(LoneLeadingSurrogateInHexEscape), + let n2 = try!(self.decode_hex_escape()); + if n2 < 0xDC00 || n2 > 0xDFFF { + return self.error(LoneLeadingSurrogateInHexEscape) } + let c = (((n1 - 0xD800) as u32) << 10 | + (n2 - 0xDC00) as u32) + 0x1_0000; + res.push(char::from_u32(c).unwrap()); } n => match char::from_u32(n as u32) { diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 179f0727d46..fca4c66112e 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -242,6 +242,7 @@ #![feature(unicode)] #![feature(unique)] #![feature(unsafe_no_drop_flag, filling_drop)] +#![feature(decode_utf16)] #![feature(vec_push_all)] #![feature(vec_resize)] #![feature(wrapping)] diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs index 9e4a80a411b..eb313d275a1 100644 --- a/src/libstd/sys/common/wtf8.rs +++ b/src/libstd/sys/common/wtf8.rs @@ -37,7 +37,6 @@ use hash::{Hash, Hasher}; use iter::FromIterator; use mem; use ops; -use rustc_unicode::str::{Utf16Item, utf16_items}; use slice; use str; use string::String; @@ -186,14 +185,14 @@ impl Wtf8Buf { /// will always return the original code units. pub fn from_wide(v: &[u16]) -> Wtf8Buf { let mut string = Wtf8Buf::with_capacity(v.len()); - for item in utf16_items(v) { + for item in char::decode_utf16(v.iter().cloned()) { match item { - Utf16Item::ScalarValue(c) => string.push_char(c), - Utf16Item::LoneSurrogate(s) => { + Ok(ch) => string.push_char(ch), + Err(surrogate) => { // Surrogates are known to be in the code point range. - let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) }; + let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; // Skip the WTF-8 concatenation check, - // surrogate pairs are already decoded by utf16_items + // surrogate pairs are already decoded by decode_utf16 string.push_code_point_unchecked(code_point) } }