std: Speed up str::is_utf8

Use unchecked vec indexing since the vector bounds are checked by the loop. Iterators are not easy to use in this case since we skip 1-4 bytes each lap. This part of the commit speeds up is_utf8 for ASCII input.

Check codepoint ranges by checking the byte ranges manually instead of computing a full decoding for multibyte encodings. This is easy to read and corresponds to the UTF-8 syntax in the RFC.

No changes to what we accept. A comment notes that surrogate halves are accepted.

Before:

test str::bench::is_utf8_100_ascii ... bench: 165 ns/iter (+/- 3)
test str::bench::is_utf8_100_multibyte ... bench: 218 ns/iter (+/- 5)

After:

test str::bench::is_utf8_100_ascii ... bench: 130 ns/iter (+/- 1)
test str::bench::is_utf8_100_multibyte ... bench: 156 ns/iter (+/- 3)
2013-08-02 18:34:00 +02:00 · 2013-08-02 18:34:00 +02:00 · 0504d7e57b
commit 0504d7e57b
parent 2460170e6a
1 changed files with 66 additions and 34 deletions
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -564,51 +564,63 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
 Section: Misc
 */

-// Return the initial codepoint accumulator for the first byte.
-// The first byte is special, only want bottom 5 bits for width 2, 4 bits
-// for width 3, and 3 bits for width 4
-macro_rules! utf8_first_byte(
-    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
-)
-
-// return the value of $ch updated with continuation byte $byte
-macro_rules! utf8_acc_cont_byte(
-    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
-)
-
 /// Determines if a vector of bytes contains valid UTF-8
 pub fn is_utf8(v: &[u8]) -> bool {
    let mut i = 0u;
    let total = v.len();
+    fn unsafe_get(xs: &[u8], i: uint) -> u8 {
+        unsafe { *xs.unsafe_ref(i) }
+    }
    while i < total {
-        if v[i] < 128u8 {
+        let v_i = unsafe_get(v, i);
+        if v_i < 128u8 {
            i += 1u;
        } else {
-            let w = utf8_char_width(v[i]);
+            let w = utf8_char_width(v_i);
            if w == 0u { return false; }

            let nexti = i + w;
            if nexti > total { return false; }
-            // 1. Make sure the correct number of continuation bytes are present
-            // 2. Check codepoint ranges (deny overlong encodings)
-            //    2-byte encoding is for codepoints  \u0080 to  \u07ff
-            //    3-byte encoding is for codepoints  \u0800 to  \uffff
-            //    4-byte encoding is for codepoints \u10000 to \u10ffff

-            //    2-byte encodings are correct if the width and continuation match up
-            if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
-            if w > 2 {
-                let mut ch;
-                ch = utf8_first_byte!(v[i], w);
-                ch = utf8_acc_cont_byte!(ch, v[i + 1]);
-                if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
-                ch = utf8_acc_cont_byte!(ch, v[i + 2]);
-                if w == 3 && ch < MAX_TWO_B { return false; }
-                if w > 3 {
-                    if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; }
-                    ch = utf8_acc_cont_byte!(ch, v[i + 3]);
-                    if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; }
-                }
+            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
+            //        first  C2 80        last DF BF
+            // 3-byte encoding is for codepoints  \u0800 to  \uffff
+            //        first  E0 A0 80     last EF BF BF
+            // 4-byte encoding is for codepoints \u10000 to \u10ffff
+            //        first  F0 90 80 80  last F4 8F BF BF
+            //
+            // Use the UTF-8 syntax from the RFC
+            //
+            // https://tools.ietf.org/html/rfc3629
+            // UTF8-1      = %x00-7F
+            // UTF8-2      = %xC2-DF UTF8-tail
+            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+            //               %xF4 %x80-8F 2( UTF8-tail )
+            // UTF8-tail   = %x80-BF
+            // --
+            // This code allows surrogate halves: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
+            match w {
+                2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
+                    return false
+                },
+                3 => match (v_i,
+                            unsafe_get(v, i + 1),
+                            unsafe_get(v, i + 2) & 192u8) {
+                    (0xE0        , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
+                    (0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
+                    _ => return false,
+                },
+                _ => match (v_i,
+                            unsafe_get(v, i + 1),
+                            unsafe_get(v, i + 2) & 192u8,
+                            unsafe_get(v, i + 3) & 192u8) {
+                    (0xF0        , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
+                    (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
+                    (0xF4        , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
+                    _ => return false,
+                },
            }

            i = nexti;
@@ -756,6 +768,18 @@ pub struct CharRange {
    next: uint
 }

+// Return the initial codepoint accumulator for the first byte.
+// The first byte is special, only want bottom 5 bits for width 2, 4 bits
+// for width 3, and 3 bits for width 4
+macro_rules! utf8_first_byte(
+    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
+)
+
+// return the value of $ch updated with continuation byte $byte
+macro_rules! utf8_acc_cont_byte(
+    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
+)
+
 // UTF-8 tags and ranges
 priv static TAG_CONT_U8: u8 = 128u8;
 priv static TAG_CONT: uint = 128u;
@@ -2833,13 +2857,21 @@ mod tests {
    }

    #[test]
-    fn test_is_utf8_deny_overlong() {
+    fn test_is_utf8() {
        assert!(!is_utf8([0xc0, 0x80]));
        assert!(!is_utf8([0xc0, 0xae]));
        assert!(!is_utf8([0xe0, 0x80, 0x80]));
        assert!(!is_utf8([0xe0, 0x80, 0xaf]));
        assert!(!is_utf8([0xe0, 0x81, 0x81]));
        assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
+        assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
+
+        assert!(is_utf8([0xC2, 0x80]));
+        assert!(is_utf8([0xDF, 0xBF]));
+        assert!(is_utf8([0xE0, 0xA0, 0x80]));
+        assert!(is_utf8([0xEF, 0xBF, 0xBF]));
+        assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
+        assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
    }