auto merge of #8590 : blake2-ppc/rust/std-str, r=alexcrichton

Implement CharIterator as a separate struct, so that it can be .clone()'d. Fix `.char_range_at_reverse` so that it performs better, closer to the forwards version. This makes the reverse iterators and users like `.rfind()` perform better. Before test str::bench::char_iterator ... bench: 146 ns/iter (+/- 0) test str::bench::char_iterator_ascii ... bench: 397 ns/iter (+/- 49) test str::bench::char_iterator_rev ... bench: 576 ns/iter (+/- 8) test str::bench::char_offset_iterator ... bench: 128 ns/iter (+/- 2) test str::bench::char_offset_iterator_rev ... bench: 425 ns/iter (+/- 59) After test str::bench::char_iterator ... bench: 130 ns/iter (+/- 1) test str::bench::char_iterator_ascii ... bench: 307 ns/iter (+/- 5) test str::bench::char_iterator_rev ... bench: 185 ns/iter (+/- 8) test str::bench::char_offset_iterator ... bench: 131 ns/iter (+/- 13) test str::bench::char_offset_iterator_rev ... bench: 183 ns/iter (+/- 2) To be able to use a string slice to represent the CharIterator, a function `slice_unchecked` is added, that does the same as `slice_bytes` but without any boundary checks. It would be possible to implement CharIterator with pointer arithmetic to make it *much more efficient*, but since vec iterator is still improving, it's too early to attempt to re-implement it in other places. Hopefully CharIterator can be implemented on top of vec iterator without any unsafe code later. Additional changes fix the documentation about null termination.
2013-08-21 21:51:30 -07:00 · 2013-08-21 21:51:30 -07:00 · f1132496dd
commit f1132496dd
parent 8e776c75e9 93de60e511
1 changed files with 259 additions and 136 deletions
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@ -8,13 +8,12 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

-/*!
- * String manipulation
- *
- * Strings are a packed UTF-8 representation of text, stored as null
- * terminated buffers of u8 bytes.  Strings should be indexed in bytes,
- * for efficiency, but UTF-8 unsafe operations should be avoided.
- */
+//! String manipulation
+//!
+//! Strings are a packed UTF-8 representation of text, stored as
+//! buffers of u8 bytes. The buffer is not null terminated.
+//! Strings should be indexed in bytes, for efficiency, but UTF-8 unsafe
+//! operations should be avoided.

 use at_vec;
 use cast;
@ -24,10 +23,10 @@ use clone::{Clone, DeepClone};
 use container::{Container, Mutable};
 use iter::Times;
 use iterator::{Iterator, FromIterator, Extendable};
-use iterator::{Filter, AdditiveIterator, Map};
+use iterator::{Filter, AdditiveIterator, Map, Enumerate};
 use iterator::{Invert, DoubleEndedIterator};
 use libc;
-use num::Zero;
+use num::{Saturating, Zero};
 use option::{None, Option, Some};
 use ptr;
 use ptr::RawPtr;
@ -255,56 +254,101 @@ impl<'self, C: CharEq> CharEq for &'self [C] {
 Section: Iterators
 */

+/// External iterator for a string's characters.
+/// Use with the `std::iterator` module.
+#[deriving(Clone)]
+pub struct CharIterator<'self> {
+    /// The slice remaining to be iterated
+    priv string: &'self str,
+}
+
+impl<'self> Iterator<char> for CharIterator<'self> {
+    #[inline]
+    fn next(&mut self) -> Option<char> {
+        // Decode the next codepoint, then update
+        // the slice to be just the remaining part
+        if self.string.len() != 0 {
+            let CharRange {ch, next} = self.string.char_range_at(0);
+            unsafe {
+                self.string = raw::slice_unchecked(self.string, next, self.string.len());
+            }
+            Some(ch)
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        (self.string.len().saturating_add(3)/4, Some(self.string.len()))
+    }
+}
+
+impl<'self> DoubleEndedIterator<char> for CharIterator<'self> {
+    #[inline]
+    fn next_back(&mut self) -> Option<char> {
+        if self.string.len() != 0 {
+            let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
+            unsafe {
+                self.string = raw::slice_unchecked(self.string, 0, next);
+            }
+            Some(ch)
+        } else {
+            None
+        }
+    }
+}
+
+
 /// External iterator for a string's characters and their byte offsets.
 /// Use with the `std::iterator` module.
 #[deriving(Clone)]
 pub struct CharOffsetIterator<'self> {
-    priv index_front: uint,
-    priv index_back: uint,
+    /// The original string to be iterated
    priv string: &'self str,
+    priv iter: CharIterator<'self>,
 }

 impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> {
    #[inline]
    fn next(&mut self) -> Option<(uint, char)> {
-        if self.index_front < self.index_back {
-            let CharRange {ch, next} = self.string.char_range_at(self.index_front);
-            let index = self.index_front;
-            self.index_front = next;
-            Some((index, ch))
-        } else {
-            None
-        }
+        // Compute the byte offset by using the pointer offset between
+        // the original string slice and the iterator's remaining part
+        let offset = do self.string.as_imm_buf |a, _| {
+            do self.iter.string.as_imm_buf |b, _| {
+                b as uint - a as uint
+            }
+        };
+        self.iter.next().map_move(|ch| (offset, ch))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        self.iter.size_hint()
    }
 }

 impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> {
    #[inline]
    fn next_back(&mut self) -> Option<(uint, char)> {
-        if self.index_front < self.index_back {
-            let CharRange {ch, next} = self.string.char_range_at_reverse(self.index_back);
-            self.index_back = next;
-            Some((next, ch))
-        } else {
-            None
-        }
+        self.iter.next_back().map_move(|ch| {
+            let offset = do self.string.as_imm_buf |a, _| {
+                do self.iter.string.as_imm_buf |b, len| {
+                    b as uint - a as uint + len
+                }
+            };
+            (offset, ch)
+        })
    }
 }

-/// External iterator for a string's characters and their byte offsets in reverse order.
-/// Use with the `std::iterator` module.
-pub type CharOffsetRevIterator<'self> =
-    Invert<CharOffsetIterator<'self>>;
-
-/// External iterator for a string's characters.
-/// Use with the `std::iterator` module.
-pub type CharIterator<'self> =
-    Map<'self, (uint, char), char, CharOffsetIterator<'self>>;
-
 /// External iterator for a string's characters in reverse order.
 /// Use with the `std::iterator` module.
-pub type CharRevIterator<'self> =
-    Invert<Map<'self, (uint, char), char, CharOffsetIterator<'self>>>;
+pub type CharRevIterator<'self> = Invert<CharIterator<'self>>;
+
+/// External iterator for a string's characters and their byte offsets in reverse order.
+/// Use with the `std::iterator` module.
+pub type CharOffsetRevIterator<'self> = Invert<CharOffsetIterator<'self>>;

 /// External iterator for a string's bytes.
 /// Use with the `std::iterator` module.
@ -313,12 +357,20 @@ pub type ByteIterator<'self> =

 /// External iterator for a string's bytes in reverse order.
 /// Use with the `std::iterator` module.
-pub type ByteRevIterator<'self> =
-    Invert<Map<'self, &'self u8, u8, vec::VecIterator<'self, u8>>>;
+pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
+
+/// An iterator over byte index and either &u8 or char
+#[deriving(Clone)]
+enum OffsetIterator<'self> {
+    // use ByteIterator here when it can be cloned
+    ByteOffset(Enumerate<vec::VecIterator<'self, u8>>),
+    CharOffset(CharOffsetIterator<'self>),
+}

 /// An iterator over the substrings of a string, separated by `sep`.
 #[deriving(Clone)]
 pub struct CharSplitIterator<'self,Sep> {
+    priv iter: OffsetIterator<'self>,
    priv string: &'self str,
    priv position: uint,
    priv sep: Sep,
@ -327,7 +379,6 @@ pub struct CharSplitIterator<'self,Sep> {
    /// Whether an empty string at the end is allowed
    priv allow_trailing_empty: bool,
    priv finished: bool,
-    priv only_ascii: bool
 }

 /// An iterator over the words of a string, separated by an sequence of whitespace
@ -343,39 +394,39 @@ impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep>
    fn next(&mut self) -> Option<&'self str> {
        if self.finished { return None }

-        let l = self.string.len();
        let start = self.position;
+        let len = self.string.len();

-        if self.only_ascii {
-            // this gives a *huge* speed up for splitting on ASCII
-            // characters (e.g. '\n' or ' ')
-            while self.position < l && self.count > 0 {
-                let byte = self.string[self.position];
-
-                if self.sep.matches(byte as char) {
-                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
-                    self.position += 1;
-                    self.count -= 1;
-                    return Some(slice);
-                }
-                self.position += 1;
-            }
-        } else {
-            while self.position < l && self.count > 0 {
-                let CharRange {ch, next} = self.string.char_range_at(self.position);
-
-                if self.sep.matches(ch) {
-                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
-                    self.position = next;
-                    self.count -= 1;
-                    return Some(slice);
-                }
-                self.position = next;
+        if self.count > 0 {
+            match self.iter {
+                // this gives a *huge* speed up for splitting on ASCII
+                // characters (e.g. '\n' or ' ')
+                ByteOffset(ref mut iter) =>
+                    for (idx, &byte) in *iter {
+                        if self.sep.matches(byte as char) {
+                            self.position = idx + 1;
+                            self.count -= 1;
+                            return Some(unsafe {
+                                raw::slice_bytes(self.string, start, idx)
+                            })
+                        }
+                    },
+                CharOffset(ref mut iter) =>
+                    for (idx, ch) in *iter {
+                        if self.sep.matches(ch) {
+                            // skip over the separator
+                            self.position = self.string.char_range_at(idx).next;
+                            self.count -= 1;
+                            return Some(unsafe {
+                                raw::slice_bytes(self.string, start, idx)
+                            })
+                        }
+                    },
            }
        }
        self.finished = true;
-        if self.allow_trailing_empty || start < l {
-            Some(unsafe { raw::slice_bytes(self.string, start, l) })
+        if self.allow_trailing_empty || start < len {
+            Some(unsafe { raw::slice_bytes(self.string, start, len) })
        } else {
            None
        }
@ -938,12 +989,21 @@ pub mod raw {
    /// If end is greater than the length of the string.
    #[inline]
    pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
-        do s.as_imm_buf |sbuf, n| {
-             assert!((begin <= end));
-             assert!((end <= n));
+        assert!(begin <= end);
+        assert!(end <= s.len());
+        slice_unchecked(s, begin, end)
+    }

+    /// Takes a bytewise (not UTF-8) slice from a string.
+    ///
+    /// Returns the substring from [`begin`..`end`).
+    ///
+    /// Caller must check slice boundaries!
+    #[inline]
+    pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
+        do s.as_imm_buf |sbuf, _n| {
             cast::transmute(Slice {
-                 data: ptr::offset(sbuf, begin as int),
+                 data: sbuf.offset_inbounds(begin as int),
                 len: end - begin,
             })
        }
@ -1302,7 +1362,7 @@ impl<'self> StrSlice<'self> for &'self str {
    /// ~~~
    #[inline]
    fn iter(&self) -> CharIterator<'self> {
-        self.char_offset_iter().map(|(_, c)| c)
+        CharIterator{string: *self}
    }

    /// An iterator over the characters of `self`, in reverse order.
@ -1326,14 +1386,11 @@ impl<'self> StrSlice<'self> for &'self str {
    /// An iterator over the characters of `self` and their byte offsets.
    #[inline]
    fn char_offset_iter(&self) -> CharOffsetIterator<'self> {
-        CharOffsetIterator {
-            index_front: 0,
-            index_back: self.len(),
-            string: *self
-        }
+        CharOffsetIterator{string: *self, iter: self.iter()}
    }

-    /// An iterator over the characters of `self` and their byte offsets.
+    /// An iterator over the characters of `self` and their byte offsets,
+    /// in reverse order.
    #[inline]
    fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self> {
        self.char_offset_iter().invert()
@ -1371,15 +1428,19 @@ impl<'self> StrSlice<'self> for &'self str {
    #[inline]
    fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
        -> CharSplitIterator<'self, Sep> {
-        let only_ascii = sep.only_ascii();
+        let iter = if sep.only_ascii() {
+            ByteOffset(self.as_bytes().iter().enumerate())
+        } else {
+            CharOffset(self.char_offset_iter())
+        };
        CharSplitIterator {
+            iter: iter,
            string: *self,
            position: 0,
            sep: sep,
            count: count,
            allow_trailing_empty: allow_trailing_empty,
            finished: false,
-            only_ascii: only_ascii
        }
    }

@ -1481,8 +1542,7 @@ impl<'self> StrSlice<'self> for &'self str {
    /// beyond the last character of the string
    #[inline]
    fn slice(&self, begin: uint, end: uint) -> &'self str {
-        assert!(self.is_char_boundary(begin));
-        assert!(self.is_char_boundary(end));
+        assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
        unsafe { raw::slice_bytes(*self, begin, end) }
    }

@ -1502,7 +1562,8 @@ impl<'self> StrSlice<'self> for &'self str {
    /// out of bounds.
    #[inline]
    fn slice_to(&self, end: uint) -> &'self str {
-        self.slice(0, end)
+        assert!(self.is_char_boundary(end));
+        unsafe { raw::slice_bytes(*self, 0, end) }
    }

    /// Returns a slice of the string from the char range
@ -1512,23 +1573,24 @@ impl<'self> StrSlice<'self> for &'self str {
    /// beyond the last character of the string.
    fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
        assert!(begin <= end);
-        // not sure how to use the iterators for this nicely.
-        let mut position = 0;
        let mut count = 0;
-        let l = self.len();
-        while count < begin && position < l {
-            position = self.char_range_at(position).next;
-            count += 1;
-        }
-        if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
-        let start_byte = position;
-        while count < end && position < l {
-            position = self.char_range_at(position).next;
-            count += 1;
-        }
-        if count < end { fail!("Attempted to end slice_chars beyond end of string") }
+        let mut begin_byte = None;
+        let mut end_byte = None;

-        self.slice(start_byte, position)
+        // This could be even more efficient by not decoding,
+        // only finding the char boundaries
+        for (idx, _) in self.char_offset_iter() {
+            if count == begin { begin_byte = Some(idx); }
+            if count == end { end_byte = Some(idx); break; }
+            count += 1;
+        }
+        if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
+
+        match (begin_byte, end_byte) {
+            (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
+            (_, None) => fail!("slice_chars: `end` is beyond end of string"),
+            (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
+        }
    }

    /// Returns true if `needle` is a prefix of the string.
@ -1724,6 +1786,7 @@ impl<'self> StrSlice<'self> for &'self str {

    /// Returns false if the index points into the middle of a multi-byte
    /// character sequence.
+    #[inline]
    fn is_char_boundary(&self, index: uint) -> bool {
        if index == self.len() { return true; }
        let b = self[index];
@ -1809,24 +1872,33 @@ impl<'self> StrSlice<'self> for &'self str {
    /// This function can be used to iterate over a unicode string in reverse.
    ///
    /// Returns 0 for next index if called on start index 0.
+    #[inline]
    fn char_range_at_reverse(&self, start: uint) -> CharRange {
        let mut prev = start;

-        // while there is a previous byte == 10......
-        while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 {
-            prev -= 1u;
+        prev = prev.saturating_sub(1);
+        if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
+
+        // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
+        fn multibyte_char_range_at_rev(s: &str, mut i: uint) -> CharRange {
+            // while there is a previous byte == 10......
+            while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
+                i -= 1u;
+            }
+
+            let mut val = s[i] as uint;
+            let w = UTF8_CHAR_WIDTH[val] as uint;
+            assert!((w != 0));
+
+            val = utf8_first_byte!(val, w);
+            val = utf8_acc_cont_byte!(val, s[i + 1]);
+            if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
+            if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
+
+            return CharRange {ch: val as char, next: i};
        }

-        // now refer to the initial byte of previous char
-        if prev > 0u {
-            prev -= 1u;
-        } else {
-            prev = 0u;
-        }
-
-
-        let ch = self.char_at(prev);
-        return CharRange {ch:ch, next:prev};
+        return multibyte_char_range_at_rev(*self, prev);
    }

    /// Plucks the character ending at the `i`th byte of a string
@ -1836,8 +1908,6 @@ impl<'self> StrSlice<'self> for &'self str {
    }

    /// Work with the byte buffer of a string as a byte slice.
-    ///
-    /// The byte slice does not include the null terminator.
    fn as_bytes(&self) -> &'self [u8] {
        unsafe { cast::transmute(*self) }
    }
@ -1854,10 +1924,8 @@ impl<'self> StrSlice<'self> for &'self str {
                if search.matches(b as char) { return Some(i) }
            }
        } else {
-            let mut index = 0;
-            for c in self.iter() {
+            for (index, c) in self.char_offset_iter() {
                if search.matches(c) { return Some(index); }
-                index += c.len_utf8_bytes();
            }
        }

@ -1871,15 +1939,14 @@ impl<'self> StrSlice<'self> for &'self str {
    /// `Some` containing the byte index of the last matching character
    /// or `None` if there is no match
    fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
-        let mut index = self.len();
        if search.only_ascii() {
+            let mut index = self.len();
            for b in self.byte_rev_iter() {
                index -= 1;
                if search.matches(b as char) { return Some(index); }
            }
        } else {
-            for c in self.rev_iter() {
-                index -= c.len_utf8_bytes();
+            for (index, c) in self.char_offset_rev_iter() {
                if search.matches(c) { return Some(index); }
            }
        }
@ -2020,10 +2087,7 @@ impl<'self> StrSlice<'self> for &'self str {

    /// Work with the byte buffer and length of a slice.
    ///
-    /// The given length is one byte longer than the 'official' indexable
-    /// length of the string. This is to permit probing the byte past the
-    /// indexable area for a null byte, as is the case in slices pointing
-    /// to full strings, or suffixes of them.
+    /// The buffer does not have a null terminator.
    #[inline]
    fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T {
        let v: &[u8] = unsafe { cast::transmute(*self) };
@ -2046,12 +2110,10 @@ pub trait OwnedStr {

    /// Work with the mutable byte buffer and length of a slice.
    ///
-    /// The given length is one byte longer than the 'official' indexable
-    /// length of the string. This is to permit probing the byte past the
-    /// indexable area for a null byte, as is the case in slices pointing
-    /// to full strings, or suffixes of them.
+    /// The buffer does not have a null terminator.
    ///
-    /// Make sure any mutations to this buffer keep this string valid UTF8.
+    /// The caller must make sure any mutations to this buffer keep the string
+    /// valid UTF-8!
    fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T;
 }

@ -2152,12 +2214,10 @@ impl OwnedStr for ~str {
        new_str
    }

-    /// Reserves capacity for exactly `n` bytes in the given string, not including
-    /// the null terminator.
+    /// Reserves capacity for exactly `n` bytes in the given string.
    ///
    /// Assuming single-byte characters, the resulting string will be large
-    /// enough to hold a string of length `n`. To account for the null terminator,
-    /// the underlying buffer will have the size `n` + 1.
+    /// enough to hold a string of length `n`.
    ///
    /// If the capacity for `s` is already equal to or greater than the requested
    /// capacity, then no action is taken.
@ -2177,8 +2237,7 @@ impl OwnedStr for ~str {
    /// Reserves capacity for at least `n` bytes in the given string.
    ///
    /// Assuming single-byte characters, the resulting string will be large
-    /// enough to hold a string of length `n`. To account for the null terminator,
-    /// the underlying buffer will have the size `n` + 1.
+    /// enough to hold a string of length `n`.
    ///
    /// This function will over-allocate in order to amortize the allocation costs
    /// in scenarios where the caller may need to repeatedly reserve additional
@ -3210,6 +3269,14 @@ mod tests {
        assert_eq!(pos, v.len());
    }

+    #[test]
+    fn test_iterator_clone() {
+        let s = "ศไทย中华Việt Nam";
+        let mut it = s.iter();
+        it.next();
+        assert!(it.zip(it.clone()).all(|(x,y)| x == y));
+    }
+
    #[test]
    fn test_byte_iterator() {
        let s = ~"ศไทย中华Việt Nam";
@ -3425,6 +3492,62 @@ mod tests {
 mod bench {
    use extra::test::BenchHarness;
    use super::*;
+    use prelude::*;
+
+    #[bench]
+    fn char_iterator(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_iterator_ascii(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_iterator_rev(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.rev_iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_offset_iterator(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.char_offset_iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_offset_iterator_rev(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.char_offset_rev_iter().len(), len);
+        }
+    }

    #[bench]
    fn is_utf8_100_ascii(bh: &mut BenchHarness) {