Auto merge of #89165 - jkugelman:read-to-end-overallocation, r=joshtriplett

Fix read_to_end to not grow an exact size buffer

If you know how much data to expect and use `Vec::with_capacity` to pre-allocate a buffer of exactly that capacity, `Read::read_to_end` will still double the buffer's capacity: it needs spare space to attempt a final read, even though that read only ends up returning `0`.

It's a bummer to carefully pre-allocate 1GB to read a 1GB file into memory and end up using 2GB.
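A minimal sketch of the failure mode, assuming a hypothetical `big.bin` on disk (exact capacities depend on the allocator and on whether your std includes this fix):

```rust
use std::fs::File;
use std::io::Read;

fn main() -> std::io::Result<()> {
    let mut file = File::open("big.bin")?;
    let size = file.metadata()?.len() as usize;

    // Pre-allocate exactly as many bytes as the file holds.
    let mut buf = Vec::with_capacity(size);
    file.read_to_end(&mut buf)?;

    assert_eq!(buf.len(), size);
    // Before this fix, the final zero-byte read forced a capacity doubling,
    // so this could print a capacity of roughly 2 * size. After the fix it
    // stays at the pre-allocated size.
    println!("len = {}, capacity = {}", buf.len(), buf.capacity());
    Ok(())
}
```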

This fixes that behavior by special-casing a full buffer and reading into a small "probe" buffer on the stack instead. If that read returns `0`, it confirms the buffer was exactly the right size. If it doesn't, the probe's contents are appended to the main buffer and the read loop continues.

Fixing this allows several workarounds in the standard library to be removed:

- `Take` no longer needs to override `Read::read_to_end`.
- The `reservation_size` callback that allowed `Take` to inhibit the previous over-allocation behavior isn't needed.
- `fs::read` doesn't need to reserve an extra byte in `initial_buffer_size`.

Curiously, there was a unit test that specifically checked that `Read::read_to_end` *does* over-allocate. I updated that test to assert the opposite.
bors 2021-10-04 04:44:56 +00:00
commit d25de31a0e
3 changed files with 37 additions and 41 deletions

library/std/src/fs.rs:

```diff
@@ -200,10 +200,9 @@ pub struct DirBuilder {
 
 /// Indicates how large a buffer to pre-allocate before reading the entire file.
 fn initial_buffer_size(file: &File) -> usize {
-    // Allocate one extra byte so the buffer doesn't need to grow before the
-    // final `read` call at the end of the file. Don't worry about `usize`
-    // overflow because reading will fail regardless in that case.
-    file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0)
+    // Don't worry about `usize` overflow because reading will fail regardless
+    // in that case.
+    file.metadata().map(|m| m.len() as usize).unwrap_or(0)
 }
 
 /// Read the entire contents of a file into a bytes vector.
```
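For context, `fs::read` is the consumer of `initial_buffer_size`. A simplified paraphrase of its shape (`read_sketch` is illustrative, not the verbatim std source):

```rust
use std::fs::File;
use std::io::{self, Read};
use std::path::Path;

// Simplified paraphrase of std::fs::read: pre-size the Vec from the file
// metadata, then let read_to_end do the actual reading. With this change,
// an exact metadata-based guess no longer forces the buffer to grow.
fn read_sketch<P: AsRef<Path>>(path: P) -> io::Result<Vec<u8>> {
    let mut file = File::open(path)?;
    let size = file.metadata().map(|m| m.len() as usize).unwrap_or(0);
    let mut bytes = Vec::with_capacity(size);
    file.read_to_end(&mut bytes)?;
    Ok(bytes)
}
```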

library/std/src/io/mod.rs:

```diff
@@ -362,22 +362,18 @@ where
 // Because we're extending the buffer with uninitialized data for trusted
 // readers, we need to make sure to truncate that if any of this panics.
 fn read_to_end<R: Read + ?Sized>(r: &mut R, buf: &mut Vec<u8>) -> Result<usize> {
-    read_to_end_with_reservation(r, buf, |_| 32)
-}
-
-fn read_to_end_with_reservation<R, F>(
-    r: &mut R,
-    buf: &mut Vec<u8>,
-    mut reservation_size: F,
-) -> Result<usize>
-where
-    R: Read + ?Sized,
-    F: FnMut(&R) -> usize,
-{
     let start_len = buf.len();
+    let start_cap = buf.capacity();
     let mut g = Guard { len: buf.len(), buf };
     loop {
-        if g.len == g.buf.len() {
+        // If we've read all the way up to the capacity, reserve more space.
+        if g.len == g.buf.capacity() {
+            g.buf.reserve(32);
+        }
+
+        // Initialize any excess capacity and adjust the length so we can write
+        // to it.
+        if g.buf.len() < g.buf.capacity() {
             unsafe {
                 // FIXME(danielhenrymantilla): #42788
                 //
@@ -387,7 +383,6 @@ where
                 // - Only the standard library gets to soundly "ignore" this,
                 //   based on its privileged knowledge of unstable rustc
                 //   internals;
-                g.buf.reserve(reservation_size(r));
                 let capacity = g.buf.capacity();
                 g.buf.set_len(capacity);
                 r.initializer().initialize(&mut g.buf[g.len..]);
@@ -404,9 +399,30 @@ where
                 assert!(n <= buf.len());
                 g.len += n;
             }
-            Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
+            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
             Err(e) => return Err(e),
         }
+
+        if g.len == g.buf.capacity() && g.buf.capacity() == start_cap {
+            // The buffer might be an exact fit. Let's read into a probe buffer
+            // and see if it returns `Ok(0)`. If so, we've avoided an
+            // unnecessary doubling of the capacity. But if not, append the
+            // probe buffer to the primary buffer and let its capacity grow.
+            let mut probe = [0u8; 32];
+
+            loop {
+                match r.read(&mut probe) {
+                    Ok(0) => return Ok(g.len - start_len),
+                    Ok(n) => {
+                        g.buf.extend_from_slice(&probe[..n]);
+                        g.len += n;
+                        break;
+                    }
+                    Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
+                    Err(e) => return Err(e),
+                }
+            }
+        }
     }
 }
 
```
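The `Guard` in the loop above is what the "truncate if any of this panics" comment refers to. A paraphrase of it (not part of this diff):

```rust
// Paraphrase of the guard used by read_to_end: `len` tracks how many bytes
// are known to be initialized. If anything between set_len() and a
// successful read panics, Drop rewinds the Vec to that safe length so
// uninitialized bytes are never observable.
struct Guard<'a> {
    buf: &'a mut Vec<u8>,
    len: usize,
}

impl Drop for Guard<'_> {
    fn drop(&mut self) {
        unsafe {
            // Safety: `len` never exceeds the initialized portion of `buf`.
            self.buf.set_len(self.len);
        }
    }
}
```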
Still in library/std/src/io/mod.rs, the `Take` override is removed:

```diff
@@ -2584,13 +2600,6 @@ impl<T: Read> Read for Take<T> {
     unsafe fn initializer(&self) -> Initializer {
         self.inner.initializer()
     }
-
-    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<usize> {
-        // Pass in a reservation_size closure that respects the current value
-        // of limit for each read. If we hit the read limit, this prevents the
-        // final zero-byte read from allocating again.
-        read_to_end_with_reservation(self, buf, |self_| cmp::min(self_.limit, 32) as usize)
-    }
 }
 
 #[stable(feature = "rust1", since = "1.0.0")]
```
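With the override gone, `Take` relies on the default `read_to_end`, and the probe read preserves the no-growth behavior the override used to provide by hand. A quick demonstration (the capacity assertion assumes a std with this fix, and that `with_capacity(5)` allocates exactly 5; strictly it only guarantees *at least* 5):

```rust
use std::io::{Cursor, Read};

fn main() -> std::io::Result<()> {
    let reader = Cursor::new(b"hello world".to_vec());

    // Read at most 5 bytes into a buffer pre-sized for exactly 5.
    let mut buf = Vec::with_capacity(5);
    reader.take(5).read_to_end(&mut buf)?;

    assert_eq!(buf, b"hello");
    // Hitting the limit triggers the probe read, which returns Ok(0)
    // without growing the buffer.
    assert_eq!(buf.capacity(), 5);
    Ok(())
}
```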

library/std/src/io/tests.rs:

```diff
@@ -362,24 +362,12 @@ impl<'a> Read for ExampleSliceReader<'a> {
 fn test_read_to_end_capacity() -> io::Result<()> {
     let input = &b"foo"[..];
 
-    // read_to_end() generally needs to over-allocate, both for efficiency
-    // and so that it can distinguish EOF. Assert that this is the case
-    // with this simple ExampleSliceReader struct, which uses the default
-    // implementation of read_to_end. Even though vec1 is allocated with
-    // exactly enough capacity for the read, read_to_end will allocate more
-    // space here.
+    // read_to_end() takes care not to over-allocate when a buffer is the
+    // exact size needed.
     let mut vec1 = Vec::with_capacity(input.len());
     ExampleSliceReader { slice: input }.read_to_end(&mut vec1)?;
     assert_eq!(vec1.len(), input.len());
-    assert!(vec1.capacity() > input.len(), "allocated more");
-
-    // However, std::io::Take includes an implementation of read_to_end
-    // that will not allocate when the limit has already been reached. In
-    // this case, vec2 never grows.
-    let mut vec2 = Vec::with_capacity(input.len());
-    ExampleSliceReader { slice: input }.take(input.len() as u64).read_to_end(&mut vec2)?;
-    assert_eq!(vec2.len(), input.len());
-    assert_eq!(vec2.capacity(), input.len(), "did not allocate more");
+    assert_eq!(vec1.capacity(), input.len(), "did not allocate more");
 
     Ok(())
 }
```