rustc: Wait for all codegen threads to exit

This commit updates rustc to wait for all codegen threads to exit before allowing the main thread to exit. This is a stab in the dark to fix the mysterious segfaults appearing on #55238, and hopefully we'll see whether this actually fixes things in practice...
2018-10-31 11:32:27 -07:00 · 2018-10-31 11:32:27 -07:00 · 14c6835e03
commit 14c6835e03
parent 016eaf88f5
3 changed files with 105 additions and 23 deletions
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@ -15,17 +15,6 @@ dependencies = [
 "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

-[[package]]
-name = "alloc_jemalloc"
-version = "0.0.0"
-dependencies = [
- "build_helper 0.1.0",
- "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
- "compiler_builtins 0.0.0",
- "core 0.0.0",
- "libc 0.0.0",
-]
-
 [[package]]
 name = "alloc_system"
 version = "0.0.0"
@ -2696,7 +2685,6 @@ name = "std"
 version = "0.0.0"
 dependencies = [
 "alloc 0.0.0",
- "alloc_jemalloc 0.0.0",
 "alloc_system 0.0.0",
 "build_helper 0.1.0",
 "cc 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/src/librustc_codegen_llvm/back/write.rs
+++ b/src/librustc_codegen_llvm/back/write.rs
@ -1508,6 +1508,7 @@ enum Message {
    },
    CodegenComplete,
    CodegenItem,
+    CodegenAborted,
 }

 struct Diagnostic {
@ -1788,6 +1789,7 @@ fn start_executing_work(tcx: TyCtxt,
        let mut needs_lto = Vec::new();
        let mut lto_import_only_modules = Vec::new();
        let mut started_lto = false;
+        let mut codegen_aborted = false;

        // This flag tracks whether all items have gone through codegens
        let mut codegen_done = false;
@ -1805,13 +1807,19 @@ fn start_executing_work(tcx: TyCtxt,
        let mut llvm_start_time = None;

        // Run the message loop while there's still anything that needs message
-        // processing:
+        // processing. Note that as soon as codegen is aborted we simply want to
+        // wait for all existing work to finish, so many of the conditions here
+        // only apply if codegen hasn't been aborted as they represent pending
+        // work to be done.
        while !codegen_done ||
-              work_items.len() > 0 ||
              running > 0 ||
-              needs_lto.len() > 0 ||
-              lto_import_only_modules.len() > 0 ||
-              main_thread_worker_state != MainThreadWorkerState::Idle {
+              (!codegen_aborted && (
+                  work_items.len() > 0 ||
+                  needs_lto.len() > 0 ||
+                  lto_import_only_modules.len() > 0 ||
+                  main_thread_worker_state != MainThreadWorkerState::Idle
+              ))
+        {

            // While there are still CGUs to be codegened, the coordinator has
            // to decide how to utilize the compiler processes implicit Token:
@ -1840,6 +1848,9 @@ fn start_executing_work(tcx: TyCtxt,
                        spawn_work(cgcx, item);
                    }
                }
+            } else if codegen_aborted {
+                // don't queue up any more work if codegen was aborted, we're
+                // just waiting for our existing children to finish
            } else {
                // If we've finished everything related to normal codegen
                // then it must be the case that we've got some LTO work to do.
@ -1904,7 +1915,7 @@ fn start_executing_work(tcx: TyCtxt,

            // Spin up what work we can, only doing this while we've got available
            // parallelism slots and work left to spawn.
-            while work_items.len() > 0 && running < tokens.len() {
+            while !codegen_aborted && work_items.len() > 0 && running < tokens.len() {
                let (item, _) = work_items.pop().unwrap();

                maybe_start_llvm_timer(cgcx.config(item.module_kind()),
@ -1969,6 +1980,7 @@ fn start_executing_work(tcx: TyCtxt,
                    if !cgcx.opts.debugging_opts.no_parallel_llvm {
                        helper.request_token();
                    }
+                    assert!(!codegen_aborted);
                    assert_eq!(main_thread_worker_state,
                               MainThreadWorkerState::Codegenning);
                    main_thread_worker_state = MainThreadWorkerState::Idle;
@ -1976,11 +1988,26 @@ fn start_executing_work(tcx: TyCtxt,

                Message::CodegenComplete => {
                    codegen_done = true;
+                    assert!(!codegen_aborted);
                    assert_eq!(main_thread_worker_state,
                               MainThreadWorkerState::Codegenning);
                    main_thread_worker_state = MainThreadWorkerState::Idle;
                }

+                // If codegen is aborted that means translation was aborted due
+                // to some normal-ish compiler error. In this situation we want
+                // to exit as soon as possible, but we want to make sure all
+                // existing work has finished. Flag codegen as being done, and
+                // then conditions above will ensure no more work is spawned but
+                // we'll keep executing this loop until `running` hits 0.
+                Message::CodegenAborted => {
+                    assert!(!codegen_aborted);
+                    codegen_done = true;
+                    codegen_aborted = true;
+                    assert_eq!(main_thread_worker_state,
+                               MainThreadWorkerState::Codegenning);
+                }
+
                // If a thread exits successfully then we drop a token associated
                // with that worker and update our `running` count. We may later
                // re-acquire a token to continue running more work. We may also not
@ -2446,6 +2473,19 @@ impl OngoingCodegen {
        drop(self.coordinator_send.send(Box::new(Message::CodegenComplete)));
    }

+    /// Consume this context indicating that codegen was entirely aborted, and
+    /// we need to exit as quickly as possible.
+    ///
+    /// This method blocks the current thread until all worker threads have
+    /// finished, and all worker threads should have exited or be real close to
+    /// exiting at this point.
+    pub fn codegen_aborted(self) {
+        // Signal to the coordinator it should spawn no more work and start
+        // shutdown.
+        drop(self.coordinator_send.send(Box::new(Message::CodegenAborted)));
+        drop(self.future.join());
+    }
+
    pub fn check_for_errors(&self, sess: &Session) {
        self.shared_emitter_main.check(sess, false);
    }
@ -2464,6 +2504,11 @@ impl OngoingCodegen {
    }
 }

+// impl Drop for OngoingCodegen {
+//     fn drop(&mut self) {
+//     }
+// }
+
 pub(crate) fn submit_codegened_module_to_llvm(tcx: TyCtxt,
                                              module: ModuleCodegen,
                                              cost: u64) {
--- a/src/librustc_codegen_llvm/base.rs
+++ b/src/librustc_codegen_llvm/base.rs
@ -76,12 +76,13 @@ use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_data_structures::sync::Lrc;

 use std::any::Any;
-use std::ffi::CString;
-use std::sync::Arc;
-use std::time::{Instant, Duration};
-use std::i32;
 use std::cmp;
+use std::ffi::CString;
+use std::i32;
+use std::ops::{Deref, DerefMut};
+use std::sync::Arc;
 use std::sync::mpsc;
+use std::time::{Instant, Duration};
 use syntax_pos::Span;
 use syntax_pos::symbol::InternedString;
 use syntax::attr;
@ -820,6 +821,7 @@ pub fn codegen_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
        metadata,
        rx,
        codegen_units.len());
+    let ongoing_codegen = AbortCodegenOnDrop(Some(ongoing_codegen));

    // Codegen an allocator shim, if necessary.
    //
@ -949,7 +951,54 @@ pub fn codegen_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
    ongoing_codegen.check_for_errors(tcx.sess);

    assert_and_save_dep_graph(tcx);
-    ongoing_codegen
+    ongoing_codegen.into_inner()
+}
+
+/// A curious wrapper structure whose only purpose is to call `codegen_aborted`
+/// when it's dropped abnormally.
+///
+/// In the process of working on rust-lang/rust#55238 a mysterious segfault was
+/// stumbled upon. The segfault was never reproduced locally, but it was
+/// suspected to be releated to the fact that codegen worker threads were
+/// sticking around by the time the main thread was exiting, causing issues.
+///
+/// This structure is an attempt to fix that issue where the `codegen_aborted`
+/// message will block until all workers have finished. This should ensure that
+/// even if the main codegen thread panics we'll wait for pending work to
+/// complete before returning from the main thread, hopefully avoiding
+/// segfaults.
+///
+/// If you see this comment in the code, then it means that this workaround
+/// worked! We may yet one day track down the mysterious cause of that
+/// segfault...
+struct AbortCodegenOnDrop(Option<OngoingCodegen>);
+
+impl AbortCodegenOnDrop {
+    fn into_inner(mut self) -> OngoingCodegen {
+        self.0.take().unwrap()
+    }
+}
+
+impl Deref for AbortCodegenOnDrop {
+    type Target = OngoingCodegen;
+
+    fn deref(&self) -> &OngoingCodegen {
+        self.0.as_ref().unwrap()
+    }
+}
+
+impl DerefMut for AbortCodegenOnDrop {
+    fn deref_mut(&mut self) -> &mut OngoingCodegen {
+        self.0.as_mut().unwrap()
+    }
+}
+
+impl Drop for AbortCodegenOnDrop {
+    fn drop(&mut self) {
+        if let Some(codegen) = self.0.take() {
+            codegen.codegen_aborted();
+        }
+    }
 }

 fn assert_and_save_dep_graph<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>) {