[1/4 for #2365, #2671] Fix create/kill race with schedulers and tasks during rust_kernel::fail

This commit is contained in:
Ben Blum 2012-07-20 18:06:17 -04:00
parent f55999fd7a
commit 5bb4a12900
10 changed files with 69 additions and 32 deletions

View file

@ -19,6 +19,7 @@ rust_kernel::rust_kernel(rust_env *env) :
max_port_id(1),
rval(0),
max_sched_id(1),
killed(false),
sched_reaper(this),
osmain_driver(NULL),
non_weak_tasks(0),
@ -103,7 +104,8 @@ rust_kernel::create_scheduler(rust_sched_launcher_factory *launchfac,
id = max_sched_id++;
assert(id != INTPTR_MAX && "Hit the maximum scheduler id");
sched = new (this, "rust_scheduler")
rust_scheduler(this, num_threads, id, allow_exit, launchfac);
rust_scheduler(this, num_threads, id, allow_exit, killed,
launchfac);
bool is_new = sched_table
.insert(std::pair<rust_sched_id,
rust_scheduler*>(id, sched)).second;
@ -197,6 +199,10 @@ rust_kernel::fail() {
#endif
// Copy the list of schedulers so that we don't hold the lock while
// running kill_all_tasks.
// I think this only needs to be done by one task ever; as it is,
// multiple tasks invoking kill_all might get here. Currently libcore
// ensures only one task will ever invoke it, but this would really be
// fine either way, so I'm leaving it as it is. -- bblum
// FIXME (#2671): There's a lot that happens under kill_all_tasks,
// and I don't know that holding sched_lock here is ok, but we need
// to hold the sched lock to prevent the scheduler from being
@ -205,15 +211,13 @@ rust_kernel::fail() {
std::vector<rust_scheduler*> scheds;
{
scoped_lock with(sched_lock);
killed = true;
for (sched_map::iterator iter = sched_table.begin();
iter != sched_table.end(); iter++) {
scheds.push_back(iter->second);
}
}
// FIXME (#2671): This is not a foolproof way to kill all tasks
// while ensuring that no new tasks or schedulers are created in the
// meantime that keep the scheduler alive.
for (std::vector<rust_scheduler*>::iterator iter = scheds.begin();
iter != scheds.end(); iter++) {
(*iter)->kill_all_tasks();

View file

@ -72,7 +72,7 @@ class rust_kernel {
lock_and_signal rval_lock;
int rval;
// Protects max_sched_id and sched_table, join_list
// Protects max_sched_id and sched_table, join_list, killed
lock_and_signal sched_lock;
// The next scheduler id
rust_sched_id max_sched_id;
@ -81,6 +81,10 @@ class rust_kernel {
sched_map sched_table;
// A list of scheduler ids that are ready to exit
std::vector<rust_sched_id> join_list;
// Whether or not the runtime has to die (triggered when the root/main
// task group fails). This propagates to all new schedulers and tasks
// created after it is set.
bool killed;
rust_sched_reaper sched_reaper;
// The single-threaded scheduler that uses the main thread

View file

@ -4,33 +4,36 @@
const size_t SCHED_STACK_SIZE = 1024*100;
rust_sched_launcher::rust_sched_launcher(rust_scheduler *sched, int id)
rust_sched_launcher::rust_sched_launcher(rust_scheduler *sched, int id,
bool killed)
: kernel(sched->kernel),
sched_loop(sched, id),
sched_loop(sched, id, killed),
driver(&sched_loop) {
}
rust_thread_sched_launcher::rust_thread_sched_launcher(rust_scheduler *sched,
int id)
: rust_sched_launcher(sched, id),
int id, bool killed)
: rust_sched_launcher(sched, id, killed),
rust_thread(SCHED_STACK_SIZE) {
}
rust_manual_sched_launcher::rust_manual_sched_launcher(rust_scheduler *sched,
int id)
: rust_sched_launcher(sched, id) {
int id, bool killed)
: rust_sched_launcher(sched, id, killed) {
}
rust_sched_launcher *
rust_thread_sched_launcher_factory::create(rust_scheduler *sched, int id) {
rust_thread_sched_launcher_factory::create(rust_scheduler *sched, int id,
bool killed) {
return new(sched->kernel, "rust_thread_sched_launcher")
rust_thread_sched_launcher(sched, id);
rust_thread_sched_launcher(sched, id, killed);
}
rust_sched_launcher *
rust_manual_sched_launcher_factory::create(rust_scheduler *sched, int id) {
rust_manual_sched_launcher_factory::create(rust_scheduler *sched, int id,
bool killed) {
assert(launcher == NULL && "I can only track one sched_launcher");
launcher = new(sched->kernel, "rust_manual_sched_launcher")
rust_manual_sched_launcher(sched, id);
rust_manual_sched_launcher(sched, id, killed);
return launcher;
}

View file

@ -17,7 +17,7 @@ protected:
rust_sched_driver driver;
public:
rust_sched_launcher(rust_scheduler *sched, int id);
rust_sched_launcher(rust_scheduler *sched, int id, bool killed);
virtual ~rust_sched_launcher() { }
virtual void start() = 0;
@ -29,7 +29,7 @@ class rust_thread_sched_launcher
:public rust_sched_launcher,
private rust_thread {
public:
rust_thread_sched_launcher(rust_scheduler *sched, int id);
rust_thread_sched_launcher(rust_scheduler *sched, int id, bool killed);
virtual void start() { rust_thread::start(); }
virtual void join() { rust_thread::join(); }
virtual void run() { driver.start_main_loop(); }
@ -37,7 +37,7 @@ public:
class rust_manual_sched_launcher : public rust_sched_launcher {
public:
rust_manual_sched_launcher(rust_scheduler *sched, int id);
rust_manual_sched_launcher(rust_scheduler *sched, int id, bool killed);
virtual void start() { }
virtual void join() { }
rust_sched_driver *get_driver() { return &driver; };
@ -47,13 +47,14 @@ class rust_sched_launcher_factory {
public:
virtual ~rust_sched_launcher_factory() { }
virtual rust_sched_launcher *
create(rust_scheduler *sched, int id) = 0;
create(rust_scheduler *sched, int id, bool killed) = 0;
};
class rust_thread_sched_launcher_factory
: public rust_sched_launcher_factory {
public:
virtual rust_sched_launcher *create(rust_scheduler *sched, int id);
virtual rust_sched_launcher *create(rust_scheduler *sched, int id,
bool killed);
};
class rust_manual_sched_launcher_factory
@ -62,7 +63,8 @@ private:
rust_manual_sched_launcher *launcher;
public:
rust_manual_sched_launcher_factory() : launcher(NULL) { }
virtual rust_sched_launcher *create(rust_scheduler *sched, int id);
virtual rust_sched_launcher *create(rust_scheduler *sched, int id,
bool killed);
rust_sched_driver *get_driver() {
assert(launcher != NULL);
return launcher->get_driver();

View file

@ -13,12 +13,13 @@ const size_t C_STACK_SIZE = 1024*1024;
bool rust_sched_loop::tls_initialized = false;
rust_sched_loop::rust_sched_loop(rust_scheduler *sched,int id) :
rust_sched_loop::rust_sched_loop(rust_scheduler *sched, int id, bool killed) :
_log(this),
id(id),
should_exit(false),
cached_c_stack(NULL),
dead_task(NULL),
killed(killed),
pump_signal(NULL),
kernel(sched->kernel),
sched(sched),
@ -63,6 +64,8 @@ rust_sched_loop::kill_all_tasks() {
{
scoped_lock with(lock);
// Any task created after this will be killed. See transition, below.
killed = true;
for (size_t i = 0; i < running_tasks.length(); i++) {
all_tasks.push_back(running_tasks[i]);
@ -319,6 +322,11 @@ rust_sched_loop::transition(rust_task *task,
}
task->set_state(dst, cond, cond_name);
// If the entire runtime is failing, newborn tasks must be doomed.
if (src == task_state_newborn && killed) {
task->kill_inner();
}
pump_loop();
}

View file

@ -60,6 +60,7 @@ private:
rust_task_list running_tasks;
rust_task_list blocked_tasks;
rust_task *dead_task;
bool killed;
rust_signal *pump_signal;
@ -91,7 +92,7 @@ public:
// Only a pointer to 'name' is kept, so it must live as long as this
// domain.
rust_sched_loop(rust_scheduler *sched, int id);
rust_sched_loop(rust_scheduler *sched, int id, bool killed);
void activate(rust_task *task);
rust_log & get_log();
void fail();
@ -107,6 +108,7 @@ public:
void log_state();
void kill_all_tasks();
bool doomed();
rust_task *create_task(rust_task *spawner, const char *name);

View file

@ -9,6 +9,7 @@ rust_scheduler::rust_scheduler(rust_kernel *kernel,
size_t num_threads,
rust_sched_id id,
bool allow_exit,
bool killed,
rust_sched_launcher_factory *launchfac) :
kernel(kernel),
live_threads(num_threads),
@ -18,7 +19,7 @@ rust_scheduler::rust_scheduler(rust_kernel *kernel,
num_threads(num_threads),
id(id)
{
create_task_threads(launchfac);
create_task_threads(launchfac, killed);
}
rust_scheduler::~rust_scheduler() {
@ -27,8 +28,8 @@ rust_scheduler::~rust_scheduler() {
rust_sched_launcher *
rust_scheduler::create_task_thread(rust_sched_launcher_factory *launchfac,
int id) {
rust_sched_launcher *thread = launchfac->create(this, id);
int id, bool killed) {
rust_sched_launcher *thread = launchfac->create(this, id, killed);
KLOG(kernel, kern, "created task thread: " PTR ", id: %d",
thread, id);
return thread;
@ -41,11 +42,12 @@ rust_scheduler::destroy_task_thread(rust_sched_launcher *thread) {
}
void
rust_scheduler::create_task_threads(rust_sched_launcher_factory *launchfac) {
rust_scheduler::create_task_threads(rust_sched_launcher_factory *launchfac,
bool killed) {
KLOG(kernel, kern, "Using %d scheduler threads.", num_threads);
for(size_t i = 0; i < num_threads; ++i) {
threads.push(create_task_thread(launchfac, i));
threads.push(create_task_thread(launchfac, i, killed));
}
}

View file

@ -34,18 +34,20 @@ private:
rust_sched_id id;
void create_task_threads(rust_sched_launcher_factory *launchfac);
void create_task_threads(rust_sched_launcher_factory *launchfac,
bool killed);
void destroy_task_threads();
rust_sched_launcher *
create_task_thread(rust_sched_launcher_factory *launchfac, int id);
create_task_thread(rust_sched_launcher_factory *launchfac, int id,
bool killed);
void destroy_task_thread(rust_sched_launcher *thread);
void exit();
public:
rust_scheduler(rust_kernel *kernel, size_t num_threads,
rust_sched_id id, bool allow_exit,
rust_sched_id id, bool allow_exit, bool killed,
rust_sched_launcher_factory *launchfac);
~rust_scheduler();

View file

@ -257,8 +257,17 @@ rust_task::yield(bool *killed) {
void
rust_task::kill() {
scoped_lock with(lifecycle_lock);
kill_inner();
}
// XXX: bblum: kill/kill race
void rust_task::kill_inner() {
lifecycle_lock.must_have_lock();
// Multiple kills should be able to safely race, but check anyway.
if (killed) {
LOG(this, task, "task %s @0x%" PRIxPTR " already killed", name, this);
return;
}
// Note the distinction here: kill() is when you're in an upcall
// from task A and want to force-fail task B, you do B->kill().

View file

@ -264,6 +264,7 @@ public:
// Fail this task (assuming caller-on-stack is different task).
void kill();
void kill_inner();
// Indicates that we've been killed and now is an apropriate
// time to fail as a result