New scheduler and interruptible blocking.

A new scheduler replaces the old one.
- There are no sched_xxx_notify() calls that ask the scheduler to change task state.
- Tasks now have priorities and different timeslices.
- A one-second interval is distributed among processes.
- There are just runnable and expired queues.
- SCHED_GRANULARITY determines a maximum running boundary for tasks.
- The scheduler can now detect a safe point and suspend a task.

Interruptible blocking is implemented.
- Mutexes, waitqueues and ipc are modified to have an interruptible nature.
- Sleep information is stored on the ktcb. (which waitqueue? etc.)
2026-07-18 21:25:24 +02:00 · 2008-10-01 12:43:44 +03:00
parent c54d505709
commit f6d0a79298
21 changed files with 681 additions and 429 deletions
--- a/src/api/ipc.c
+++ b/src/api/ipc.c
@@ -41,7 +41,7 @@ int ipc_msg_copy(struct ktcb *to, struct ktcb *from)
 	memcpy(mr0_dst, mr0_src, MR_TOTAL * sizeof(unsigned int));

 	/* Save the sender id in case of ANYTHREAD receiver */
-	if (to->senderid == L4_ANYTHREAD)
+	if (to->expected_sender == L4_ANYTHREAD)
 		mr0_dst[MR_SENDER] = from->tid;

 	return 0;
@@ -52,106 +52,134 @@ int sys_ipc_control(syscall_context_t *regs)
 	return -ENOSYS;
 }

+/*
+ * Why can we safely copy registers and resume task
+ * after we release the locks? Because even if someone
+ * tried to interrupt and wake up the other party, they
+ * won't be able to, because all of the task's hooks to its
+ * waitqueue have been removed at that stage.
+ */
+
+/* Interruptible ipc */
 int ipc_send(l4id_t recv_tid)
 {
 	struct ktcb *receiver = find_task(recv_tid);
 	struct waitqueue_head *wqhs, *wqhr;

-	if (!receiver) {
-		printk("%s: tid: %d, no such task.\n", __FUNCTION__,
-		       recv_tid);
-		return -EINVAL;
-	}
 	wqhs = &receiver->wqh_send;
 	wqhr = &receiver->wqh_recv;

 	spin_lock(&wqhs->slock);
 	spin_lock(&wqhr->slock);

-	/* Is my receiver waiting? */
-	if (wqhr->sleepers > 0) {
-		struct waitqueue *wq, *n;
-		struct ktcb *sleeper;
+	/* Ready to receive and expecting us? */
+	if (receiver->state == TASK_SLEEPING &&
+	    receiver->waiting_on == wqhr &&
+	    (receiver->expected_sender == current->tid ||
+	     receiver->expected_sender == L4_ANYTHREAD)) {
+		struct waitqueue *wq = receiver->wq;

-		list_for_each_entry_safe(wq, n, &wqhr->task_list, task_list) {
-			sleeper = wq->task;
-			/* Found the receiver. Does it sleep for this sender? */
-			BUG_ON(sleeper->tid != recv_tid);
-			if ((sleeper->senderid == current->tid) ||
-			    (sleeper->senderid == L4_ANYTHREAD)) {
-				list_del_init(&wq->task_list);
-				spin_unlock(&wqhr->slock);
-				spin_unlock(&wqhs->slock);
+		/* Remove from waitqueue */
+		list_del_init(&wq->task_list);
+		wqhr->sleepers--;

-				/* Do the work */
-				ipc_msg_copy(sleeper, current);
-				//printk("%s: (%d) Waking up (%d)\n", __FUNCTION__,
-				//       current->tid, sleeper->tid);
+		/* Release locks */
+		spin_unlock(&wqhr->slock);
+		spin_unlock(&wqhs->slock);

-				/* Wake it up, we can yield here. */
-				sched_resume_task(sleeper);
-				return 0;
-			}
-		}
+		/* Copy message registers */
+		ipc_msg_copy(receiver, current);
+
+		// printk("%s: (%d) Waking up (%d)\n", __FUNCTION__,
+		//       current->tid, receiver->tid);
+
+		/* Wake it up, we can yield here. */
+		sched_resume_sync(receiver);
+		return 0;
 	}
-	/* Could not find a receiver that's waiting */
-	DECLARE_WAITQUEUE(wq, current);
+
+	/* The receiver is not ready and/or not expecting us */
+	CREATE_WAITQUEUE_ON_STACK(wq, current);
 	wqhs->sleepers++;
 	list_add_tail(&wq.task_list, &wqhs->task_list);
-	sched_notify_sleep(current);
-	need_resched = 1;
-	// printk("%s: (%d) waiting for (%d)\n", __FUNCTION__, current->tid, recv_tid);
+	task_set_wqh(current, wqhs, &wq);
+	current->state = TASK_SLEEPING;
 	spin_unlock(&wqhr->slock);
 	spin_unlock(&wqhs->slock);
+	// printk("%s: (%d) waiting for (%d)\n", __FUNCTION__,
+	//       current->tid, recv_tid);
+	schedule();
+
+	/* Did we wake up normally or get interrupted */
+	if (current->flags & TASK_INTERRUPTED) {
+		current->flags &= ~TASK_INTERRUPTED;
+		return -EINTR;
+	}
 	return 0;
 }

 int ipc_recv(l4id_t senderid)
 {
-	struct waitqueue_head *wqhs = &current->wqh_send;
-	struct waitqueue_head *wqhr = &current->wqh_recv;
+	struct waitqueue_head *wqhs, *wqhr;

-	/* Specify who to receiver from, so senders know. */
-	current->senderid = senderid;
+	wqhs = &current->wqh_send;
+	wqhr = &current->wqh_recv;
+
+	/*
+	 * Indicate who we expect to receive from,
+	 * so senders know.
+	 */
+	current->expected_sender = senderid;

 	spin_lock(&wqhs->slock);
 	spin_lock(&wqhr->slock);

-	/* Is my sender waiting? */
+	/* Are there senders? */
 	if (wqhs->sleepers > 0) {
 		struct waitqueue *wq, *n;
 		struct ktcb *sleeper;

+		BUG_ON(list_empty(&wqhs->task_list));
+
+		/* Look for a sender we want to receive from */
 		list_for_each_entry_safe(wq, n, &wqhs->task_list, task_list) {
 			sleeper = wq->task;
-			/* Found a sender */
-			if ((sleeper->tid == current->senderid) ||
-			    (current->senderid == L4_ANYTHREAD)) {
+
+			/* Found a sender that we wanted to receive from */
+			if ((sleeper->tid == current->expected_sender) ||
+			    (current->expected_sender == L4_ANYTHREAD)) {
 				list_del_init(&wq->task_list);
+				wqhs->sleepers--;
+				task_unset_wqh(sleeper);
 				spin_unlock(&wqhr->slock);
 				spin_unlock(&wqhs->slock);
-
-				/* Do the work */
 				ipc_msg_copy(current, sleeper);
+
 				// printk("%s: (%d) Waking up (%d)\n", __FUNCTION__,
 				//       current->tid, sleeper->tid);
-
-				/* Wake it up */
-				sched_resume_task(sleeper);
+				sched_resume_sync(sleeper);
 				return 0;
-
 			}
 		}
 	}
-	/* Could not find a sender that's waiting */
-	DECLARE_WAITQUEUE(wq, current);
+
+	/* The sender is not ready */
+	CREATE_WAITQUEUE_ON_STACK(wq, current);
 	wqhr->sleepers++;
 	list_add_tail(&wq.task_list, &wqhr->task_list);
-	sched_notify_sleep(current);
-	need_resched = 1;
-	// printk("%s: (%d) waiting for (%d) \n", __FUNCTION__, current->tid, current->senderid);
+	task_set_wqh(current, wqhr, &wq);
+	current->state = TASK_SLEEPING;
+	// printk("%s: (%d) waiting for (%d)\n", __FUNCTION__,
+	//       current->tid, current->expected_sender);
 	spin_unlock(&wqhr->slock);
 	spin_unlock(&wqhs->slock);
+	schedule();
+
+	/* Did we wake up normally or get interrupted */
+	if (current->flags & TASK_INTERRUPTED) {
+		current->flags &= ~TASK_INTERRUPTED;
+		return -EINTR;
+	}
 	return 0;
 }

--- a/src/api/thread.c
+++ b/src/api/thread.c
@@ -16,25 +16,12 @@

 int sys_thread_switch(syscall_context_t *regs)
 {
-	sched_yield();
+	schedule();
 	return 0;
 }

 int thread_suspend(struct task_ids *ids)
 {
-	struct ktcb *task;
-
-	if (!(task = find_task(ids->tid)))
-		return -ESRCH;
-
-	/*
-	 * The thread_control_lock is protecting from
-	 * indirect modification of thread context, this
-	 * does not cause any such operation so we don't
-	 * need to acquire that lock here.
-	 */
-	sched_suspend_task(task);
-
 	return 0;
 }

@@ -48,14 +35,15 @@ int thread_resume(struct task_ids *ids)
 	if (!mutex_trylock(&task->thread_control_lock))
 		return -EAGAIN;

-	/* Notify scheduler of task resume */
-	sched_notify_resume(task);
+	/* Put task into runqueue as runnable */
+	sched_resume_async(task);

 	/* Release lock and return */
 	mutex_unlock(&task->thread_control_lock);
 	return 0;
 }

+/* Runs a thread for the first time */
 int thread_start(struct task_ids *ids)
 {
 	struct ktcb *task;
@@ -67,7 +55,7 @@ int thread_start(struct task_ids *ids)
 		return -EAGAIN;

 	/* Notify scheduler of task resume */
-	sched_notify_resume(task);
+	sched_resume_async(task);

 	/* Release lock and return */
 	mutex_unlock(&task->thread_control_lock);
@@ -264,7 +252,7 @@ out:
 	thread_setup_new_ids(ids, flags, new, task);

 	/* Initialise task's scheduling state and parameters. */
-	sched_init_task(new);
+	sched_init_task(new, TASK_PRIO_NORMAL);

 	/* Initialise ipc waitqueues */
 	waitqueue_head_init(&new->wqh_send);
@@ -302,7 +290,6 @@ int sys_thread_control(syscall_context_t *regs)
 	case THREAD_RESUME:
 		ret = thread_resume(ids);
 		break;
-	/* TODO: Add THREAD_DESTROY! */
 	default:
 		ret = -EINVAL;
 	}
--- a/src/arch/arm/exception.c
+++ b/src/arch/arm/exception.c
@@ -212,18 +212,27 @@ error:
 		;
 }

-void prefetch_abort_handler(u32 faulted_pc, u32 fsr, u32 far)
+void prefetch_abort_handler(u32 faulted_pc, u32 fsr, u32 far, u32 lr)
 {
 	set_abort_type(fsr, ARM_PABT);
 	if (check_aborts(faulted_pc, fsr, far) < 0) {
 		printascii("This abort can't be handled by any pager.\n");
 		goto error;
 	}
+
+	if (KERN_ADDR(lr))
+		goto error;
 	fault_ipc_to_pager(faulted_pc, fsr, far);
 	return;

 error:
 	disable_irqs();
+	dprintk("Unhandled prefetch abort @ address: ", faulted_pc);
+	dprintk("FAR:", far);
+	dprintk("FSR:", fsr);
+	dprintk("LR:", lr);
+	printascii("Kernel panic.\n");
+	printascii("Halting system...\n");
 	while (1)
 		;
 }
--- a/src/arch/arm/v5/mm.c
+++ b/src/arch/arm/v5/mm.c
@@ -530,3 +530,12 @@ void copy_pgds_by_vrange(pgd_table_t *to, pgd_table_t *from,
 	       irange * sizeof(pgd_t));
 }

+/* Scheduler uses this to switch context */
+void arch_hardware_flush(pgd_table_t *pgd)
+{
+	arm_clean_invalidate_cache();
+	arm_invalidate_tlb();
+	arm_set_ttb(virt_to_phys(pgd));
+	arm_invalidate_tlb();
+}
+
--- a/src/arch/arm/vectors.S
+++ b/src/arch/arm/vectors.S
@@ -207,6 +207,12 @@ END_PROC(arm_swi_exception)
 		sub	\sp, \sp, #8	@ Adjust SP, since stack op on banked regs is no writeback.
 		@ stack state: (Low) |->SP_USR|LR_USR|(Original)| (High)
 	.endm
+
+	.macro is_psr_usr rx
+	and	\rx, \rx, #ARM_MODE_MASK
+	cmp	\rx, #ARM_MODE_USR
+	.endm
+
 /*
 * vect_pabt
 *
@@ -264,6 +270,11 @@ read_pabt_state:
 	bne	1f			@ Branch here based on previous irq judgement.
 	enable_irqs r3
 1:
+	/* Now check in what mode abort occurred, and return that mode's LR in R3 */
+	ldr	r0, [sp, #28]		@ Load PABT_SPSR
+	is_psr_usr r0			@ Test if PABT_SPSR was user mode.
+	ldrne	r3, [sp, #32]		@ Abort occurred in kernel, load LR_SVC
+	ldreq	r3, [sp, #4]		@ Abort occurred in user, load LR_USR
 	ldr	r0, [sp, #36]		@ Load LR_PABT saved previously.
 	mov	lr, pc
 	ldr	pc, =prefetch_abort_handler @ Jump to function outside this page.
@@ -448,6 +459,11 @@ preempted_psr:
 current_irq_nest_count:
 .word	0

+/*
+ * FIXME: current_irq_nest_count also counts for any preempt_disable() calls.
+ * However this nesting check assumes all nests come from real irqs.
+ * We should make this check just the real ones.
+ */
 #define IRQ_NESTING_MAX			15
 	.macro	inc_irq_cnt_with_overnest_check rx, ry
 	ldr	\rx, =current_irq_nest_count	@ Load the irq nest status word.
@@ -480,10 +496,6 @@ current_irq_nest_count:
 	ldreq	\rx, =preempted_psr
 	streq	\process_psr, [\rx]
 	.endm
-	.macro is_psr_usr rx
-	and	\rx, \rx, #ARM_MODE_MASK
-	cmp	\rx, #ARM_MODE_USR
-	.endm

 #define	CONTEXT_PSR		0
 #define CONTEXT_R0		4
@@ -584,7 +596,10 @@ save_usr_context:
 	str	r1, [r0, #CONTEXT_R0]
 	@ stack state: (Low) |..|..|..|..|..|..|..|..|->(Original)| (High)
 prepare_schedule:
+	mov lr, pc
 	ldr pc, =schedule
+1:
+	b	1b	/* To catch if schedule returns in irq mode */
 END_PROC(arm_irq_exception_reentrant_with_schedule)

 /*
@@ -612,7 +627,7 @@ END_PROC(arm_irq_exception_reentrant_with_schedule)
 *   Furthermore, irqs are also disabled shortly before calling switch_to() from both contexts.
 *   This happens at points where stack state would be irrecoverable if an irq occured.
 */
-BEGIN_PROC(switch_to)
+BEGIN_PROC(arch_switch)
 	in_process_context r2		@ Note this depends on preempt count being 0.
 	beq	save_process_context	@ Voluntary switch needs explicit saving of current state.
 	dec_irq_nest_cnt r2, r3		@ Soon leaving irq context, so reduce preempt count here.
@@ -639,7 +654,7 @@ load_next_context_usr:
 load_next_context_svc:
 	ldmib	sp, {r0-r15}^		@ Switch to svc context and jump, loading R13 and R14 from stack.
 					@ This is OK since the jump is to current context.
-END_PROC(switch_to)
+END_PROC(arch_switch)


 /*
--- a/src/generic/pgalloc.c
+++ b/src/generic/pgalloc.c
@@ -12,6 +12,12 @@
 #include <l4/generic/physmem.h>
 #include INC_GLUE(memory.h)

+/* FIXME:
+ *
+ * mem_cache_alloc() now has an interruptible mutex.
+ * All routines defined here should check returned errors.
+ */
+
 #define	PGALLOC_PGD_CACHE		0
 #define PGALLOC_PMD_CACHE		1
 #define PGALLOC_PG_CACHE		2
--- a/src/generic/scheduler.c
+++ b/src/generic/scheduler.c
@@ -1,12 +1,13 @@
 /*
- * A basic scheduler that does the job for now.
+ * A basic priority-based scheduler.
 *
- * Copyright (C) 2007 Bahadir Balban
+ * Copyright (C) 2007, 2008 Bahadir Balban
 */
 #include <l4/lib/list.h>
 #include <l4/lib/printk.h>
 #include <l4/lib/string.h>
 #include <l4/lib/mutex.h>
+#include <l4/lib/math.h>
 #include <l4/lib/bit.h>
 #include <l4/lib/spinlock.h>
 #include <l4/generic/scheduler.h>
@@ -21,16 +22,19 @@
 #include INC_PLAT(platform.h)
 #include INC_ARCH(exception.h)

-/* A very basic runqueue */
+
+/* A basic runqueue */
 struct runqueue {
-	struct spinlock lock;
-	struct list_head task_list;
-	unsigned int total;
+	struct spinlock lock;		/* Lock */
+	struct list_head task_list;	/* List of tasks in rq */
+	unsigned int total;		/* Total tasks */
+	int recalc_timeslice;		/* Need timeslice redistribution */
 };

-static struct runqueue sched_rq[3];
-static struct runqueue *rq_runnable, *rq_expired, *rq_pending;
-
+#define SCHED_RQ_TOTAL					2
+static struct runqueue sched_rq[SCHED_RQ_TOTAL];
+static struct runqueue *rq_runnable, *rq_expired;
+static int prio_total;			/* Total priority of all tasks */

 /* This is incremented on each irq or voluntarily by preempt_disable() */
 extern unsigned int current_irq_nest_count;
@@ -52,16 +56,6 @@ void preempt_enable(void)
 {
 	voluntary_preempt--;
 	current_irq_nest_count--;
-
-	/*
-	 * Even if count increases after we check it, it will come back to zero.
-	 * This test really is asking "is this the outmost explicit
-	 * preempt_enable() that will really enable context switching?"
-	 */
-	if (current_irq_nest_count == 0) {
-		/* Then, give scheduler a chance to check need_resched == 1 */
-		schedule();
-	}
 }

 /* A positive irq nest count implies current context cannot be preempted. */
@@ -71,9 +65,30 @@ void preempt_disable(void)
 	voluntary_preempt++;
 }

-void sched_runqueue_init(void)
+int in_irq_context(void)
 {
-	for (int i = 0; i < 3; i++) {
+	/*
+	 * If there was a real irq, irq nest count must be
+	 * one more than all preempt_disable()'s which are
+	 * counted by voluntary_preempt.
+	 */
+	return (current_irq_nest_count == (voluntary_preempt + 1));
+}
+
+int in_nested_irq_context(void)
+{
+	/* Deducting voluntary preemptions, we get the real irq nesting */
+	return (current_irq_nest_count - voluntary_preempt) > 1;
+}
+
+int in_task_context(void)
+{
+	return !in_irq_context();
+}
+
+void sched_init_runqueues(void)
+{
+	for (int i = 0; i < SCHED_RQ_TOTAL; i++) {
 		memset(&sched_rq[i], 0, sizeof(struct runqueue));
 		INIT_LIST_HEAD(&sched_rq[i].task_list);
 		spin_lock_init(&sched_rq[i].lock);
@@ -81,203 +96,93 @@ void sched_runqueue_init(void)

 	rq_runnable = &sched_rq[0];
 	rq_expired = &sched_rq[1];
-	rq_pending = &sched_rq[2];
+	prio_total = 0;
 }

-/* Lock scheduler. Should only be used when scheduling. */
-static inline void sched_lock(void)
-{
-	preempt_disable();
-}
-
-/* Sched unlock */
-static inline void sched_unlock(void)
-{
-	/*
-	 * This is to make sure preempt_enable() does not
-	 * try to schedule since we're already scheduling.
-	 */
-	need_resched = 0;
-	preempt_enable();
-}
-
-/* Swaps runnable and expired queues *if* runnable queue is empty. */
-static void sched_rq_swap_expired_runnable(void)
+/* Swap runnable and expired runqueues. */
+static void sched_rq_swap_runqueues(void)
 {
 	struct runqueue *temp;

-	if (list_empty(&rq_runnable->task_list) &&
-	    !list_empty(&rq_expired->task_list)) {
+	BUG_ON(list_empty(&rq_expired->task_list));
+	BUG_ON(rq_expired->total == 0);

-		/* Queues are swapped and expired list becomes runnable */
-		temp = rq_runnable;
-		rq_runnable = rq_expired;
-		rq_expired = temp;
-	}
+	/* Queues are swapped and expired list becomes runnable */
+	temp = rq_runnable;
+	rq_runnable = rq_expired;
+	rq_expired = temp;
 }

+/* FIXME:
+ * Sleepers should not affect runqueue priority.
+ * Suspended tasks should affect runqueue priority.
+ *
+ * Also make sure that if sleepers get suspended,
+ * they do affect runqueue priority.
+ */
+
+/* Set policy on where to add tasks in the runqueue */
+#define RQ_ADD_BEHIND		0
+#define RQ_ADD_FRONT		1
+
 /* Helper for adding a new task to a runqueue */
 static void sched_rq_add_task(struct ktcb *task, struct runqueue *rq, int front)
 {
-	BUG_ON(task->rq);
-
-	/*
-	 * If the task is sinfully in a runqueue, this may still keep silent
-	 * upon a racing condition, since its rq can't be locked in advance.
-	 */
 	BUG_ON(!list_empty(&task->rq_list));

+	spin_lock(&rq->lock);
 	if (front)
 		list_add(&task->rq_list, &rq->task_list);
 	else
 		list_add_tail(&task->rq_list, &rq->task_list);
 	rq->total++;
-	task->rq = rq;
-}
-
-static inline void
-sched_rq_add_task_front(struct ktcb *task, struct runqueue *rq)
-{
-	sched_rq_add_task(task, rq, 1);
-}
-
-static inline void
-sched_rq_add_task_behind(struct ktcb *task, struct runqueue *rq)
-{
-	sched_rq_add_task(task, rq, 0);
+	spin_unlock(&rq->lock);
 }

 /* Helper for removing a task from its runqueue. */
-static inline void sched_rq_remove_task(struct ktcb *task)
+static inline void sched_rq_remove_task(struct ktcb *task, struct runqueue *rq)
 {
+	spin_lock(&rq->lock);
 	list_del_init(&task->rq_list);
-	task->rq->total--;
-	task->rq = 0;
+	rq->total--;
+
+	BUG_ON(rq->total < 0);
+	spin_unlock(&rq->lock);
 }

-void sched_init_task(struct ktcb *task)
+
+void sched_init_task(struct ktcb *task, int prio)
 {
 	INIT_LIST_HEAD(&task->rq_list);
-	task->ticks_left = TASK_TIMESLICE_DEFAULT;
+	task->priority = prio;
+	task->ticks_left = 0;
 	task->state = TASK_INACTIVE;
 	task->ts_need_resched = 0;
+	task->flags |= TASK_RESUMING;
 }

-void sched_tell(struct ktcb *task, unsigned int fl)
+/* Synchronously resumes a task */
+void sched_resume_sync(struct ktcb *task)
 {
-	BUG_ON(!(SCHED_FL_MASK & fl));
-	/* The last flag overrrides all existing flags. */
-	task->schedfl = fl;
-}
+	task->state = TASK_RUNNABLE;

-void sched_yield()
-{
-	need_resched = 1;
+	sched_rq_add_task(task, rq_runnable, RQ_ADD_FRONT);
 	schedule();
 }

 /*
- * Any task that wants the scheduler's attention and not in its any one of
- * its currently runnable realms, would call this. E.g. dormant tasks
- * sleeping tasks, newly created tasks. But not currently runnable tasks.
+ * Asynchronously resumes a task.
+ * The task will run in the future, but at
+ * the scheduler's discretion.
 */
-void sched_add_pending_task(struct ktcb *task)
+void sched_resume_async(struct ktcb *task)
 {
-	BUG_ON(task->rq);
-	spin_lock(&rq_pending->lock);
-	sched_rq_add_task_behind(task, rq_pending);
-	spin_unlock(&rq_pending->lock);
+	task->state = TASK_RUNNABLE;
+
+	sched_rq_add_task(task, rq_runnable, RQ_ADD_FRONT);
 }

-/* Tells scheduler to remove given runnable task from runqueues */
-void sched_notify_sleep(struct ktcb *task)
-{
-	sched_tell(task, SCHED_FL_SLEEP);
-}
-
-void sched_sleep_task(struct ktcb *task)
-{
-	sched_notify_sleep(task);
-	if (task == current)
-		sched_yield();
-}
-
-/* Tells scheduler to remove given runnable task from runqueues */
-void sched_notify_suspend(struct ktcb *task)
-{
-	sched_tell(task, SCHED_FL_SUSPEND);
-}
-
-void sched_suspend_task(struct ktcb *task)
-{
-	sched_notify_suspend(task);
-	if (task == current)
-		sched_yield();
-}
-
-/* Tells scheduler to add given task into runqueues whenever possible */
-void sched_notify_resume(struct ktcb *task)
-{
-	BUG_ON(current == task);
-	sched_tell(task, SCHED_FL_RESUME);
-	sched_add_pending_task(task);
-}
-
-/* NOTE: Might as well just set need_resched instead of full yield.
- * This would work on irq context as well. */
-/* Same as resume, but also yields. */
-void sched_resume_task(struct ktcb *task)
-{
-	sched_notify_resume(task);
-	sched_yield();
-}
-
-void sched_start_task(struct ktcb *task)
-{
-	sched_init_task(task);
-	sched_resume_task(task);
-}
-
-/*
- * Checks currently pending scheduling flags on the task and does two things:
- * 1) Modify their state.
- * 2) Modify their runqueues.
- *
- * An inactive/sleeping task that is pending-runnable would change state here.
- * A runnable task that is pending-inactive would also change state here.
- * Returns 1 if it has changed anything, e.g. task state, runqueues, and
- * 0 otherwise.
- */
-static int sched_next_state(struct ktcb *task)
-{
-	unsigned int flags = task->schedfl;
-	int ret = 0;
-
-	switch(flags) {
-	case 0:
-		ret = 0;
-		break;
-	case SCHED_FL_SUSPEND:
-		task->state = TASK_INACTIVE;
-		ret = 1;
-		break;
-	case SCHED_FL_RESUME:
-		task->state = TASK_RUNNABLE;
-		ret = 1;
-		break;
-	case SCHED_FL_SLEEP:
-		task->state = TASK_SLEEPING;
-		ret = 1;
-		break;
-	default:
-		BUG();
-	}
-	task->schedfl = 0;
-	return ret;
-}
-
-
-extern void switch_to(struct ktcb *cur, struct ktcb *next);
+extern void arch_switch(struct ktcb *cur, struct ktcb *next);

 static inline void context_switch(struct ktcb *next)
 {
@@ -286,84 +191,179 @@ static inline void context_switch(struct ktcb *next)
 	// printk("(%d) to (%d)\n", cur->tid, next->tid);

 	/* Flush caches and everything */
-	arm_clean_invalidate_cache();
-	arm_invalidate_tlb();
-	arm_set_ttb(virt_to_phys(next->pgd));
-	arm_invalidate_tlb();
-	switch_to(cur, next);
+	arch_hardware_flush(next->pgd);
+
+	/* Switch context */
+	arch_switch(cur, next);
+
 	// printk("Returning from yield. Tid: (%d)\n", cur->tid);
 }

-void scheduler()
+/*
+ * Priority calculation is so simple it is inlined. The task gets
+ * the ratio of its priority to total priority of all runnable tasks.
+ */
+static inline int sched_recalc_ticks(struct ktcb *task, int prio_total)
 {
-	struct ktcb *next = 0, *pending = 0, *n = 0;
+	return task->ticks_assigned =
+		SCHED_TICKS * task->priority / prio_total;
+}

-	sched_lock();
+/*
+ * Tasks come here, either by setting need_resched (via next irq),
+ * or by directly calling it (in process context).
+ *
+ * The scheduler is similar to Linux's so called O(1) scheduler,
+ * although a lot simpler. Task priorities determine task timeslices.
+ * Each task gets a ratio of its priority to the total priority of
+ * all runnable tasks. When this total changes, (e.g. threads die or
+ * are created, or a thread's priority is changed) the timeslices are
+ * recalculated on a per-task basis as each thread becomes runnable.
+ * Once all runnable tasks expire, runqueues are swapped. Sleeping
+ * tasks are removed from the runnable queue, and added back later
+ * without affecting the timeslices. Suspended tasks however,
+ * necessitate a timeslice recalculation as they are considered to go
+ * inactive indefinitely or for a very long time. They are put back
+ * to the expired queue if they want to run again.
+ *
+ * A task is rescheduled either when it hits a SCHED_GRANULARITY
+ * boundary, or when its timeslice has expired. SCHED_GRANULARITY
+ * ensures context switches do occur at a maximum boundary even if a
+ * task's timeslice is very long. In the future, real-time tasks will
+ * be added, and they will be able to ignore SCHED_GRANULARITY.
+ *
+ * In the future, the tasks will be sorted by priority in their
+ * runqueue, as well as having an adjusted timeslice.
+ *
+ * Runqueues are swapped at a single second's interval. This implies
+ * the timeslice recalculations would also occur at this interval.
+ */
+void schedule()
+{
+	struct ktcb *next;
+
+	/* Should not schedule with preemption disabled */
+	BUG_ON(voluntary_preempt);
+
+	/* Should not have more ticks than SCHED_TICKS */
+	BUG_ON(current->ticks_left > SCHED_TICKS);
+
+	/* Cannot have any irqs that schedule after this */
+	preempt_disable();
+
+	/* NOTE:
+	 * We could avoid double-scheduling by detecting a task
+	 * that's about to schedule voluntarily and skipping the
+	 * schedule() call in irq mode.
+	 */
+
+	/* Reset schedule flag */
 	need_resched = 0;
-	BUG_ON(current->rq != rq_runnable);

-	/* Current task */
-	sched_rq_remove_task(current);
-	sched_next_state(current);
+	/* Remove from runnable queue */
+	sched_rq_remove_task(current, rq_runnable);

+	/* Put it into appropriate runqueue */
 	if (current->state == TASK_RUNNABLE) {
-		BUG_ON(current->ticks_left < 0);
-		if (current->ticks_left == 0)
-			current->ticks_left = TASK_TIMESLICE_DEFAULT;
-		sched_rq_add_task_behind(current, rq_expired);
+		if (current->ticks_left)
+			sched_rq_add_task(current, rq_runnable, RQ_ADD_BEHIND);
+		else
+			sched_rq_add_task(current, rq_expired, RQ_ADD_BEHIND);
 	}
-	sched_rq_swap_expired_runnable();

-	/* Runnable-pending tasks */
-	spin_lock(&rq_pending->lock);
-	list_for_each_entry_safe(pending, n, &rq_pending->task_list, rq_list) {
-		sched_next_state(pending);
-		sched_rq_remove_task(pending);
-		if (pending->state == TASK_RUNNABLE)
-			sched_rq_add_task_front(pending, rq_runnable);
-	}
-	spin_unlock(&rq_pending->lock);
+	/* Check if there's a pending suspend for thread */
+	if (current->flags & TASK_SUSPENDING) {
+		/*
+		 * The task should have no locks and be in a runnable state.
+		 * (e.g. properly woken up by the suspender)
+		 */
+		if (current->nlocks == 0 && current->state == TASK_RUNNABLE) {
+			/* Suspend it if suitable */
+			current->state = TASK_INACTIVE;
+			current->flags &= ~TASK_SUSPENDING;

-	/* Next task */
-retry_next:
-	if (rq_runnable->total > 0) {
-		next = list_entry(rq_runnable->task_list.next, struct ktcb, rq_list);
-		sched_next_state(next);
-		if (next->state != TASK_RUNNABLE) {
-			sched_rq_remove_task(next);
-			sched_rq_swap_expired_runnable();
-			goto retry_next;
+			/*
+			 * The task has been made inactive here.
+			 * A suspended task affects timeslices whereas
+			 * a sleeping task doesn't as it is believed
+			 * sleepers would become runnable soon.
+			 */
+			prio_total -= current->priority;
+			BUG_ON(prio_total <= 0);
+		} else {
+			/*
+			 * Top up task's ticks temporarily, and
+			 * wait for it to release its locks.
+			 */
+			current->state = TASK_RUNNABLE;
+			current->ticks_left = max(current->ticks_left,
+						  SCHED_GRANULARITY);
+			sched_rq_add_task(current, rq_runnable, RQ_ADD_FRONT);
 		}
-	} else {
-		printk("Idle task.\n");
-		while (1);
 	}

+	/* Determine the next task to be run */
+	if (rq_runnable->total > 0) {
+		next = list_entry(rq_runnable->task_list.next,
+				  struct ktcb, rq_list);
+	} else {
+		if (rq_expired->total > 0) {
+			sched_rq_swap_runqueues();
+			next = list_entry(rq_runnable->task_list.next,
+					  struct ktcb, rq_list);
+		} else {
+			printk("Idle task.\n");
+			while(1);
+		}
+	}
+
+	/* Zero ticks indicates task hasn't run since last rq swap */
+	if (next->ticks_left == 0) {
+
+		/* New tasks affect runqueue total priority. */
+		if (next->flags & TASK_RESUMING) {
+			prio_total += next->priority;
+			next->flags &= ~TASK_RESUMING;
+		}
+
+		/*
+		 * Redistribute timeslice. We do this as each task
+		 * becomes runnable rather than all at once. It's also
+		 * done only upon a runqueue swap.
+		 */
+		sched_recalc_ticks(next, prio_total);
+		next->ticks_left = next->ticks_assigned;
+	}
+
+	/* Reinitialise task's schedule granularity boundary */
+	next->sched_granule = SCHED_GRANULARITY;
+
+	/* Finish */
 	disable_irqs();
-	sched_unlock();
+	preempt_enable();
 	context_switch(next);
 }

-void schedule(void)
-{
-	/* It's a royal bug to call schedule when preemption is disabled */
-	BUG_ON(voluntary_preempt);
-
-	if (need_resched)
-		scheduler();
-}
-
+/*
+ * Initialise pager as runnable for first-ever scheduling,
+ * and start the scheduler.
+ */
 void scheduler_start()
 {
 	/* Initialise runqueues */
-	sched_runqueue_init();
+	sched_init_runqueues();

-	/* Initialse inittask as runnable for first-ever scheduling */
-	sched_init_task(current);
+	/* Initialise scheduler fields of pager */
+	sched_init_task(current, TASK_PRIO_PAGER);
+
+	/* Add task to runqueue first */
+	sched_rq_add_task(current, rq_runnable, RQ_ADD_FRONT);
+
+	/* Give it a kick-start tick and make runnable */
+	current->ticks_left = 1;
 	current->state = TASK_RUNNABLE;
-	sched_rq_add_task_front(current, rq_runnable);

-	/* Start the timer */
+	/* Start the timer and switch */
 	timer_start();
 	switch_to_user(current);
 }
--- a/src/generic/time.c
+++ b/src/generic/time.c
@@ -10,6 +10,7 @@
 #include <l4/generic/irq.h>
 #include <l4/generic/scheduler.h>
 #include <l4/generic/time.h>
+#include <l4/generic/preempt.h>
 #include <l4/generic/space.h>
 #include INC_ARCH(exception.h)
 #include <l4/api/syscall.h>
@@ -54,11 +55,16 @@ void update_system_time(void)
 	if (systime.reader)
 		systime.reader = 0;

-	/* Increase just like jiffies, but reset every HZ */
+	/* Increase just like jiffies, but reset every second */
 	systime.thz++;

-	/* On every HZ increase seconds */
-	if (systime.thz == HZ) {
+	/*
+	 * On every 1 second of timer ticks, increase seconds
+	 *
+	 * TODO: Investigate: how do we make sure timer_irq is
+	 * called SCHED_TICKS times per second?
+	 */
+	if (systime.thz == SCHED_TICKS) {
 		systime.thz = 0;
 		systime.sec++;
 	}
@@ -79,7 +85,7 @@ int sys_time(syscall_context_t *args)
 		while(retries > 0) {
 			systime.reader = 1;
 			tv->tv_sec = systime.sec;
-			tv->tv_usec = 1000000 * systime.thz / HZ;
+			tv->tv_usec = 1000000 * systime.thz / SCHED_TICKS;

 			retries--;
 			if (systime.reader)
@@ -108,21 +114,37 @@ void update_process_times(void)
 {
 	struct ktcb *cur = current;

-	BUG_ON(cur->ticks_left < 0);
-
 	if (cur->ticks_left == 0) {
-		need_resched = 1;
-		return;
+		/*
+		 * Nested irqs and irqs during non-preemptive
+		 * times could try to deduct ticks below zero.
+		 * We ignore such states and return.
+		 */
+		if (in_nested_irq_context() || !preemptive())
+			return;
+		else /* Otherwise it's a bug. */
+			BUG();
 	}

+	/*
+	 * These are TASK_RUNNABLE times, i.e. excludes sleeps.
+	 * In the future we may use timestamps for accuracy
+	 */
 	if (in_kernel())
 		cur->kernel_time++;
 	else
 		cur->user_time++;

 	cur->ticks_left--;
+	cur->sched_granule--;
+
+	/* Task has expired its timeslice */
 	if (!cur->ticks_left)
 		need_resched = 1;
+
+	/* Task has expired its schedule granularity */
+	if (!cur->sched_granule)
+		need_resched = 1;
 }


--- a/src/lib/memcache.c
+++ b/src/lib/memcache.c
@@ -8,6 +8,7 @@
 #include <l4/lib/printk.h>
 #include INC_GLUE(memory.h)
 #include <l4/lib/bit.h>
+#include <l4/api/errno.h>

 /* Allocate, clear and return element */
 void *mem_cache_zalloc(struct mem_cache *cache)
@@ -21,8 +22,11 @@ void *mem_cache_zalloc(struct mem_cache *cache)
 void *mem_cache_alloc(struct mem_cache *cache)
 {
 	int bit;
+	int err;
+
 	if (cache->free > 0) {
-		mutex_lock(&cache->mutex);
+		if ((err = mutex_lock(&cache->mutex)) < 0)
+			return PTR_ERR(err);	/* Interruptible mutex */
 		cache->free--;
 		if ((bit = find_and_set_first_free_bit(cache->bitmap,
 						       cache->total)) < 0) {
@@ -64,7 +68,9 @@ int mem_cache_free(struct mem_cache *cache, void *addr)
 		return err;
 	}

-	mutex_lock(&cache->mutex);
+	if ((err = mutex_lock(&cache->mutex)) < 0)
+		return err; /* Interruptible mutex */
+
 	/* Check free/occupied state */
 	if (check_and_clear_bit(cache->bitmap, bit) < 0) {
 		printk("Error: Anomaly in cache occupied state:\n"
--- a/src/lib/mutex.c
+++ b/src/lib/mutex.c
@@ -6,6 +6,7 @@
 #include <l4/lib/mutex.h>
 #include <l4/generic/scheduler.h>
 #include <l4/generic/tcb.h>
+#include <l4/api/errno.h>

 /*
 * Semaphore usage:
@@ -17,6 +18,8 @@
 * Consumer locks/consumes/unlocks data.
 */

+#if 0
+/* Update it */
 /*
 * Semaphore *up* for multiple producers. If any consumer is waiting, wake them
 * up, otherwise, sleep. Effectively producers and consumers use the same
@@ -48,10 +51,10 @@ void sem_up(struct mutex *mutex)
 		INIT_LIST_HEAD(&wq.task_list);
 		list_add_tail(&wq.task_list, &mutex->wq.task_list);
 		mutex->sleepers++;
-		sched_notify_sleep(current);
-		need_resched = 1;
+		current->state = TASK_SLEEPING;
 		printk("(%d) produced, now sleeping...\n", current->tid);
 		spin_unlock(&mutex->slock);
+		schedule();
 	}
 }

@@ -86,76 +89,91 @@ void sem_down(struct mutex *mutex)
 		INIT_LIST_HEAD(&wq.task_list);
 		list_add_tail(&wq.task_list, &mutex->wq.task_list);
 		mutex->sleepers++;
-		sched_notify_sleep(current);
-		need_resched = 1;
+		current->state = TASK_SLEEPING;
 		printk("(%d) Waiting to consume, now sleeping...\n", current->tid);
 		spin_unlock(&mutex->slock);
+		schedule();
 	}
 }
+#endif

 /* Non-blocking attempt to lock mutex */
 int mutex_trylock(struct mutex *mutex)
 {
 	int success;

-	spin_lock(&mutex->slock);
-	success = __mutex_lock(&mutex->lock);
-	spin_unlock(&mutex->slock);
+	spin_lock(&mutex->wqh.slock);
+	if ((success = __mutex_lock(&mutex->lock)))
+		current->nlocks++;
+	spin_unlock(&mutex->wqh.slock);

 	return success;
 }

-void mutex_lock(struct mutex *mutex)
+int mutex_lock(struct mutex *mutex)
 {
 	/* NOTE:
 	 * Everytime we're woken up we retry acquiring the mutex. It is
 	 * undeterministic as to how many retries will result in success.
+	 * We may need to add priority-based locking.
 	 */
 	for (;;) {
-		spin_lock(&mutex->slock);
+		spin_lock(&mutex->wqh.slock);
 		if (!__mutex_lock(&mutex->lock)) { /* Could not lock, sleep. */
-			DECLARE_WAITQUEUE(wq, current);
-			INIT_LIST_HEAD(&wq.task_list);
-			list_add_tail(&wq.task_list, &mutex->wq.task_list);
-			mutex->sleepers++;
-			sched_notify_sleep(current);
+			CREATE_WAITQUEUE_ON_STACK(wq, current);
+			task_set_wqh(current, &mutex->wqh, &wq);
+			list_add_tail(&wq.task_list, &mutex->wqh.task_list);
+			mutex->wqh.sleepers++;
+			current->state = TASK_SLEEPING;
+			spin_unlock(&mutex->wqh.slock);
 			printk("(%d) sleeping...\n", current->tid);
-			spin_unlock(&mutex->slock);
-		} else
+			schedule();
+
+			/* Did we wake up normally or get interrupted */
+			if (current->flags & TASK_INTERRUPTED) {
+				current->flags &= ~TASK_INTERRUPTED;
+				return -EINTR;
+			}
+		} else {
+			current->nlocks++;
 			break;
+		}
 	}
-	spin_unlock(&mutex->slock);
+	spin_unlock(&mutex->wqh.slock);
+	return 0;
 }

 void mutex_unlock(struct mutex *mutex)
 {
-	spin_lock(&mutex->slock);
+	spin_lock(&mutex->wqh.slock);
 	__mutex_unlock(&mutex->lock);
-	BUG_ON(mutex->sleepers < 0);
-	if (mutex->sleepers > 0) {
-		struct waitqueue *wq;
-		struct ktcb *sleeper;
+	current->nlocks--;
+	BUG_ON(current->nlocks < 0);
+	BUG_ON(mutex->wqh.sleepers < 0);
+	if (mutex->wqh.sleepers > 0) {
+		struct waitqueue *wq = list_entry(mutex->wqh.task_list.next,
+						  struct waitqueue,
+						  task_list);
+		struct ktcb *sleeper = wq->task;

-		/* Each unlocker wakes one other sleeper in queue. */
-		mutex->sleepers--;
-		BUG_ON(list_empty(&mutex->wq.task_list));
-		list_for_each_entry(wq, &mutex->wq.task_list, task_list) {
-			list_del_init(&wq->task_list);
-			spin_unlock(&mutex->slock);
-			/*
-			 * Here, someone else may get the lock, well before we
-			 * wake up the sleeper that we *hope* would get it. This
-			 * is fine as the sleeper would retry and re-sleep. BUT,
-			 * this may potentially starve the sleeper causing
-			 * non-determinisim.
-			 */
-			sleeper = wq->task;
-			printk("(%d) Waking up (%d)\n", current->tid,
-			       sleeper->tid);
-			sched_resume_task(sleeper);
-			return;	/* Don't iterate, wake only one task. */
-		}
+		task_unset_wqh(sleeper);
+		BUG_ON(list_empty(&mutex->wqh.task_list));
+		list_del_init(&wq->task_list);
+		mutex->wqh.sleepers--;
+		sleeper->state = TASK_RUNNABLE;
+		spin_unlock(&mutex->wqh.slock);
+
+		/*
+		 * TODO:
+		 * Here someone could grab the mutex, this is fine
+		 * but it may potentially starve the sleeper causing
+		 * non-determinism. We may consider priorities here.
+		 */
+		sched_resume_sync(sleeper);
+
+		/* Don't iterate, wake only one task. */
+		return;
 	}
-	spin_unlock(&mutex->slock);
+	spin_unlock(&mutex->wqh.slock);
 }

--- a/src/lib/wait.c
+++ b/src/lib/wait.c
@@ -1,45 +1,117 @@
 /*
 * Implementation of wakeup/wait for processes.
 *
- * Copyright (C) 2007 Bahadir Balban
+ * Copyright (C) 2007, 2008 Bahadir Balban
 */
 #include <l4/generic/scheduler.h>
 #include <l4/lib/wait.h>
 #include <l4/lib/spinlock.h>
+#include <l4/api/errno.h>

-/* Sleep if the given condition isn't true. */
-#define wait_event(wqh, condition)				\
+/*
+ * Record which waitqueue head and waitqueue entry a task sleeps
+ * on, under its waitlock, so that wakers can locate the sleeper.
+ */
+void task_set_wqh(struct ktcb *task, struct waitqueue_head *wqh,
+		  struct waitqueue *wq)
+{
+	spin_lock(&task->waitlock);
+	task->wq = wq;
+	task->waiting_on = wqh;
+	spin_unlock(&task->waitlock);
+}
+
+
+/*
+ * Forget where a task was sleeping. Used as the task is being
+ * removed from its queue and is about to wake up. Both wait
+ * fields are cleared under the task's waitlock.
+ */
+void task_unset_wqh(struct ktcb *task)
+{
+	spin_lock(&task->waitlock);
+	task->wq = 0;
+	task->waiting_on = 0;
+	spin_unlock(&task->waitlock);
+}
+
+/*
+ * Sleep on @wqh until @condition holds. @ret becomes 0 if the
+ * condition was met, or -EINTR if TASK_INTERRUPTED was set on wake
+ * up. Note @condition is tested before @wqh's spinlock is taken.
+ */
+#define WAIT_EVENT(wqh, condition, ret)				\
 do {								\
+	ret = 0;						\
 	for (;;) {						\
 		if (condition)					\
 			break;					\
-		DECLARE_WAITQUEUE(wq, current);			\
+		CREATE_WAITQUEUE_ON_STACK(wq, current);		\
 		spin_lock(&wqh->slock);				\
+		task_set_wqh(current, wqh, &wq);		\
 		wqh->sleepers++;				\
 		list_add_tail(&wq.task_list, &wqh->task_list);	\
-		sched_tell(current, SCHED_FL_SLEEP);		\
-		need_resched = 1;				\
+		current->state = TASK_SLEEPING;			\
 		printk("(%d) waiting...\n", current->tid);	\
 		spin_unlock(&wqh->slock);			\
+		schedule();					\
+		/* Did we wake up normally or get interrupted */\
+		if (current->flags & TASK_INTERRUPTED) {	\
+			current->flags &= ~TASK_INTERRUPTED;	\
+			ret = -EINTR;				\
+			break;					\
+		}						\
 	}							\
 } while(0);

 /* Sleep without any condition */
-#define wait_on(wqh)					\
+#define WAIT_ON(wqh, ret)				\
 do {							\
-	DECLARE_WAITQUEUE(wq, current);			\
+	CREATE_WAITQUEUE_ON_STACK(wq, current);		\
 	spin_lock(&wqh->slock);				\
+	task_set_wqh(current, wqh, &wq);		\
 	wqh->sleepers++;				\
 	list_add_tail(&wq.task_list, &wqh->task_list);	\
-	sched_tell(current, SCHED_FL_SLEEP);		\
-	need_resched = 1;				\
-	printk("(%d) waiting...\n", current->tid);	\
+	current->state = TASK_SLEEPING;			\
+	printk("(%d) waiting on wqh at: 0x%p\n",	\
+	       current->tid, wqh);			\
 	spin_unlock(&wqh->slock);			\
+	schedule();					\
+							\
+	/* Did we wake up normally or get interrupted */\
+	if (current->flags & TASK_INTERRUPTED) {	\
+		current->flags &= ~TASK_INTERRUPTED;	\
+		ret = -EINTR;				\
+	} else						\
+		ret = 0;				\
 } while(0);

-/* FIXME: Wake up should take the task as an argument, rather than the queue */
+/* Unconditionally sleep on @wqh: 0 on normal wakeup, -EINTR if interrupted */
+int wait_on(struct waitqueue_head *wqh)
+{
+	CREATE_WAITQUEUE_ON_STACK(wq, current);
+
+	/* Queue ourselves and mark as sleeping under the wqh lock */
+	spin_lock(&wqh->slock);
+	task_set_wqh(current, wqh, &wq);
+	list_add_tail(&wq.task_list, &wqh->task_list);
+	wqh->sleepers++;
+	current->state = TASK_SLEEPING;
+	printk("(%d) waiting on wqh at: 0x%p\n", current->tid, wqh);
+	spin_unlock(&wqh->slock);
+	schedule();
+
+	/* A normal wakeup leaves TASK_INTERRUPTED clear */
+	if (!(current->flags & TASK_INTERRUPTED))
+		return 0;
+
+	current->flags &= ~TASK_INTERRUPTED;
+	return -EINTR;
+}
+
+
 /* Wake up single waiter */
-void wake_up(struct waitqueue_head *wqh)
+void wake_up(struct waitqueue_head *wqh, int sync)
 {
 	BUG_ON(wqh->sleepers < 0);
 	spin_lock(&wqh->slock);
@@ -48,14 +120,82 @@ void wake_up(struct waitqueue_head *wqh)
 						  struct waitqueue,
 						  task_list);
 		struct ktcb *sleeper = wq->task;
+		task_unset_wqh(sleeper);
+		BUG_ON(list_empty(&wqh->task_list));
 		list_del_init(&wq->task_list);
 		wqh->sleepers--;
-		BUG_ON(list_empty(&wqh->task_list));
+		sleeper->state = TASK_RUNNABLE;
 		printk("(%d) Waking up (%d)\n", current->tid, sleeper->tid);
-		sched_notify_resume(sleeper);
 		spin_unlock(&wqh->slock);
+
+		if (sync)
+			sched_resume_sync(sleeper);
+		else
+			sched_resume_async(sleeper);
 		return;
 	}
 	spin_unlock(&wqh->slock);
 }

+/*
+ * Wakes up a task. Returns 0 on success, or -1 if the task is not waiting
+ * or has been woken up as we were peeking on it. @sync makes us immediately
+ * yield (sched_resume_sync) or else leave it to scheduler's discretion.
+ */
+int wake_up_task(struct ktcb *task, int sync)
+{
+	struct waitqueue_head *wqh;
+	struct waitqueue *wq;
+
+	spin_lock(&task->waitlock);
+	if (!task->waiting_on) {
+		spin_unlock(&task->waitlock);
+		return -1;
+	}
+
+	/*
+	 * We have found the waitqueue head. Lock order is wqh's
+	 * slock first, then task's waitlock, so to avoid deadlocks
+	 * snapshot the wait details, release task's waitlock and
+	 * take the wqh's lock before re-taking the waitlock.
+	 */
+	wqh = task->waiting_on;
+	wq = task->wq;
+	spin_unlock(&task->waitlock);
+
+	/* -- Task can be woken up by someone else here -- */
+
+	spin_lock(&wqh->slock);
+
+	/*
+	 * Now let's check if the task is still waiting and in the
+	 * same queue. NOTE(review): the cached wq is not re-checked.
+	 */
+	spin_lock(&task->waitlock);
+	if (task->waiting_on != wqh) {
+		/* No, task has been woken by someone else */
+		spin_unlock(&wqh->slock);
+		spin_unlock(&task->waitlock);
+		return -1;
+	}
+
+	/* Now we can remove the task from its waitqueue */
+	list_del_init(&wq->task_list);
+	wqh->sleepers--;
+	task->waiting_on = 0;
+	task->wq = 0;
+	task->state = TASK_RUNNABLE;
+	spin_unlock(&wqh->slock);
+	spin_unlock(&task->waitlock);
+
+	/* Removed from waitqueue, we can now safely resume task */
+	if (sync)
+		sched_resume_sync(task);
+	else
+		sched_resume_async(task);
+
+	return 0;
+}
+
+
+