diff --git a/include/l4/arch/arm/v5/mm.h b/include/l4/arch/arm/v5/mm.h index 753005f..6943ff9 100644 --- a/include/l4/arch/arm/v5/mm.h +++ b/include/l4/arch/arm/v5/mm.h @@ -132,7 +132,7 @@ typedef struct fault_kdata { pte_t pte; } __attribute__ ((__packed__)) fault_kdata_t; - +void arch_hardware_flush(pgd_table_t *pgd); void add_section_mapping_init(unsigned int paddr, unsigned int vaddr, unsigned int size, unsigned int flags); diff --git a/include/l4/arch/arm/v5/mmu_ops.h b/include/l4/arch/arm/v5/mmu_ops.h index d3bbec6..f7f0b88 100644 --- a/include/l4/arch/arm/v5/mmu_ops.h +++ b/include/l4/arch/arm/v5/mmu_ops.h @@ -24,6 +24,7 @@ void arm_drain_writebuffer(void); void arm_invalidate_tlb(void); void arm_invalidate_itlb(void); void arm_invalidate_dtlb(void); + static inline void arm_enable_caches(void) { arm_enable_icache(); diff --git a/include/l4/generic/preempt.h b/include/l4/generic/preempt.h index 6ccb012..4322260 100644 --- a/include/l4/generic/preempt.h +++ b/include/l4/generic/preempt.h @@ -9,4 +9,7 @@ void preempt_disable(void); int preemptive(void); int preempt_count(void); +int in_nested_irq_context(void); +int in_irq_context(void); +int in_task_context(void); #endif /* __PREEMPT_H__ */ diff --git a/include/l4/generic/scheduler.h b/include/l4/generic/scheduler.h index 01e7fb9..0d9a352 100644 --- a/include/l4/generic/scheduler.h +++ b/include/l4/generic/scheduler.h @@ -10,9 +10,22 @@ #include INC_SUBARCH(mm.h) #include INC_GLUE(memory.h) +/* Task priorities */ +#define TASK_PRIO_MAX 10 +#define TASK_PRIO_REALTIME 10 +#define TASK_PRIO_PAGER 8 +#define TASK_PRIO_SERVER 6 +#define TASK_PRIO_NORMAL 4 +#define TASK_PRIO_LOW 2 + /* Ticks per second, try ticks = 1000 + timeslice = 1 for regressed preemption test. */ -#define HZ 100 -#define TASK_TIMESLICE_DEFAULT HZ/100 +#define SCHED_TICKS 100 + +/* + * A task can run continuously at this granularity, + * even if it has a greater total time slice. + */ +#define SCHED_GRANULARITY SCHED_TICKS/10 static inline struct ktcb *current_task(void) { @@ -23,29 +36,10 @@ static inline struct ktcb *current_task(void) #define current current_task() #define need_resched (current->ts_need_resched) -/* Flags set by kernel to direct the scheduler about future task state. 
*/ -#define __SCHED_FL_SUSPEND 1 -#define SCHED_FL_SUSPEND (1 << __SCHED_FL_SUSPEND) -#define __SCHED_FL_RESUME 2 -#define SCHED_FL_RESUME (1 << __SCHED_FL_RESUME) -#define __SCHED_FL_SLEEP 3 -#define SCHED_FL_SLEEP (1 << __SCHED_FL_SLEEP) -#define SCHED_FL_MASK (SCHED_FL_SLEEP | SCHED_FL_RESUME \ - | SCHED_FL_SUSPEND) - -void sched_runqueue_init(void); -void sched_init_task(struct ktcb *task); -void sched_start_task(struct ktcb *task); -void sched_resume_task(struct ktcb *task); -void sched_suspend_task(struct ktcb *task); -void sched_tell(struct ktcb *task, unsigned int flags); +void sched_init_task(struct ktcb *task, int priority); +void sched_resume_sync(struct ktcb *task); +void sched_resume_async(struct ktcb *task); void scheduler_start(void); -void sched_yield(void); void schedule(void); -/* Asynchronous notifications to scheduler */ -void sched_notify_resume(struct ktcb *task); -void sched_notify_sleep(struct ktcb *task); -void sched_notify_suspend(struct ktcb *task); - #endif /* __SCHEDULER_H__ */ diff --git a/include/l4/generic/tcb.h b/include/l4/generic/tcb.h index abe5627..b649de0 100644 --- a/include/l4/generic/tcb.h +++ b/include/l4/generic/tcb.h @@ -16,6 +16,17 @@ #include INC_GLUE(context.h) #include INC_SUBARCH(mm.h) +/* + * These are a mixture of flags that indicate the task is + * in a transitional state that could include one or more + * scheduling states. + */ +#define TASK_INTERRUPTED (1 << 0) +#define TASK_SUSPENDING (1 << 1) +#define TASK_RESUMING (1 << 2) + + +/* Scheduler states */ enum task_state { TASK_INACTIVE = 0, TASK_SLEEPING = 1, @@ -41,15 +52,14 @@ struct ktcb { /* Runqueue related */ struct list_head rq_list; - struct runqueue *rq; /* Thread information */ l4id_t tid; /* Global thread id */ l4id_t spid; /* Global space id */ l4id_t tgid; /* Global thread group id */ - /* Flags to hint scheduler on future task state */ - unsigned int schedfl; + /* Flags to indicate various task status */ + unsigned int flags; /* Lock for blocking thread state modifications via a syscall */ struct mutex thread_control_lock; @@ -65,7 +75,13 @@ struct ktcb { /* Thread times */ u32 kernel_time; /* Ticks spent in kernel */ u32 user_time; /* Ticks spent in userland */ - u32 ticks_left; /* Ticks left for reschedule */ + u32 ticks_left; /* Timeslice ticks left for reschedule */ + u32 ticks_assigned; /* Ticks assigned to this task on this HZ */ + u32 sched_granule; /* Granularity ticks left for reschedule */ + int priority; /* Task's fixed, default priority */ + + /* Number of locks the task currently has acquired */ + int nlocks; /* Page table information */ pgd_table_t *pgd; @@ -73,8 +89,12 @@ struct ktcb { /* Fields for ipc rendezvous */ struct waitqueue_head wqh_recv; struct waitqueue_head wqh_send; + l4id_t expected_sender; - l4id_t senderid; /* Sender checks this for ipc */ + /* Tells where we are when we sleep */ + struct spinlock waitlock; + struct waitqueue_head *waiting_on; + struct waitqueue *wq; }; /* Per thread kernel stack unified on a single page. */ diff --git a/include/l4/lib/math.h b/include/l4/lib/math.h index 7d5c5ad..fd12c08 100644 --- a/include/l4/lib/math.h +++ b/include/l4/lib/math.h @@ -2,5 +2,6 @@ #define __LIB_MATH_H__ #define min(x, y) (((x) < (y)) ? x : y) +#define max(x, y) (((x) > (y)) ? x : y) #endif /* __LIB_MATH_H__ */ diff --git a/include/l4/lib/mutex.h b/include/l4/lib/mutex.h index 45bfc62..4116b82 100644 --- a/include/l4/lib/mutex.h +++ b/include/l4/lib/mutex.h @@ -16,20 +16,18 @@ /* A mutex is a binary semaphore that can sleep. 
*/ struct mutex { - int sleepers; /* Number of sleepers */ - struct spinlock slock; /* Locks sleeper queue */ - unsigned int lock; /* The mutex lock itself */ - struct waitqueue wq; /* Sleeper queue head */ + struct waitqueue_head wqh; + unsigned int lock; }; static inline void mutex_init(struct mutex *mutex) { memset(mutex, 0, sizeof(struct mutex)); - INIT_LIST_HEAD(&mutex->wq.task_list); + waitqueue_head_init(&mutex->wqh); } int mutex_trylock(struct mutex *mutex); -void mutex_lock(struct mutex *mutex); +int mutex_lock(struct mutex *mutex); void mutex_unlock(struct mutex *mutex); /* NOTE: Since spinlocks guard mutex acquiring & sleeping, no locks needed */ diff --git a/include/l4/lib/wait.h b/include/l4/lib/wait.h index 055c51c..b3eff3f 100644 --- a/include/l4/lib/wait.h +++ b/include/l4/lib/wait.h @@ -10,23 +10,16 @@ struct waitqueue { struct ktcb *task; }; -#define DECLARE_WAITQUEUE(wq, tsk) \ +#define CREATE_WAITQUEUE_ON_STACK(wq, tsk) \ struct waitqueue wq = { \ .task_list = { &wq.task_list, &wq.task_list }, \ .task = tsk, \ }; -// LIST_HEAD_INIT(task_list), -/* - * The waitqueue spinlock ensures waiters are added and removed atomically so - * that wake-ups and sleeps occur in sync. Otherwise, a task could try to wake - * up a waitqueue **during when a task has decided to sleep but is not in the - * queue yet. (** Take "during" here as a pseudo-concurrency term on UP) - */ struct waitqueue_head { int sleepers; - struct spinlock slock; /* Locks sleeper queue */ - struct list_head task_list; /* Sleeper queue head */ + struct spinlock slock; + struct list_head task_list; }; static inline void waitqueue_head_init(struct waitqueue_head *head) @@ -35,11 +28,14 @@ static inline void waitqueue_head_init(struct waitqueue_head *head) INIT_LIST_HEAD(&head->task_list); } -/* - * Used for ipc related waitqueues who have special wait queue manipulation - * conditions. - */ -void wake_up(struct waitqueue_head *wqh); +void task_set_wqh(struct ktcb *task, struct waitqueue_head *wqh, + struct waitqueue *wq); + +void task_unset_wqh(struct ktcb *task); + + +void wake_up(struct waitqueue_head *wqh, int sync); +int wake_up_task(struct ktcb *task, int sync); #endif /* __LIB_WAIT_H__ */ diff --git a/src/api/ipc.c b/src/api/ipc.c index cdc1368..cbfb8c1 100644 --- a/src/api/ipc.c +++ b/src/api/ipc.c @@ -41,7 +41,7 @@ int ipc_msg_copy(struct ktcb *to, struct ktcb *from) memcpy(mr0_dst, mr0_src, MR_TOTAL * sizeof(unsigned int)); /* Save the sender id in case of ANYTHREAD receiver */ - if (to->senderid == L4_ANYTHREAD) + if (to->expected_sender == L4_ANYTHREAD) mr0_dst[MR_SENDER] = from->tid; return 0; @@ -52,106 +52,134 @@ int sys_ipc_control(syscall_context_t *regs) return -ENOSYS; } +/* + * Why can we safely copy registers and resume task + * after we release the locks? Because even if someone + * tried to interrupt and wake up the other party, they + * won't be able to, because the task's all hooks to its + * waitqueue have been removed at that stage. + */ + +/* Interruptible ipc */ int ipc_send(l4id_t recv_tid) { struct ktcb *receiver = find_task(recv_tid); struct waitqueue_head *wqhs, *wqhr; - if (!receiver) { - printk("%s: tid: %d, no such task.\n", __FUNCTION__, - recv_tid); - return -EINVAL; - } wqhs = &receiver->wqh_send; wqhr = &receiver->wqh_recv; spin_lock(&wqhs->slock); spin_lock(&wqhr->slock); - /* Is my receiver waiting? */ - if (wqhr->sleepers > 0) { - struct waitqueue *wq, *n; - struct ktcb *sleeper; + /* Ready to receive and expecting us? 
*/ + if (receiver->state == TASK_SLEEPING && + receiver->waiting_on == wqhr && + (receiver->expected_sender == current->tid || + receiver->expected_sender == L4_ANYTHREAD)) { + struct waitqueue *wq = receiver->wq; - list_for_each_entry_safe(wq, n, &wqhr->task_list, task_list) { - sleeper = wq->task; - /* Found the receiver. Does it sleep for this sender? */ - BUG_ON(sleeper->tid != recv_tid); - if ((sleeper->senderid == current->tid) || - (sleeper->senderid == L4_ANYTHREAD)) { - list_del_init(&wq->task_list); - spin_unlock(&wqhr->slock); - spin_unlock(&wqhs->slock); + /* Remove from waitqueue */ + list_del_init(&wq->task_list); + wqhr->sleepers--; - /* Do the work */ - ipc_msg_copy(sleeper, current); - //printk("%s: (%d) Waking up (%d)\n", __FUNCTION__, - // current->tid, sleeper->tid); + /* Release locks */ + spin_unlock(&wqhr->slock); + spin_unlock(&wqhs->slock); - /* Wake it up, we can yield here. */ - sched_resume_task(sleeper); - return 0; - } - } + /* Copy message registers */ + ipc_msg_copy(receiver, current); + + // printk("%s: (%d) Waking up (%d)\n", __FUNCTION__, + // current->tid, receiver->tid); + + /* Wake it up, we can yield here. */ + sched_resume_sync(receiver); + return 0; } - /* Could not find a receiver that's waiting */ - DECLARE_WAITQUEUE(wq, current); + + /* The receiver is not ready and/or not expecting us */ + CREATE_WAITQUEUE_ON_STACK(wq, current); wqhs->sleepers++; list_add_tail(&wq.task_list, &wqhs->task_list); - sched_notify_sleep(current); - need_resched = 1; - // printk("%s: (%d) waiting for (%d)\n", __FUNCTION__, current->tid, recv_tid); + task_set_wqh(current, wqhs, &wq); + current->state = TASK_SLEEPING; spin_unlock(&wqhr->slock); spin_unlock(&wqhs->slock); + // printk("%s: (%d) waiting for (%d)\n", __FUNCTION__, + // current->tid, recv_tid); + schedule(); + + /* Did we wake up normally or get interrupted */ + if (current->flags & TASK_INTERRUPTED) { + current->flags &= ~TASK_INTERRUPTED; + return -EINTR; + } return 0; } int ipc_recv(l4id_t senderid) { - struct waitqueue_head *wqhs = ¤t->wqh_send; - struct waitqueue_head *wqhr = ¤t->wqh_recv; + struct waitqueue_head *wqhs, *wqhr; - /* Specify who to receiver from, so senders know. */ - current->senderid = senderid; + wqhs = ¤t->wqh_send; + wqhr = ¤t->wqh_recv; + + /* + * Indicate who we expect to receive from, + * so senders know. + */ + current->expected_sender = senderid; spin_lock(&wqhs->slock); spin_lock(&wqhr->slock); - /* Is my sender waiting? */ + /* Are there senders? 
*/ if (wqhs->sleepers > 0) { struct waitqueue *wq, *n; struct ktcb *sleeper; + BUG_ON(list_empty(&wqhs->task_list)); + + /* Look for a sender we want to receive from */ list_for_each_entry_safe(wq, n, &wqhs->task_list, task_list) { sleeper = wq->task; - /* Found a sender */ - if ((sleeper->tid == current->senderid) || - (current->senderid == L4_ANYTHREAD)) { + + /* Found a sender that we wanted to receive from */ + if ((sleeper->tid == current->expected_sender) || + (current->expected_sender == L4_ANYTHREAD)) { list_del_init(&wq->task_list); + wqhs->sleepers--; + task_unset_wqh(sleeper); spin_unlock(&wqhr->slock); spin_unlock(&wqhs->slock); - - /* Do the work */ ipc_msg_copy(current, sleeper); + // printk("%s: (%d) Waking up (%d)\n", __FUNCTION__, // current->tid, sleeper->tid); - - /* Wake it up */ - sched_resume_task(sleeper); + sched_resume_sync(sleeper); return 0; - } } } - /* Could not find a sender that's waiting */ - DECLARE_WAITQUEUE(wq, current); + + /* The sender is not ready */ + CREATE_WAITQUEUE_ON_STACK(wq, current); wqhr->sleepers++; list_add_tail(&wq.task_list, &wqhr->task_list); - sched_notify_sleep(current); - need_resched = 1; - // printk("%s: (%d) waiting for (%d) \n", __FUNCTION__, current->tid, current->senderid); + task_set_wqh(current, wqhr, &wq); + current->state = TASK_SLEEPING; + // printk("%s: (%d) waiting for (%d)\n", __FUNCTION__, + // current->tid, current->expected_sender); spin_unlock(&wqhr->slock); spin_unlock(&wqhs->slock); + schedule(); + + /* Did we wake up normally or get interrupted */ + if (current->flags & TASK_INTERRUPTED) { + current->flags &= ~TASK_INTERRUPTED; + return -EINTR; + } return 0; } diff --git a/src/api/thread.c b/src/api/thread.c index 3ac37ba..cc6cdde 100644 --- a/src/api/thread.c +++ b/src/api/thread.c @@ -16,25 +16,12 @@ int sys_thread_switch(syscall_context_t *regs) { - sched_yield(); + schedule(); return 0; } int thread_suspend(struct task_ids *ids) { - struct ktcb *task; - - if (!(task = find_task(ids->tid))) - return -ESRCH; - - /* - * The thread_control_lock is protecting from - * indirect modification of thread context, this - * does not cause any such operation so we don't - * need to acquire that lock here. - */ - sched_suspend_task(task); - return 0; } @@ -48,14 +35,15 @@ int thread_resume(struct task_ids *ids) if (!mutex_trylock(&task->thread_control_lock)) return -EAGAIN; - /* Notify scheduler of task resume */ - sched_notify_resume(task); + /* Put task into runqueue as runnable */ + sched_resume_async(task); /* Release lock and return */ mutex_unlock(&task->thread_control_lock); return 0; } +/* Runs a thread for the first time */ int thread_start(struct task_ids *ids) { struct ktcb *task; @@ -67,7 +55,7 @@ int thread_start(struct task_ids *ids) return -EAGAIN; /* Notify scheduler of task resume */ - sched_notify_resume(task); + sched_resume_async(task); /* Release lock and return */ mutex_unlock(&task->thread_control_lock); @@ -264,7 +252,7 @@ out: thread_setup_new_ids(ids, flags, new, task); /* Initialise task's scheduling state and parameters. */ - sched_init_task(new); + sched_init_task(new, TASK_PRIO_NORMAL); /* Initialise ipc waitqueues */ waitqueue_head_init(&new->wqh_send); @@ -302,7 +290,6 @@ int sys_thread_control(syscall_context_t *regs) case THREAD_RESUME: ret = thread_resume(ids); break; - /* TODO: Add THREAD_DESTROY! 
*/ default: ret = -EINVAL; } diff --git a/src/arch/arm/exception.c b/src/arch/arm/exception.c index 2e6b3e5..77176d0 100644 --- a/src/arch/arm/exception.c +++ b/src/arch/arm/exception.c @@ -212,18 +212,27 @@ error: ; } -void prefetch_abort_handler(u32 faulted_pc, u32 fsr, u32 far) +void prefetch_abort_handler(u32 faulted_pc, u32 fsr, u32 far, u32 lr) { set_abort_type(fsr, ARM_PABT); if (check_aborts(faulted_pc, fsr, far) < 0) { printascii("This abort can't be handled by any pager.\n"); goto error; } + + if (KERN_ADDR(lr)) + goto error; fault_ipc_to_pager(faulted_pc, fsr, far); return; error: disable_irqs(); + dprintk("Unhandled prefetch abort @ address: ", faulted_pc); + dprintk("FAR:", far); + dprintk("FSR:", fsr); + dprintk("LR:", lr); + printascii("Kernel panic.\n"); + printascii("Halting system...\n"); while (1) ; } diff --git a/src/arch/arm/v5/mm.c b/src/arch/arm/v5/mm.c index e199eb7..db61a86 100644 --- a/src/arch/arm/v5/mm.c +++ b/src/arch/arm/v5/mm.c @@ -530,3 +530,12 @@ void copy_pgds_by_vrange(pgd_table_t *to, pgd_table_t *from, irange * sizeof(pgd_t)); } +/* Scheduler uses this to switch context */ +void arch_hardware_flush(pgd_table_t *pgd) +{ + arm_clean_invalidate_cache(); + arm_invalidate_tlb(); + arm_set_ttb(virt_to_phys(pgd)); + arm_invalidate_tlb(); +} + diff --git a/src/arch/arm/vectors.S b/src/arch/arm/vectors.S index ed8f4af..f979a2c 100644 --- a/src/arch/arm/vectors.S +++ b/src/arch/arm/vectors.S @@ -207,6 +207,12 @@ END_PROC(arm_swi_exception) sub \sp, \sp, #8 @ Adjust SP, since stack op on banked regs is no writeback. @ stack state: (Low) |->SP_USR|LR_USR|(Original)| (High) .endm + + .macro is_psr_usr rx + and \rx, \rx, #ARM_MODE_MASK + cmp \rx, #ARM_MODE_USR + .endm + /* * vect_pabt * @@ -264,6 +270,11 @@ read_pabt_state: bne 1f @ Branch here based on previous irq judgement. enable_irqs r3 1: + /* Now check in what mode abort occured, and return that mode's LR in R4 */ + ldr r0, [sp, #28] @ Load PABT_SPSR + is_psr_usr r0 @ Test if PABT_SPSR was user mode. + ldrne r3, [sp, #32] @ Abort occured in kernel, load LR_SVC + ldreq r3, [sp, #4] @ Abort occured in user, load LR_USR ldr r0, [sp, #36] @ Load LR_PABT saved previously. mov lr, pc ldr pc, =prefetch_abort_handler @ Jump to function outside this page. @@ -448,6 +459,11 @@ preempted_psr: current_irq_nest_count: .word 0 +/* + * FIXME: current_irq_nest_count also counts for any preempt_disable() calls. + * However this nesting check assumes all nests come from real irqs. + * We should make this check just the real ones. + */ #define IRQ_NESTING_MAX 15 .macro inc_irq_cnt_with_overnest_check rx, ry ldr \rx, =current_irq_nest_count @ Load the irq nest status word. @@ -480,10 +496,6 @@ current_irq_nest_count: ldreq \rx, =preempted_psr streq \process_psr, [\rx] .endm - .macro is_psr_usr rx - and \rx, \rx, #ARM_MODE_MASK - cmp \rx, #ARM_MODE_USR - .endm #define CONTEXT_PSR 0 #define CONTEXT_R0 4 @@ -584,7 +596,10 @@ save_usr_context: str r1, [r0, #CONTEXT_R0] @ stack state: (Low) |..|..|..|..|..|..|..|..|->(Original)| (High) prepare_schedule: + mov lr, pc ldr pc, =schedule +1: + b 1b /* To catch if schedule returns in irq mode */ END_PROC(arm_irq_exception_reentrant_with_schedule) /* @@ -612,7 +627,7 @@ END_PROC(arm_irq_exception_reentrant_with_schedule) * Furthermore, irqs are also disabled shortly before calling switch_to() from both contexts. * This happens at points where stack state would be irrecoverable if an irq occured. 
*/ -BEGIN_PROC(switch_to) +BEGIN_PROC(arch_switch) in_process_context r2 @ Note this depends on preempt count being 0. beq save_process_context @ Voluntary switch needs explicit saving of current state. dec_irq_nest_cnt r2, r3 @ Soon leaving irq context, so reduce preempt count here. @@ -639,7 +654,7 @@ load_next_context_usr: load_next_context_svc: ldmib sp, {r0-r15}^ @ Switch to svc context and jump, loading R13 and R14 from stack. @ This is OK since the jump is to current context. -END_PROC(switch_to) +END_PROC(arch_switch) /* diff --git a/src/generic/pgalloc.c b/src/generic/pgalloc.c index 9b927c7..872650f 100644 --- a/src/generic/pgalloc.c +++ b/src/generic/pgalloc.c @@ -12,6 +12,12 @@ #include #include INC_GLUE(memory.h) +/* FIXME: + * + * mem_cache_alloc() now has an interruptible mutex. + * All routines defined here should check returned errors. + */ + #define PGALLOC_PGD_CACHE 0 #define PGALLOC_PMD_CACHE 1 #define PGALLOC_PG_CACHE 2 diff --git a/src/generic/scheduler.c b/src/generic/scheduler.c index bed17b8..ec54f8d 100644 --- a/src/generic/scheduler.c +++ b/src/generic/scheduler.c @@ -1,12 +1,13 @@ /* - * A basic scheduler that does the job for now. + * A basic priority-based scheduler. * - * Copyright (C) 2007 Bahadir Balban + * Copyright (C) 2007, 2008 Bahadir Balban */ #include #include #include #include +#include #include #include #include @@ -21,16 +22,19 @@ #include INC_PLAT(platform.h) #include INC_ARCH(exception.h) -/* A very basic runqueue */ + +/* A basic runqueue */ struct runqueue { - struct spinlock lock; - struct list_head task_list; - unsigned int total; + struct spinlock lock; /* Lock */ + struct list_head task_list; /* List of tasks in rq */ + unsigned int total; /* Total tasks */ + int recalc_timeslice; /* Need timeslice redistribution */ }; -static struct runqueue sched_rq[3]; -static struct runqueue *rq_runnable, *rq_expired, *rq_pending; - +#define SCHED_RQ_TOTAL 2 +static struct runqueue sched_rq[SCHED_RQ_TOTAL]; +static struct runqueue *rq_runnable, *rq_expired; +static int prio_total; /* Total priority of all tasks */ /* This is incremented on each irq or voluntarily by preempt_disable() */ extern unsigned int current_irq_nest_count; @@ -52,16 +56,6 @@ void preempt_enable(void) { voluntary_preempt--; current_irq_nest_count--; - - /* - * Even if count increases after we check it, it will come back to zero. - * This test really is asking "is this the outmost explicit - * preempt_enable() that will really enable context switching?" - */ - if (current_irq_nest_count == 0) { - /* Then, give scheduler a chance to check need_resched == 1 */ - schedule(); - } } /* A positive irq nest count implies current context cannot be preempted. */ @@ -71,9 +65,30 @@ void preempt_disable(void) voluntary_preempt++; } -void sched_runqueue_init(void) +int in_irq_context(void) { - for (int i = 0; i < 3; i++) { + /* + * If there was a real irq, irq nest count must be + * one more than all preempt_disable()'s which are + * counted by voluntary_preempt. 
+ */ + return (current_irq_nest_count == (voluntary_preempt + 1)); +} + +int in_nested_irq_context(void) +{ + /* Deducing voluntary preemptions we get real irq nesting */ + return (current_irq_nest_count - voluntary_preempt) > 1; +} + +int in_task_context(void) +{ + return !in_irq_context(); +} + +void sched_init_runqueues(void) +{ + for (int i = 0; i < SCHED_RQ_TOTAL; i++) { memset(&sched_rq[i], 0, sizeof(struct runqueue)); INIT_LIST_HEAD(&sched_rq[i].task_list); spin_lock_init(&sched_rq[i].lock); @@ -81,203 +96,93 @@ void sched_runqueue_init(void) rq_runnable = &sched_rq[0]; rq_expired = &sched_rq[1]; - rq_pending = &sched_rq[2]; + prio_total = 0; } -/* Lock scheduler. Should only be used when scheduling. */ -static inline void sched_lock(void) -{ - preempt_disable(); -} - -/* Sched unlock */ -static inline void sched_unlock(void) -{ - /* - * This is to make sure preempt_enable() does not - * try to schedule since we're already scheduling. - */ - need_resched = 0; - preempt_enable(); -} - -/* Swaps runnable and expired queues *if* runnable queue is empty. */ -static void sched_rq_swap_expired_runnable(void) +/* Swap runnable and expired runqueues. */ +static void sched_rq_swap_runqueues(void) { struct runqueue *temp; - if (list_empty(&rq_runnable->task_list) && - !list_empty(&rq_expired->task_list)) { + BUG_ON(list_empty(&rq_expired->task_list)); + BUG_ON(rq_expired->total == 0); - /* Queues are swapped and expired list becomes runnable */ - temp = rq_runnable; - rq_runnable = rq_expired; - rq_expired = temp; - } + /* Queues are swapped and expired list becomes runnable */ + temp = rq_runnable; + rq_runnable = rq_expired; + rq_expired = temp; } +/* FIXME: + * Sleepers should not affect runqueue priority. + * Suspended tasks should affect runqueue priority. + * + * Also make sure that if sleepers get suspended, + * they do affect runqueue priority. + */ + +/* Set policy on where to add tasks in the runqueue */ +#define RQ_ADD_BEHIND 0 +#define RQ_ADD_FRONT 1 + /* Helper for adding a new task to a runqueue */ static void sched_rq_add_task(struct ktcb *task, struct runqueue *rq, int front) { - BUG_ON(task->rq); - - /* - * If the task is sinfully in a runqueue, this may still keep silent - * upon a racing condition, since its rq can't be locked in advance. - */ BUG_ON(!list_empty(&task->rq_list)); + spin_lock(&rq->lock); if (front) list_add(&task->rq_list, &rq->task_list); else list_add_tail(&task->rq_list, &rq->task_list); rq->total++; - task->rq = rq; -} - -static inline void -sched_rq_add_task_front(struct ktcb *task, struct runqueue *rq) -{ - sched_rq_add_task(task, rq, 1); -} - -static inline void -sched_rq_add_task_behind(struct ktcb *task, struct runqueue *rq) -{ - sched_rq_add_task(task, rq, 0); + spin_unlock(&rq->lock); } /* Helper for removing a task from its runqueue. 
*/ -static inline void sched_rq_remove_task(struct ktcb *task) +static inline void sched_rq_remove_task(struct ktcb *task, struct runqueue *rq) { + spin_lock(&rq->lock); list_del_init(&task->rq_list); - task->rq->total--; - task->rq = 0; + rq->total--; + + BUG_ON(rq->total < 0); + spin_unlock(&rq->lock); } -void sched_init_task(struct ktcb *task) + +void sched_init_task(struct ktcb *task, int prio) { INIT_LIST_HEAD(&task->rq_list); - task->ticks_left = TASK_TIMESLICE_DEFAULT; + task->priority = prio; + task->ticks_left = 0; task->state = TASK_INACTIVE; task->ts_need_resched = 0; + task->flags |= TASK_RESUMING; } -void sched_tell(struct ktcb *task, unsigned int fl) +/* Synchronously resumes a task */ +void sched_resume_sync(struct ktcb *task) { - BUG_ON(!(SCHED_FL_MASK & fl)); - /* The last flag overrrides all existing flags. */ - task->schedfl = fl; -} + task->state = TASK_RUNNABLE; -void sched_yield() -{ - need_resched = 1; + sched_rq_add_task(task, rq_runnable, RQ_ADD_FRONT); schedule(); } /* - * Any task that wants the scheduler's attention and not in its any one of - * its currently runnable realms, would call this. E.g. dormant tasks - * sleeping tasks, newly created tasks. But not currently runnable tasks. + * Asynchronously resumes a task. + * The task will run in the future, but at + * the scheduler's discretion. */ -void sched_add_pending_task(struct ktcb *task) +void sched_resume_async(struct ktcb *task) { - BUG_ON(task->rq); - spin_lock(&rq_pending->lock); - sched_rq_add_task_behind(task, rq_pending); - spin_unlock(&rq_pending->lock); + task->state = TASK_RUNNABLE; + + sched_rq_add_task(task, rq_runnable, RQ_ADD_FRONT); } -/* Tells scheduler to remove given runnable task from runqueues */ -void sched_notify_sleep(struct ktcb *task) -{ - sched_tell(task, SCHED_FL_SLEEP); -} - -void sched_sleep_task(struct ktcb *task) -{ - sched_notify_sleep(task); - if (task == current) - sched_yield(); -} - -/* Tells scheduler to remove given runnable task from runqueues */ -void sched_notify_suspend(struct ktcb *task) -{ - sched_tell(task, SCHED_FL_SUSPEND); -} - -void sched_suspend_task(struct ktcb *task) -{ - sched_notify_suspend(task); - if (task == current) - sched_yield(); -} - -/* Tells scheduler to add given task into runqueues whenever possible */ -void sched_notify_resume(struct ktcb *task) -{ - BUG_ON(current == task); - sched_tell(task, SCHED_FL_RESUME); - sched_add_pending_task(task); -} - -/* NOTE: Might as well just set need_resched instead of full yield. - * This would work on irq context as well. */ -/* Same as resume, but also yields. */ -void sched_resume_task(struct ktcb *task) -{ - sched_notify_resume(task); - sched_yield(); -} - -void sched_start_task(struct ktcb *task) -{ - sched_init_task(task); - sched_resume_task(task); -} - -/* - * Checks currently pending scheduling flags on the task and does two things: - * 1) Modify their state. - * 2) Modify their runqueues. - * - * An inactive/sleeping task that is pending-runnable would change state here. - * A runnable task that is pending-inactive would also change state here. - * Returns 1 if it has changed anything, e.g. task state, runqueues, and - * 0 otherwise. 
- */ -static int sched_next_state(struct ktcb *task) -{ - unsigned int flags = task->schedfl; - int ret = 0; - - switch(flags) { - case 0: - ret = 0; - break; - case SCHED_FL_SUSPEND: - task->state = TASK_INACTIVE; - ret = 1; - break; - case SCHED_FL_RESUME: - task->state = TASK_RUNNABLE; - ret = 1; - break; - case SCHED_FL_SLEEP: - task->state = TASK_SLEEPING; - ret = 1; - break; - default: - BUG(); - } - task->schedfl = 0; - return ret; -} - - -extern void switch_to(struct ktcb *cur, struct ktcb *next); +extern void arch_switch(struct ktcb *cur, struct ktcb *next); static inline void context_switch(struct ktcb *next) { @@ -286,84 +191,179 @@ static inline void context_switch(struct ktcb *next) // printk("(%d) to (%d)\n", cur->tid, next->tid); /* Flush caches and everything */ - arm_clean_invalidate_cache(); - arm_invalidate_tlb(); - arm_set_ttb(virt_to_phys(next->pgd)); - arm_invalidate_tlb(); - switch_to(cur, next); + arch_hardware_flush(next->pgd); + + /* Switch context */ + arch_switch(cur, next); + // printk("Returning from yield. Tid: (%d)\n", cur->tid); } -void scheduler() +/* + * Priority calculation is so simple it is inlined. The task gets + * the ratio of its priority to total priority of all runnable tasks. + */ +static inline int sched_recalc_ticks(struct ktcb *task, int prio_total) { - struct ktcb *next = 0, *pending = 0, *n = 0; + return task->ticks_assigned = + SCHED_TICKS * task->priority / prio_total; +} - sched_lock(); +/* + * Tasks come here, either by setting need_resched (via next irq), + * or by directly calling it (in process context). + * + * The scheduler is similar to Linux's so called O(1) scheduler, + * although a lot simpler. Task priorities determine task timeslices. + * Each task gets a ratio of its priority to the total priority of + * all runnable tasks. When this total changes, (e.g. threads die or + * are created, or a thread's priority is changed) the timeslices are + * recalculated on a per-task basis as each thread becomes runnable. + * Once all runnable tasks expire, runqueues are swapped. Sleeping + * tasks are removed from the runnable queue, and added back later + * without affecting the timeslices. Suspended tasks however, + * necessitate a timeslice recalculation as they are considered to go + * inactive indefinitely or for a very long time. They are put back + * to the expired queue if they want to run again. + * + * A task is rescheduled either when it hits a SCHED_GRANULARITY + * boundary, or when its timeslice has expired. SCHED_GRANULARITY + * ensures context switches do occur at a maximum boundary even if a + * task's timeslice is very long. In the future, real-time tasks will + * be added, and they will be able to ignore SCHED_GRANULARITY. + * + * In the future, the tasks will be sorted by priority in their + * runqueue, as well as having an adjusted timeslice. + * + * Runqueues are swapped at a single second's interval. This implies + * the timeslice recalculations would also occur at this interval. + */ +void schedule() +{ + struct ktcb *next; + + /* Should not schedule with preemption disabled */ + BUG_ON(voluntary_preempt); + + /* Should not have more ticks than SCHED_TICKS */ + BUG_ON(current->ticks_left > SCHED_TICKS); + + /* Cannot have any irqs that schedule after this */ + preempt_disable(); + + /* NOTE: + * We could avoid double-scheduling by detecting a task + * that's about to schedule voluntarily and skipping the + * schedule() call in irq mode. 
+ */ + + /* Reset schedule flag */ need_resched = 0; - BUG_ON(current->rq != rq_runnable); - /* Current task */ - sched_rq_remove_task(current); - sched_next_state(current); + /* Remove from runnable queue */ + sched_rq_remove_task(current, rq_runnable); + /* Put it into appropriate runqueue */ if (current->state == TASK_RUNNABLE) { - BUG_ON(current->ticks_left < 0); - if (current->ticks_left == 0) - current->ticks_left = TASK_TIMESLICE_DEFAULT; - sched_rq_add_task_behind(current, rq_expired); + if (current->ticks_left) + sched_rq_add_task(current, rq_runnable, RQ_ADD_BEHIND); + else + sched_rq_add_task(current, rq_expired, RQ_ADD_BEHIND); } - sched_rq_swap_expired_runnable(); - /* Runnable-pending tasks */ - spin_lock(&rq_pending->lock); - list_for_each_entry_safe(pending, n, &rq_pending->task_list, rq_list) { - sched_next_state(pending); - sched_rq_remove_task(pending); - if (pending->state == TASK_RUNNABLE) - sched_rq_add_task_front(pending, rq_runnable); - } - spin_unlock(&rq_pending->lock); + /* Check if there's a pending suspend for thread */ + if (current->flags & TASK_SUSPENDING) { + /* + * The task should have no locks and be in a runnable state. + * (e.g. properly woken up by the suspender) + */ + if (current->nlocks == 0 && current->state == TASK_RUNNABLE) { + /* Suspend it if suitable */ + current->state = TASK_INACTIVE; + current->flags &= ~TASK_SUSPENDING; - /* Next task */ -retry_next: - if (rq_runnable->total > 0) { - next = list_entry(rq_runnable->task_list.next, struct ktcb, rq_list); - sched_next_state(next); - if (next->state != TASK_RUNNABLE) { - sched_rq_remove_task(next); - sched_rq_swap_expired_runnable(); - goto retry_next; + /* + * The task has been made inactive here. + * A suspended task affects timeslices whereas + * a sleeping task doesn't as it is believed + * sleepers would become runnable soon. + */ + prio_total -= current->priority; + BUG_ON(prio_total <= 0); + } else { + /* + * Top up task's ticks temporarily, and + * wait for it to release its locks. + */ + current->state = TASK_RUNNABLE; + current->ticks_left = max(current->ticks_left, + SCHED_GRANULARITY); + sched_rq_add_task(current, rq_runnable, RQ_ADD_FRONT); } - } else { - printk("Idle task.\n"); - while (1); } + /* Determine the next task to be run */ + if (rq_runnable->total > 0) { + next = list_entry(rq_runnable->task_list.next, + struct ktcb, rq_list); + } else { + if (rq_expired->total > 0) { + sched_rq_swap_runqueues(); + next = list_entry(rq_runnable->task_list.next, + struct ktcb, rq_list); + } else { + printk("Idle task.\n"); + while(1); + } + } + + /* Zero ticks indicates task hasn't ran since last rq swap */ + if (next->ticks_left == 0) { + + /* New tasks affect runqueue total priority. */ + if (next->flags & TASK_RESUMING) { + prio_total += next->priority; + next->flags &= ~TASK_RESUMING; + } + + /* + * Redistribute timeslice. We do this as each task + * becomes runnable rather than all at once. It's also + * done only upon a runqueue swap. + */ + sched_recalc_ticks(next, prio_total); + next->ticks_left = next->ticks_assigned; + } + + /* Reinitialise task's schedule granularity boundary */ + next->sched_granule = SCHED_GRANULARITY; + + /* Finish */ disable_irqs(); - sched_unlock(); + preempt_enable(); context_switch(next); } -void schedule(void) -{ - /* It's a royal bug to call schedule when preemption is disabled */ - BUG_ON(voluntary_preempt); - - if (need_resched) - scheduler(); -} - +/* + * Initialise pager as runnable for first-ever scheduling, + * and start the scheduler. 
+ */ void scheduler_start() { /* Initialise runqueues */ - sched_runqueue_init(); + sched_init_runqueues(); - /* Initialse inittask as runnable for first-ever scheduling */ - sched_init_task(current); + /* Initialise scheduler fields of pager */ + sched_init_task(current, TASK_PRIO_PAGER); + + /* Add task to runqueue first */ + sched_rq_add_task(current, rq_runnable, RQ_ADD_FRONT); + + /* Give it a kick-start tick and make runnable */ + current->ticks_left = 1; current->state = TASK_RUNNABLE; - sched_rq_add_task_front(current, rq_runnable); - /* Start the timer */ + /* Start the timer and switch */ timer_start(); switch_to_user(current); } diff --git a/src/generic/time.c b/src/generic/time.c index 6b387a2..617564e 100644 --- a/src/generic/time.c +++ b/src/generic/time.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include INC_ARCH(exception.h) #include @@ -54,11 +55,16 @@ void update_system_time(void) if (systime.reader) systime.reader = 0; - /* Increase just like jiffies, but reset every HZ */ + /* Increase just like jiffies, but reset every second */ systime.thz++; - /* On every HZ increase seconds */ - if (systime.thz == HZ) { + /* + * On every 1 second of timer ticks, increase seconds + * + * TODO: Investigate: how do we make sure timer_irq is + * called SCHED_TICKS times per second? + */ + if (systime.thz == SCHED_TICKS) { systime.thz = 0; systime.sec++; } @@ -79,7 +85,7 @@ int sys_time(syscall_context_t *args) while(retries > 0) { systime.reader = 1; tv->tv_sec = systime.sec; - tv->tv_usec = 1000000 * systime.thz / HZ; + tv->tv_usec = 1000000 * systime.thz / SCHED_TICKS; retries--; if (systime.reader) @@ -108,21 +114,37 @@ void update_process_times(void) { struct ktcb *cur = current; - BUG_ON(cur->ticks_left < 0); - if (cur->ticks_left == 0) { - need_resched = 1; - return; + /* + * Nested irqs and irqs during non-preemptive + * times could try to deduct ticks below zero. + * We ignore such states and return. + */ + if (in_nested_irq_context() || !preemptive()) + return; + else /* Otherwise its a bug. */ + BUG(); } + /* + * These are TASK_RUNNABLE times, i.e. 
exludes sleeps + * In the future we may use timestamps for accuracy + */ if (in_kernel()) cur->kernel_time++; else cur->user_time++; cur->ticks_left--; + cur->sched_granule--; + + /* Task has expired its timeslice */ if (!cur->ticks_left) need_resched = 1; + + /* Task has expired its schedule granularity */ + if (!cur->sched_granule) + need_resched = 1; } diff --git a/src/lib/memcache.c b/src/lib/memcache.c index 3d02959..3e1f8d3 100644 --- a/src/lib/memcache.c +++ b/src/lib/memcache.c @@ -8,6 +8,7 @@ #include #include INC_GLUE(memory.h) #include +#include /* Allocate, clear and return element */ void *mem_cache_zalloc(struct mem_cache *cache) @@ -21,8 +22,11 @@ void *mem_cache_zalloc(struct mem_cache *cache) void *mem_cache_alloc(struct mem_cache *cache) { int bit; + int err; + if (cache->free > 0) { - mutex_lock(&cache->mutex); + if ((err = mutex_lock(&cache->mutex)) < 0) + return PTR_ERR(err); /* Interruptible mutex */ cache->free--; if ((bit = find_and_set_first_free_bit(cache->bitmap, cache->total)) < 0) { @@ -64,7 +68,9 @@ int mem_cache_free(struct mem_cache *cache, void *addr) return err; } - mutex_lock(&cache->mutex); + if ((err = mutex_lock(&cache->mutex)) < 0) + return err; /* Interruptible mutex */ + /* Check free/occupied state */ if (check_and_clear_bit(cache->bitmap, bit) < 0) { printk("Error: Anomaly in cache occupied state:\n" diff --git a/src/lib/mutex.c b/src/lib/mutex.c index 7202edb..60881d8 100644 --- a/src/lib/mutex.c +++ b/src/lib/mutex.c @@ -6,6 +6,7 @@ #include #include #include +#include /* * Semaphore usage: @@ -17,6 +18,8 @@ * Consumer locks/consumes/unlocks data. */ +#if 0 +/* Update it */ /* * Semaphore *up* for multiple producers. If any consumer is waiting, wake them * up, otherwise, sleep. Effectively producers and consumers use the same @@ -48,10 +51,10 @@ void sem_up(struct mutex *mutex) INIT_LIST_HEAD(&wq.task_list); list_add_tail(&wq.task_list, &mutex->wq.task_list); mutex->sleepers++; - sched_notify_sleep(current); - need_resched = 1; + current->state = TASK_SLEEPING; printk("(%d) produced, now sleeping...\n", current->tid); spin_unlock(&mutex->slock); + schedule(); } } @@ -86,76 +89,91 @@ void sem_down(struct mutex *mutex) INIT_LIST_HEAD(&wq.task_list); list_add_tail(&wq.task_list, &mutex->wq.task_list); mutex->sleepers++; - sched_notify_sleep(current); - need_resched = 1; + current->state = TASK_SLEEPING; printk("(%d) Waiting to consume, now sleeping...\n", current->tid); spin_unlock(&mutex->slock); + schedule(); } } +#endif /* Non-blocking attempt to lock mutex */ int mutex_trylock(struct mutex *mutex) { int success; - spin_lock(&mutex->slock); - success = __mutex_lock(&mutex->lock); - spin_unlock(&mutex->slock); + spin_lock(&mutex->wqh.slock); + if ((success = __mutex_lock(&mutex->lock))) + current->nlocks++; + spin_unlock(&mutex->wqh.slock); return success; } -void mutex_lock(struct mutex *mutex) +int mutex_lock(struct mutex *mutex) { /* NOTE: * Everytime we're woken up we retry acquiring the mutex. It is * undeterministic as to how many retries will result in success. + * We may need to add priority-based locking. */ for (;;) { - spin_lock(&mutex->slock); + spin_lock(&mutex->wqh.slock); if (!__mutex_lock(&mutex->lock)) { /* Could not lock, sleep. 
*/ - DECLARE_WAITQUEUE(wq, current); - INIT_LIST_HEAD(&wq.task_list); - list_add_tail(&wq.task_list, &mutex->wq.task_list); - mutex->sleepers++; - sched_notify_sleep(current); + CREATE_WAITQUEUE_ON_STACK(wq, current); + task_set_wqh(current, &mutex->wqh, &wq); + list_add_tail(&wq.task_list, &mutex->wqh.task_list); + mutex->wqh.sleepers++; + current->state = TASK_SLEEPING; + spin_unlock(&mutex->wqh.slock); printk("(%d) sleeping...\n", current->tid); - spin_unlock(&mutex->slock); - } else + schedule(); + + /* Did we wake up normally or get interrupted */ + if (current->flags & TASK_INTERRUPTED) { + current->flags &= ~TASK_INTERRUPTED; + return -EINTR; + } + } else { + current->nlocks++; break; + } } - spin_unlock(&mutex->slock); + spin_unlock(&mutex->wqh.slock); + return 0; } void mutex_unlock(struct mutex *mutex) { - spin_lock(&mutex->slock); + spin_lock(&mutex->wqh.slock); __mutex_unlock(&mutex->lock); - BUG_ON(mutex->sleepers < 0); - if (mutex->sleepers > 0) { - struct waitqueue *wq; - struct ktcb *sleeper; + current->nlocks--; + BUG_ON(current->nlocks < 0); + BUG_ON(mutex->wqh.sleepers < 0); + if (mutex->wqh.sleepers > 0) { + struct waitqueue *wq = list_entry(mutex->wqh.task_list.next, + struct waitqueue, + task_list); + struct ktcb *sleeper = wq->task; - /* Each unlocker wakes one other sleeper in queue. */ - mutex->sleepers--; - BUG_ON(list_empty(&mutex->wq.task_list)); - list_for_each_entry(wq, &mutex->wq.task_list, task_list) { - list_del_init(&wq->task_list); - spin_unlock(&mutex->slock); - /* - * Here, someone else may get the lock, well before we - * wake up the sleeper that we *hope* would get it. This - * is fine as the sleeper would retry and re-sleep. BUT, - * this may potentially starve the sleeper causing - * non-determinisim. - */ - sleeper = wq->task; - printk("(%d) Waking up (%d)\n", current->tid, - sleeper->tid); - sched_resume_task(sleeper); - return; /* Don't iterate, wake only one task. */ - } + task_unset_wqh(sleeper); + BUG_ON(list_empty(&mutex->wqh.task_list)); + list_del_init(&wq->task_list); + mutex->wqh.sleepers--; + sleeper->state = TASK_RUNNABLE; + spin_unlock(&mutex->wqh.slock); + + /* + * TODO: + * Here someone could grab the mutex, this is fine + * but it may potentially starve the sleeper causing + * non-determinism. We may consider priorities here. + */ + sched_resume_sync(sleeper); + + /* Don't iterate, wake only one task. */ + return; } - spin_unlock(&mutex->slock); + spin_unlock(&mutex->wqh.slock); } diff --git a/src/lib/wait.c b/src/lib/wait.c index 6e52ece..30a1b3e 100644 --- a/src/lib/wait.c +++ b/src/lib/wait.c @@ -1,45 +1,117 @@ /* * Implementation of wakeup/wait for processes. * - * Copyright (C) 2007 Bahadir Balban + * Copyright (C) 2007, 2008 Bahadir Balban */ #include #include #include +#include -/* Sleep if the given condition isn't true. */ -#define wait_event(wqh, condition) \ +/* + * This sets any wait details of a task so that any arbitrary + * wakers can know where the task is sleeping. + */ +void task_set_wqh(struct ktcb *task, struct waitqueue_head *wqh, + struct waitqueue *wq) +{ + spin_lock(&task->waitlock); + task->waiting_on = wqh; + task->wq = wq; + spin_unlock(&task->waitlock); +} + + +/* + * This clears all wait details of a task. Used as the + * task is removed from its queue and is about to wake up. + */ +void task_unset_wqh(struct ktcb *task) +{ + spin_lock(&task->waitlock); + task->waiting_on = 0; + task->wq = 0; + spin_unlock(&task->waitlock); + +} + +/* + * Sleep if the given condition isn't true. 
* ret will tell whether condition was met + * or we got interrupted. + */ +#define WAIT_EVENT(wqh, condition, ret) \ do { \ + ret = 0; \ for (;;) { \ if (condition) \ break; \ - DECLARE_WAITQUEUE(wq, current); \ + CREATE_WAITQUEUE_ON_STACK(wq, current); \ spin_lock(&wqh->slock); \ + task_set_wqh(current, wqh, &wq); \ wqh->sleepers++; \ list_add_tail(&wq.task_list, &wqh->task_list); \ - sched_tell(current, SCHED_FL_SLEEP); \ - need_resched = 1; \ + current->state = TASK_SLEEPING; \ printk("(%d) waiting...\n", current->tid); \ spin_unlock(&wqh->slock); \ + schedule(); \ + /* Did we wake up normally or get interrupted */\ + if (current->flags & TASK_INTERRUPTED) { \ + current->flags &= ~TASK_INTERRUPTED; \ + ret = -EINTR; \ + break; \ + } \ } \ } while(0); /* Sleep without any condition */ -#define wait_on(wqh) \ +#define WAIT_ON(wqh, ret) \ do { \ - DECLARE_WAITQUEUE(wq, current); \ + CREATE_WAITQUEUE_ON_STACK(wq, current); \ spin_lock(&wqh->slock); \ + task_set_wqh(current, wqh, &wq); \ wqh->sleepers++; \ list_add_tail(&wq.task_list, &wqh->task_list); \ - sched_tell(current, SCHED_FL_SLEEP); \ - need_resched = 1; \ - printk("(%d) waiting...\n", current->tid); \ + current->state = TASK_SLEEPING; \ + printk("(%d) waiting on wqh at: 0x%p\n", \ + current->tid, wqh); \ spin_unlock(&wqh->slock); \ + schedule(); \ + \ + /* Did we wake up normally or get interrupted */\ + if (current->flags & TASK_INTERRUPTED) { \ + current->flags &= ~TASK_INTERRUPTED; \ + ret = -EINTR; \ + } else \ + ret = 0; \ } while(0); -/* FIXME: Wake up should take the task as an argument, rather than the queue */ +/* Sleep without any condition */ +int wait_on(struct waitqueue_head *wqh) +{ + CREATE_WAITQUEUE_ON_STACK(wq, current); + spin_lock(&wqh->slock); + task_set_wqh(current, wqh, &wq); + wqh->sleepers++; + list_add_tail(&wq.task_list, &wqh->task_list); + current->state = TASK_SLEEPING; + printk("(%d) waiting on wqh at: 0x%p\n", + current->tid, wqh); + spin_unlock(&wqh->slock); + schedule(); + + /* Did we wake up normally or get interrupted */ + if (current->flags & TASK_INTERRUPTED) { + current->flags &= ~TASK_INTERRUPTED; + return -EINTR; + } + + return 0; +} + + /* Wake up single waiter */ -void wake_up(struct waitqueue_head *wqh) +void wake_up(struct waitqueue_head *wqh, int sync) { BUG_ON(wqh->sleepers < 0); spin_lock(&wqh->slock); @@ -48,14 +120,82 @@ void wake_up(struct waitqueue_head *wqh) struct waitqueue, task_list); struct ktcb *sleeper = wq->task; + task_unset_wqh(sleeper); + BUG_ON(list_empty(&wqh->task_list)); list_del_init(&wq->task_list); wqh->sleepers--; - BUG_ON(list_empty(&wqh->task_list)); + sleeper->state = TASK_RUNNABLE; printk("(%d) Waking up (%d)\n", current->tid, sleeper->tid); - sched_notify_resume(sleeper); spin_unlock(&wqh->slock); + + if (sync) + sched_resume_sync(sleeper); + else + sched_resume_async(sleeper); return; } spin_unlock(&wqh->slock); } +/* + * Wakes up a task. If task is not waiting, or has been woken up + * as we were peeking on it, returns -1. @sync makes us immediately + * yield or else leave it to scheduler's discretion. + */ +int wake_up_task(struct ktcb *task, int sync) +{ + struct waitqueue_head *wqh; + struct waitqueue *wq; + + spin_lock(&task->waitlock); + if (!task->waiting_on) { + spin_unlock(&task->waitlock); + return -1; + } + + /* + * We have found the waitqueue head. + * That needs to be locked first to conform with + * lock order and avoid deadlocks. Release task's + * waitlock and take the wqh's one. 
+ */ + wqh = task->waiting_on; + wq = task->wq; + spin_unlock(&task->waitlock); + + /* -- Task can be woken up by someone else here -- */ + + spin_lock(&wqh->slock); + + /* + * Now lets check if the task is still + * waiting and in the same queue + */ + spin_lock(&task->waitlock); + if (task->waiting_on != wqh) { + /* No, task has been woken by someone else */ + spin_unlock(&wqh->slock); + spin_unlock(&task->waitlock); + return -1; + } + + /* Now we can remove the task from its waitqueue */ + list_del_init(&wq->task_list); + wqh->sleepers--; + task->waiting_on = 0; + task->wq = 0; + task->state = TASK_RUNNABLE; + spin_unlock(&wqh->slock); + spin_unlock(&task->waitlock); + + /* Removed from waitqueue, we can now safely resume task */ + if (sync) + sched_resume_sync(task); + else + sched_resume_async(task); + + return 0; +} + + + diff --git a/tasks/mm0/src/file.c b/tasks/mm0/src/file.c index 8fe63bf..13ee4de 100644 --- a/tasks/mm0/src/file.c +++ b/tasks/mm0/src/file.c @@ -367,7 +367,6 @@ int flush_file_pages(struct vm_file *f) /* Given a task and fd, syncs all IO on it */ int fsync_common(struct tcb *task, int fd) { - struct vm_file *f; int err; /* Check fd validity */ diff --git a/tasks/test0/src/forktest.c b/tasks/test0/src/forktest.c index c7fe3f8..0edd217 100644 --- a/tasks/test0/src/forktest.c +++ b/tasks/test0/src/forktest.c @@ -29,8 +29,8 @@ int forktest(void) } /* Print only when failed, otherwise too many pass messages */ - // printf("PID: %d, my global: %d\n", myid, global); - // printf("-- PASSED --\n"); + printf("PID: %d, my global: %d\n", myid, global); + printf("-- PASSED --\n"); out: while(1) ;
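For reference, the timeslice arithmetic introduced in src/generic/scheduler.c above (sched_recalc_ticks) is a plain proportional share: a runnable task gets SCHED_TICKS scaled by the ratio of its priority to the total priority of all runnable tasks. Below is a minimal standalone sketch of that calculation; it is not part of the patch, and the recalc_ticks() helper, the main() harness and the chosen task mix are illustrative only.

#include <stdio.h>

/* Constants as defined in include/l4/generic/scheduler.h above */
#define SCHED_TICKS		100
#define TASK_PRIO_PAGER		8
#define TASK_PRIO_NORMAL	4

/* Mirrors sched_recalc_ticks(): share = SCHED_TICKS * priority / prio_total */
static int recalc_ticks(int priority, int prio_total)
{
	return SCHED_TICKS * priority / prio_total;
}

int main(void)
{
	/* Hypothetical runnable mix: one pager and two normal-priority tasks */
	int prio_total = TASK_PRIO_PAGER + 2 * TASK_PRIO_NORMAL;	/* 16 */

	printf("pager gets  %d ticks per second\n",
	       recalc_ticks(TASK_PRIO_PAGER, prio_total));		/* 50 */
	printf("normal gets %d ticks per second\n",
	       recalc_ticks(TASK_PRIO_NORMAL, prio_total));		/* 25 */
	return 0;
}

With SCHED_GRANULARITY at SCHED_TICKS/10, the pager in this example would still be rescheduled every 10 ticks even though its full timeslice is 50; the remaining ticks are consumed across later runs until both runqueues drain and are swapped.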