/*
* Page fault handling.
*
* Copyright (C) 2007, 2008-2010 Bahadir Bilgehan Balban
*/
#include <vm_area.h>
#include <task.h>
#include <mem/alloc_page.h>
#include <mem/malloc.h>
#include <l4/generic/space.h>
#include <l4/api/errno.h>
#include <string.h>
#include <memory.h>
#include <shm.h>
#include <file.h>
#include <test.h>
#include L4LIB_INC_ARCH(syscalls.h)
#include L4LIB_INC_ARCH(syslib.h)
#include INC_GLUE(memory.h)
#include INC_SUBARCH(mm.h)
#include __INC_ARCH(mm.h)
#include __INC_ARCH(debug.h)
/* Given a page and the vma it is in, returns that page's virtual address */
unsigned long vma_page_to_virtual(struct vm_area *vma, struct page *page)
{
unsigned long virtual_pfn = vma->pfn_start + page->offset - vma->file_offset;
/* Page must be contained in vma's pages */
BUG_ON(vma->file_offset > page->offset);
return __pfn_to_addr(virtual_pfn);
}
unsigned long fault_to_file_offset(struct fault_data *fault)
{
/* Fault's offset in its vma */
unsigned long vma_off_pfn = __pfn(fault->address) - fault->vma->pfn_start;
/* Fault's offset in the file */
unsigned long f_off_pfn = fault->vma->file_offset + vma_off_pfn;
return f_off_pfn;
}
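/*
 * Worked example (illustrative numbers only, assuming 4K pages):
 * with vma->pfn_start = 0x100, vma->file_offset = 0x10 and a fault
 * at address 0x102000, __pfn(0x102000) = 0x102, so the offset into
 * the vma is 0x102 - 0x100 = 2 pages, and the returned file offset
 * is 0x10 + 2 = 0x12 pages into the mapped file.
 */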
/*
* Given a reference to a vm_object link, returns the next link but
* avoids wrapping around back to head. If next is head, returns 0.
*
* vma->link1->link2->link3
* | | |
* V V V
* vmo1 vmo2 vmo3|vm_file
*
* Example:
* Given a reference to link = vma, head = vma, returns link1.
* Given a reference to link = link3, head = vma, returns 0.
*/
struct vm_obj_link *vma_next_link(struct link *link,
struct link *head)
{
BUG_ON(list_empty(link));
if (link->next == head)
return 0;
else
return link_to_struct(link->next, struct vm_obj_link, list);
}
/* Unlinks the given link from its vma and deletes it, but keeps the object. */
struct vm_object *vma_drop_link(struct vm_obj_link *link)
{
struct vm_object *dropped;
/* Remove object link from vma's list */
list_remove(&link->list);
/* Unlink the link from object */
dropped = vm_unlink_object(link);
/* Delete the original link */
kfree(link);
return dropped;
}
/*
 * Checks whether the page cache pages of the original (the "lesser")
 * object are a subset of those of the shadow (the "copier").
*
* FIXME:
* Note this just checks the page cache, so if any objects have pages
* swapped to disk, this function won't work, which is a logic error.
* This should really count the swapped ones as well.
*/
int vm_object_is_subset(struct vm_object *shadow,
struct vm_object *original)
{
struct page *pc, *pl;
/* Copier must have equal or more pages to overlap lesser */
if (shadow->npages < original->npages)
return 0;
/*
* Do a page by page comparison. Every lesser page
* must be in copier for overlap.
*/
list_foreach_struct(pl, &original->page_cache, list)
if (!(pc = find_page(shadow, pl->offset)))
return 0;
/*
	 * Every page of the lesser vmo has a counterpart in the
	 * copier vmo, so the lesser is a subset of the copier.
*/
return 1;
}
static inline int vm_object_is_droppable(struct vm_object *shadow,
struct vm_object *original)
{
if (shadow->npages == original->npages &&
(original->flags & VM_OBJ_SHADOW))
return 1;
else
return 0;
}
/*
* vma_merge_object()
*
* FIXME: Currently this is an optimisation that needs to go
* away when swapping is available. We have this solely because
* currently a shadow needs to identically mirror the whole
 * object underneath it in order to drop it. A 1MB file would thus
 * consume up to 2MB of memory until dropped. When swapping is available,
* we will go back to identical mirroring instead of merging the
* last shadow, since most unused pages would be swapped out.
*/
/*
 * Merges a redundant shadow object into the shadow in front of it.
 * The caller must have already determined that the shadow is redundant.
*
* vma --> link1 --> link2 --> link3
* | | |
* v v v
* Front Redundant Next
* Shadow Shadow Object (E.g. shadow or file)
*/
int vma_merge_object(struct vm_object *redundant)
{
/* The redundant shadow object */
struct vm_object *front; /* Shadow in front of redundant */
struct vm_obj_link *last_link;
struct page *p1, *p2, *n;
	/* Check that the link and shadow counts are really 1 */
BUG_ON(redundant->nlinks != 1);
BUG_ON(redundant->shadows != 1);
/* Get the last shadower object in front */
front = link_to_struct(redundant->shdw_list.next,
struct vm_object, shref);
/* Move all non-intersecting pages to front shadow. */
list_foreach_removable_struct(p1, n, &redundant->page_cache, list) {
/* Page doesn't exist in front, move it there */
if (!(p2 = find_page(front, p1->offset))) {
list_remove_init(&p1->list);
spin_lock(&p1->lock);
p1->owner = front;
spin_unlock(&p1->lock);
insert_page_olist(p1, front);
front->npages++;
}
}
/* Sort out shadow relationships after the merge: */
/* Front won't be a shadow of the redundant shadow anymore */
list_remove_init(&front->shref);
/* Check that there really was one shadower of redundant left */
BUG_ON(!list_empty(&redundant->shdw_list));
/* Redundant won't be a shadow of its next object */
list_remove_init(&redundant->shref);
/* Front is now a shadow of redundant's next object */
list_insert(&front->shref, &redundant->orig_obj->shdw_list);
front->orig_obj = redundant->orig_obj;
/* Find last link for the object */
last_link = link_to_struct(redundant->link_list.next,
struct vm_obj_link, linkref);
/* Drop the last link to the object */
vma_drop_link(last_link);
/* Redundant shadow has no shadows anymore */
BUG_ON(--redundant->shadows < 0);
/* Delete the redundant shadow along with all its pages. */
vm_object_delete(redundant);
return 0;
}
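/*
 * Illustrative picture of the state after vma_merge_object(), following
 * the diagram above (link names are only for the example):
 *
 * vma --> link1 -----------> link3
 *           |                  |
 *           v                  v
 *         Front               Next
 *         Shadow              Object
 *
 * Front has absorbed every page of the redundant shadow that it did not
 * already hold, now shadows Next directly, and the redundant shadow has
 * been deleted together with its remaining (duplicate) pages.
 */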
struct vm_obj_link *vm_objlink_create(void)
{
struct vm_obj_link *vmo_link;
if (!(vmo_link = kzalloc(sizeof(*vmo_link))))
return PTR_ERR(-ENOMEM);
link_init(&vmo_link->list);
link_init(&vmo_link->linkref);
return vmo_link;
}
/*
* Creates a bare vm_object along with its vma link, since
* the shadow will be immediately used in a vma object list.
*/
struct vm_obj_link *vma_create_shadow(void)
{
struct vm_object *vmo;
struct vm_obj_link *vmo_link;
if (IS_ERR(vmo_link = vm_objlink_create()))
return 0;
if (!(vmo = vm_object_create())) {
kfree(vmo_link);
return 0;
}
vmo->flags = VM_OBJ_SHADOW;
vm_link_object(vmo_link, vmo);
return vmo_link;
}
/* Allocates a new page, copies the original into it and returns the new page. */
struct page *copy_to_new_page(struct page *orig)
{
void *paddr = alloc_page(1);
BUG_ON(!paddr);
/* Copy the page into new page */
memcpy(phys_to_virt(paddr), page_to_virt(orig), PAGE_SIZE);
return phys_to_page(paddr);
}
/* Copy the whole stack of mapped object links from vma to new_vma */
int vma_copy_links(struct vm_area *new_vma, struct vm_area *vma)
{
struct vm_obj_link *vmo_link, *new_link;
/* Get the first object on the vma */
BUG_ON(list_empty(&vma->vm_obj_list));
vmo_link = link_to_struct(vma->vm_obj_list.next,
struct vm_obj_link, list);
do {
/* Create a new link */
new_link = vm_objlink_create();
/* Link object with new link */
vm_link_object(new_link, vmo_link->obj);
/* Add the new link to vma in object order */
list_insert_tail(&new_link->list, &new_vma->vm_obj_list);
/* Continue traversing links, doing the same copying */
} while((vmo_link = vma_next_link(&vmo_link->list,
&vma->vm_obj_list)));
return 0;
}
/*
* Determine if an object is deletable.
*
* Shadows are deleted if nlinks = 0, and
* merged if they have nlinks = 1, shadows = 1.
* See below for explanation.
*
* vfs-type vmfiles are deleted if their
* openers = 0, and their nlinks
* (i.e. mappers) = 0.
*
* shm-type vmfiles are deleted if their
* nlinks = 0, since they only have map count.
*/
int vm_object_is_deletable(struct vm_object *obj)
{
struct vm_file *f;
//printf("%s: Checking: ", __FUNCTION__);
//vm_object_print(obj);
if (obj->nlinks != 0)
return 0;
BUG_ON(obj->shadows != 0);
BUG_ON(!list_empty(&obj->shref));
if (obj->flags & VM_OBJ_SHADOW)
return 1;
f = vm_object_to_file(obj);
/* Devzero should probably never have 0 refs left */
if (f->type == VM_FILE_DEVZERO)
return 0;
else if (f->type == VM_FILE_SHM)
return 1;
else if (f->type == VM_FILE_VFS) {
if (f->openers == 0)
return 1;
else
return 0;
}
/* To make gcc happy */
BUG();
return 0;
}
/*
 * An exit-time drop has: !prev, and next may or may not exist.
 * A shadow drop (collapse) has: both prev and next.
*/
/*
 * Shadow drops: Dropping a link to a shadow does not mean the shadow's
 * next object has lost a shadow; there may be other links to both. But
 * when the shadow has dropped its last link and is about to be deleted,
 * it is then true that the next object has lost a shadow.
*/
int vma_drop_merge_delete(struct vm_area *vma, struct vm_obj_link *link)
{
struct vm_obj_link *prev, *next;
struct vm_object *obj;
/* Get previous and next links, if they exist */
prev = (link->list.prev == &vma->vm_obj_list) ? 0 :
link_to_struct(link->list.prev, struct vm_obj_link, list);
next = (link->list.next == &vma->vm_obj_list) ? 0 :
link_to_struct(link->list.next, struct vm_obj_link, list);
/* Drop the link */
obj = vma_drop_link(link);
/* If there is an object in front, this is a shadow drop */
if (prev) {
BUG_ON(!(prev->obj->flags & VM_OBJ_SHADOW));
BUG_ON(!(prev->obj->flags & VM_WRITE));
BUG_ON(--obj->shadows < 0);
// vm_object_print(obj);
/* Remove prev from current object's shadow list */
BUG_ON(list_empty(&prev->obj->shref));
list_remove_init(&prev->obj->shref);
/*
* We don't allow dropping non-shadow objects yet,
* (see ...is_droppable) so there must be a next.
*/
BUG_ON(!next);
/* prev is now shadow of next */
list_insert(&prev->obj->shref,
&next->obj->shdw_list);
prev->obj->orig_obj = next->obj;
/*
* No referrers left, meaning this object is not
* shadowing its original object anymore.
*/
if (obj->nlinks == 0) {
BUG_ON(obj->orig_obj != next->obj);
list_remove_init(&obj->shref);
} else {
/*
* Dropped object still has referrers, which
* means next has gained a new shadow.
* Here's why:
*
* T1 and T2: T2: drop-
* prev->drop->next \
* became: T1: prev--- next
*
* Now we have both prev and current object
* in next's shadow list.
*/
next->obj->shadows++;
}
	/* It's an exit; check whether there's a shadow loss */
} else {
if (obj->nlinks == 0) {
/* Is it a shadow delete? Sort out next */
if (next && obj->flags & VM_OBJ_SHADOW) {
BUG_ON(obj->orig_obj != next->obj);
BUG_ON(--next->obj->shadows < 0);
// vm_object_print(next->obj);
list_remove_init(&obj->shref);
}
}
}
/* Now deal with the object itself */
if (vm_object_is_deletable(obj)) {
dprintf("Deleting object:\n");
// vm_object_print(obj);
vm_object_delete(obj);
} else if ((obj->flags & VM_OBJ_SHADOW) &&
obj->nlinks == 1 && obj->shadows == 1) {
dprintf("Merging object:\n");
// vm_object_print(obj);
vma_merge_object(obj);
}
mm0_test_global_vm_integrity();
return 0;
}
/*
* A scenario that pretty much covers every exit() case.
*
* T = vma on a unique task
* l = link
* Sobj = Shadow object
* Fobj = File object
*
* Every l links to the object on the nearest
* row to it and on the same column.
*
* l l l l l l T
* Sobj Sobj
*
* Sobj Sobj Sobj Fobj
*
* Sobj Sobj Sobj
* l l l l l l l T
*
* l l l l l l l T
* Sobj
*
*/
/* This version is used when exiting. */
int vma_drop_merge_delete_all(struct vm_area *vma)
{
struct vm_obj_link *vmo_link, *n;
/* Vma cannot be empty */
BUG_ON(list_empty(&vma->vm_obj_list));
/* Traverse and get rid of all links */
list_foreach_removable_struct(vmo_link, n, &vma->vm_obj_list, list)
vma_drop_merge_delete(vma, vmo_link);
return 0;
}
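/*
 * Illustrative sketch, not part of the pager: a task-exit style teardown
 * would typically walk the task's vma list, tear down each vma's object
 * chain with vma_drop_merge_delete_all() and then free the vma itself.
 * The function name below is hypothetical, the vma list iteration mirrors
 * vm_freeze_shadows() further down, and unmapping of the task's pages is
 * left out entirely.
 */
#if 0
int task_release_vmas_sketch(struct tcb *task)
{
	struct vm_area *vma, *n;

	list_foreach_removable_struct(vma, n, &task->vm_area_head->list, list) {
		/* Drop, merge or delete every object linked into this vma */
		vma_drop_merge_delete_all(vma);

		/* Unlink and free the vma descriptor itself */
		list_remove(&vma->list);
		kfree(vma);
	}

	return 0;
}
#endif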
/* TODO:
* - Why not allocate a swap descriptor in vma_create_shadow() rather than
* a bare vm_object? It will be needed.
 * - Check refcounting of shadows, their references, page refs,
 *   decrements/increments, etc.
 *
 * This handles copy-on-write semantics in various situations. Returns
 * the page struct of the copied page, available for mapping.
*
* 1) Copy-on-write of read-only files. (Creates r/w shadows/adds pages)
* 2) Copy-on-write of forked RO shadows (Creates r/w shadows/adds pages)
* 3) Copy-on-write of shm files. (Adds pages to r/w shm file from devzero).
*/
struct page *copy_on_write(struct fault_data *fault)
{
struct vm_obj_link *vmo_link, *shadow_link;
struct vm_object *shadow;
struct page *page, *new_page;
struct vm_area *vma = fault->vma;
unsigned long file_offset = fault_to_file_offset(fault);
/* Get the first object, either original file or a shadow */
if (!(vmo_link = vma_next_link(&vma->vm_obj_list, &vma->vm_obj_list))) {
printf("%s:%s: No vm object in vma!\n",
__TASKNAME__, __FUNCTION__);
BUG();
}
/* Is the object read-only? Create a shadow object if so.
*
* NOTE: Whenever the topmost object is read-only, a new shadow
	 * object must be created. When there are no shadows, one is created
	 * because it is the original vm_object that is not writeable; when
	 * there are shadows, one is created because a fork has just
	 * happened, in which case all shadows are rendered read-only.
*/
if (!(vmo_link->obj->flags & VM_WRITE)) {
if (!(shadow_link = vma_create_shadow()))
return PTR_ERR(-ENOMEM);
/* Initialise the shadow */
shadow = shadow_link->obj;
shadow->orig_obj = vmo_link->obj;
shadow->flags = VM_OBJ_SHADOW | VM_WRITE;
shadow->pager = &swap_pager;
vmo_link->obj->shadows++;
// vm_object_print(vmo_link->obj);
dprintf("%s: Created a shadow:\n", __TASKNAME__);
// vm_object_print(shadow);
dprintf("%s: Original object:\n", __TASKNAME__);
// vm_object_print(shadow->orig_obj);
/*
* Add the shadow in front of the original:
*
* vma->link0->link1
* | |
* v v
* shadow original
*/
list_insert(&shadow_link->list, &vma->vm_obj_list);
/* Add object to original's shadower list */
list_insert(&shadow->shref, &shadow->orig_obj->shdw_list);
/* Add to global object list */
global_add_vm_object(shadow);
} else {
		/* We ought to copy the missing RW page into the top shadow */
dprintf("No new shadows. Going to add to "
"topmost r/w shadow object\n");
shadow_link = vmo_link;
/*
* FIXME: Here we check for the case that a cloned thread is
* doing a duplicate write request on an existing RW shadow
* page. If so, we return the existing writable page in the top
* shadow. We should find a generic way to detect duplicate
* requests and cease IPC at an earlier stage.
*/
page = shadow_link->obj->pager->ops.page_in(shadow_link->obj,
file_offset);
if (!IS_ERR(page))
return page;
/*
* We start page search on read-only objects. If the first
* one was writable, go to next which must be read-only.
*/
BUG_ON(!(vmo_link = vma_next_link(&vmo_link->list,
&vma->vm_obj_list)));
BUG_ON(vmo_link->obj->flags & VM_WRITE);
}
/* Traverse the list of read-only vm objects and search for the page */
while (IS_ERR(page = vmo_link->obj->pager->ops.page_in(vmo_link->obj,
file_offset))) {
if (!(vmo_link = vma_next_link(&vmo_link->list,
&vma->vm_obj_list))) {
printf("%s:%s: Traversed all shadows and the original "
"file's vm_object, but could not find the "
"faulty page in this vma.\n",__TASKNAME__,
__FUNCTION__);
BUG();
}
}
/*
* Copy the page. This traverse and copy is like a page-in operation
* of a pager, except that the page is moving along vm_objects.
*/
new_page = copy_to_new_page(page);
/* Update page details */
spin_lock(&new_page->lock);
BUG_ON(!list_empty(&new_page->list));
new_page->refcnt = 0;
new_page->owner = shadow_link->obj;
new_page->offset = file_offset;
new_page->virtual = 0;
	spin_unlock(&new_page->lock);
/* Add the page to owner's list of in-memory pages */
insert_page_olist(new_page, new_page->owner);
new_page->owner->npages++;
mm0_test_global_vm_integrity();
	/* Shared vmas don't have shadows, so we don't look for collapses */
if (!(vma->flags & VMA_SHARED)) {
/*
* Finished handling the actual fault, now check for possible
* shadow collapses. Does the shadow completely shadow the one
* underlying it?
*/
if (!(vmo_link = vma_next_link(&shadow_link->list,
&vma->vm_obj_list))) {
/* Copier must have an object under it */
printf("Copier must have had an object under it!\n");
BUG();
}
if (vm_object_is_droppable(shadow_link->obj, vmo_link->obj))
vma_drop_merge_delete(vma, vmo_link);
}
return new_page;
}
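/*
 * Object chain before and after the first write fault on a private,
 * read-only-backed vma, as handled above (illustrative):
 *
 * before:  vma --> link0              after:  vma --> link0 --> link1
 *                    |                                  |         |
 *                    v                                  v         v
 *                  file (RO)                      shadow (RW)   file (RO)
 *                                                 [copied page]
 *
 * The new shadow is inserted in front of the original object, added to
 * the original's shdw_list (bumping its shadow count), and the copied
 * page is inserted into the shadow's page cache at the faulting file
 * offset.
 */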
/*
* Handles the page fault, all entries here are assumed *legal*
* faults, i.e. do_page_fault() should have already checked
* for illegal accesses.
*
* NOTE:
* Anon/Shared pages:
 * First access from the first process is COW. All subsequent RW
 * accesses (which are attempts at *sharing*) simply map that
 * page into the faulting processes.
*
* Non-anon/shared pages:
* First access from first process simply writes to the pages
* of that file. All subsequent accesses by other processes
* do so as well.
*
* FIXME: Add VM_DIRTY bit for every page that has write-faulted.
*/
/* Handle read faults */
struct page *page_read_fault(struct fault_data *fault)
{
struct vm_area *vma = fault->vma;
struct vm_obj_link *vmo_link;
unsigned long file_offset;
struct page *page = 0;
file_offset = fault_to_file_offset(fault);
/* Get the first object, either original file or a shadow */
if (!(vmo_link = vma_next_link(&vma->vm_obj_list, &vma->vm_obj_list))) {
printf("%s:%s: No vm object in vma!\n",
__TASKNAME__, __FUNCTION__);
BUG();
}
/* Traverse the list of read-only vm objects and search for the page */
while (IS_ERR(page = vmo_link->obj->pager->ops.page_in(vmo_link->obj,
file_offset))) {
if (!(vmo_link = vma_next_link(&vmo_link->list,
&vma->vm_obj_list))) {
printf("%s:%s: Traversed all shadows and the original "
"file's vm_object, but could not find the "
"faulty page in this vma.\n",__TASKNAME__,
__FUNCTION__);
BUG();
}
}
BUG_ON(!page);
return page;
}
struct page *page_write_fault(struct fault_data *fault)
{
unsigned int vma_flags = fault->vma->flags;
struct vm_area *vma = fault->vma;
struct vm_obj_link *vmo_link;
unsigned long file_offset;
struct page *page = 0;
/* Copy-on-write. All private vmas are always COW */
if (vma_flags & VMA_PRIVATE) {
BUG_ON(IS_ERR(page = copy_on_write(fault)));
/*
* This handles shared pages that are both anon and non-anon.
*/
} else if ((vma_flags & VMA_SHARED)) {
file_offset = fault_to_file_offset(fault);
/* Don't traverse, just take the first object */
BUG_ON(!(vmo_link = vma_next_link(&vma->vm_obj_list,
&vma->vm_obj_list)));
/* Get the page from its pager */
if (IS_ERR(page = vmo_link->obj->pager->ops.page_in(vmo_link->obj,
file_offset))) {
/*
			 * A writable page does not exist. If the vma
			 * is anonymous, the page needs to be COW'ed;
			 * otherwise the file should have paged this
			 * page in already, so it's a bug.
*/
if (vma_flags & VMA_ANONYMOUS) {
BUG_ON(IS_ERR(page = copy_on_write(fault)));
return page;
} else {
printf("%s: Could not obtain faulty "
"page from regular file.\n",
__TASKNAME__);
BUG();
}
}
/*
* Page and object are now dirty. Currently it's
* only relevant for file-backed shared objects.
*/
page->flags |= VM_DIRTY;
page->owner->flags |= VM_DIRTY;
} else
BUG();
return page;
}
struct page *__do_page_fault(struct fault_data *fault)
{
unsigned int reason = fault->reason;
unsigned int pte_flags = fault->pte_flags;
unsigned int map_flags = 0;
struct page *page = 0;
if ((reason & VM_READ) && (pte_flags & VM_NONE)) {
page = page_read_fault(fault);
map_flags = MAP_USR_RO;
} else if ((reason & VM_WRITE) && (pte_flags & VM_NONE)) {
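		/* Page the backing data in first, then take the write path (e.g. COW) */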
page = page_read_fault(fault);
page = page_write_fault(fault);
map_flags = MAP_USR_RW;
} else if ((reason & VM_EXEC) && (pte_flags & VM_NONE)) {
page = page_read_fault(fault);
map_flags = MAP_USR_RX;
} else if ((reason & VM_EXEC) && (pte_flags & VM_READ)) {
		/* Retrieve the already paged-in page */
page = page_read_fault(fault);
if (pte_flags & VM_WRITE)
map_flags = MAP_USR_RWX;
else
map_flags = MAP_USR_RX;
} else if ((reason & VM_WRITE) && (pte_flags & VM_READ)) {
page = page_write_fault(fault);
if (pte_flags & VM_EXEC)
map_flags = MAP_USR_RWX;
else
map_flags = MAP_USR_RW;
} else {
printf("mm0: Unhandled page fault.\n");
BUG();
}
BUG_ON(!page);
/* Map the new page to faulty task */
l4_map((void *)page_to_phys(page),
(void *)page_align(fault->address), 1,
map_flags, fault->task->tid);
// vm_object_print(page->owner);
return page;
}
/*
* Sets all r/w shadow objects as read-only for the process
* so that as expected after a fork() operation, writes to those
* objects cause copy-on-write events.
*/
int vm_freeze_shadows(struct tcb *task)
{
unsigned long virtual;
struct vm_area *vma;
struct vm_obj_link *vmo_link;
struct vm_object *vmo;
struct page *p;
list_foreach_struct(vma, &task->vm_area_head->list, list) {
/* Shared vmas don't have shadows */
if (vma->flags & VMA_SHARED)
continue;
/* Get the first object */
BUG_ON(list_empty(&vma->vm_obj_list));
vmo_link = link_to_struct(vma->vm_obj_list.next,
struct vm_obj_link, list);
vmo = vmo_link->obj;
/*
* Is this a writeable shadow?
*
* The only R/W shadow in a vma object chain
		 * can be the first one, so we don't check further
		 * objects if the first one is not what we want.
*/
if (!((vmo->flags & VM_OBJ_SHADOW) &&
(vmo->flags & VM_WRITE)))
continue;
/* Make the object read only */
vmo->flags &= ~VM_WRITE;
vmo->flags |= VM_READ;
/*
* Make all pages on it read-only
* in the page tables.
*/
list_foreach_struct(p, &vmo->page_cache, list) {
/* Find virtual address of each page */
virtual = vma_page_to_virtual(vma, p);
/* Map the page as read-only */
l4_map((void *)page_to_phys(p),
(void *)virtual, 1,
MAP_USR_RO, task->tid);
}
}
return 0;
}
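/*
 * Illustrative sketch, the real fork path lives outside this file: a
 * fork-style address space duplication would typically copy each parent
 * vma's object chain into a corresponding child vma with vma_copy_links()
 * and then call vm_freeze_shadows() on the parent, so that later writes
 * from either task take the copy_on_write() path. The function name and
 * the copy_vma_descriptor() helper below are hypothetical.
 */
#if 0
int task_copy_vmas_sketch(struct tcb *parent, struct tcb *child)
{
	struct vm_area *vma, *new_vma;

	list_foreach_struct(vma, &parent->vm_area_head->list, list) {
		/* Hypothetical helper that duplicates the vm_area fields
		 * and initialises an empty vm_obj_list on the copy */
		if (!(new_vma = copy_vma_descriptor(vma)))
			return -ENOMEM;

		/* Share the parent's object chain with the child's vma */
		vma_copy_links(new_vma, vma);
		list_insert_tail(&new_vma->list, &child->vm_area_head->list);
	}

	/* Parent's r/w shadows become read-only; writes now COW */
	return vm_freeze_shadows(parent);
}
#endif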
/*
* Page fault model:
*
* A page is anonymous (e.g. stack)
* - page needs read access:
* action: map the zero page.
* - page needs write access:
* action: allocate ZI page and map that. Swap file owns the page.
* - page is swapped to swap:
* action: read back from swap file into new page.
*
* A page is file-backed but private (e.g. .data section)
* - page needs read access:
* action: read the page from its file.
* - page is swapped out before being private. (i.e. invalidated)
* action: read the page from its file. (original file)
* - page is swapped out after being private.
* action: read the page from its file. (swap file)
* - page needs write access:
* action: allocate new page, declare page as private, change its
* owner to swap file.
*
* A page is file backed but not-private, and read-only. (e.g. .text section)
* - page needs read access:
* action: read in the page from its file.
* - page is swapped out. (i.e. invalidated)
* action: read in the page from its file.
* - page needs write access:
* action: forbidden, kill task?
*
* A page is file backed but not-private, and read/write. (e.g. any data file.)
* - page needs read access:
* action: read in the page from its file.
* - page is flushed back to its original file. (i.e. instead of swap)
* action: read in the page from its file.
* - page needs write access:
* action: read the page in, give write access.
*/
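/*
 * To tie the model above to the handlers in this file, consider a first
 * write to an untouched page of a private anonymous region such as the
 * stack: the fault arrives with reason = VM_WRITE and pte_flags = VM_NONE,
 * so __do_page_fault() first calls page_read_fault() to page the backing
 * (zero) data in, then page_write_fault(), which takes the copy_on_write()
 * path and installs the copied page in the vma's top read/write shadow
 * before the page is mapped with MAP_USR_RW.
 */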
struct page *do_page_fault(struct fault_data *fault)
{
unsigned int vma_flags = (fault->vma) ? fault->vma->flags : VM_NONE;
unsigned int reason = fault->reason;
/* vma flags show no access */
if (vma_flags & VM_NONE) {
printf("Illegal access, tid: %d, address: 0x%x, PC @ 0x%x,\n",
fault->task->tid, fault->address, fault->kdata->faulty_pc);
fault_handle_error(fault);
}
/* The access reason is not included in the vma's listed flags */
if (!(reason & vma_flags)) {
printf("Illegal access, tid: %d, address: 0x%x, PC @ 0x%x\n",
fault->task->tid, fault->address, fault->kdata->faulty_pc);
fault_handle_error(fault);
}
/* Handle legitimate faults */
return __do_page_fault(fault);
}
struct page *page_fault_handler(struct tcb *sender, fault_kdata_t *fkdata)
{
struct fault_data fault = {
/* Fault data from kernel */
.kdata = fkdata,
.task = sender,
};
/* Extract fault reason, fault address etc. in generic format */
set_generic_fault_params(&fault);
/* Get vma info */
if (!(fault.vma = find_vma(fault.address,
&fault.task->vm_area_head->list)))
printf("Hmm. No vma for faulty region. "
"Bad things will happen.\n");
/* Handle the actual fault */
return do_page_fault(&fault);
}
static inline unsigned int pte_to_map_flags(unsigned int pte_flags)
{
unsigned int map_flags;
switch(pte_flags) {
case VM_READ:
map_flags = MAP_USR_RO;
break;
case (VM_READ | VM_WRITE):
map_flags = MAP_USR_RW;
break;
case (VM_READ | VM_WRITE | VM_EXEC):
map_flags = MAP_USR_RWX;
break;
case (VM_READ | VM_EXEC):
map_flags = MAP_USR_RX;
break;
default:
BUG();
}
return map_flags;
}
/*
 * Prefaults a page of a task. The catch is that the page may already
 * have been faulted further than the wanted flags alone would take
 * it (e.g. read-faulting a page that has already been
 * copy-on-write'd).
 *
 * This function detects how far the fault has already progressed by
 * inspecting the state of the vma's vm_object chain.
 *
 * Both the read-fault and the write-fault paths are repeatable, in
 * the sense that an already faulted page may be safely re-faulted
 * again and again, be it a read-only or a copy-on-write'd page.
 *
 * Retrieving the same page repeatedly is thus safe, but it would be
 * wrong to downgrade or otherwise change the mapping permissions of
 * the page along the way, e.g. to make a copy-on-write'd page
 * read-only again by blindly read-faulting it.
 *
 * Hence this function works out the protection flags the page must
 * already have from the object chain and never maps it with weaker
 * permissions than those.
*
* FIXME: Escalate any page fault errors like a civilized function!
*/
struct page *task_prefault_smart(struct tcb *task, unsigned long address,
unsigned int wanted_flags)
{
struct vm_obj_link *vmo_link;
unsigned long file_offset;
unsigned int vma_flags, pte_flags;
struct vm_area *vma;
struct page *page;
int err;
struct fault_data fault = {
.task = task,
.address = address,
};
/* Find the vma */
if (!(fault.vma = find_vma(fault.address,
&fault.task->vm_area_head->list))) {
dprintf("%s: Invalid: No vma for given address. %d\n",
__FUNCTION__, -EINVAL);
return PTR_ERR(-EINVAL);
}
	/* Read fault, safe to repeat */
if (wanted_flags & VM_READ)
if (IS_ERR(page = page_read_fault(&fault)))
return page;
	/* Write fault, safe to repeat */
if (wanted_flags & VM_WRITE)
if (IS_ERR(page = page_write_fault(&fault)))
return page;
/*
* If we came this far, it means we have more
* permissions than VM_NONE.
*
* Now we _must_ find out what those page
* protection flags were, and do this without
* needing to inspect any ptes.
*
* We don't want to downgrade a RW page to RO again.
*/
file_offset = fault_to_file_offset(&fault);
vma_flags = fault.vma->flags;
vma = fault.vma;
/* Get the topmost vm_object */
if (!(vmo_link = vma_next_link(&vma->vm_obj_list,
&vma->vm_obj_list))) {
printf("%s:%s: No vm object in vma!\n",
__TASKNAME__, __FUNCTION__);
BUG();
}
/* Traverse the list of vm objects and search for the page */
while (IS_ERR(page = vmo_link->obj->pager->ops.page_in(vmo_link->obj,
file_offset))) {
if (!(vmo_link = vma_next_link(&vmo_link->list,
&vma->vm_obj_list))) {
printf("%s:%s: Traversed all shadows and the original "
"file's vm_object, but could not find the "
"faulty page in this vma.\n",__TASKNAME__,
__FUNCTION__);
BUG();
}
}
/* Use flags for the vm_object containing the page */
if (vmo_link->obj->flags & VM_WRITE)
pte_flags = VM_WRITE | VM_READ;
else
pte_flags = VM_READ;
/*
	 * Now check the vma flags to decide whether to add VM_EXEC.
	 * The real pte may not have this flag yet, but it is allowed
	 * to have it, and adding it does no harm.
*/
if (vma_flags & VM_EXEC)
pte_flags |= VM_EXEC;
/* Map the page to task using these flags */
if ((err = l4_map((void *)page_to_phys(page),
(void *)page_align(fault.address), 1,
pte_to_map_flags(pte_flags),
fault.task->tid)) < 0) {
printf("l4_map() failed. err=%d\n", err);
BUG();
}
return page;
}
/*
 * Prefaults the page at the given virtual address into the given
 * task, for the given reasons. Multiple reasons are allowed; they
 * are handled separately, in order.
*/
struct page *task_prefault_page(struct tcb *task, unsigned long address,
unsigned int vmflags)
{
struct page *ret;
perfmon_reset_start_cyccnt();
ret = task_prefault_smart(task, address, vmflags);
debug_record_cycles("task_prefault_smart");
return ret;
#if 0
struct page *p;
struct fault_data fault = {
.task = task,
.address = address,
};
dprintf("Pre-faulting address 0x%lx, on task %d, with flags: 0x%x\n",
address, task->tid, vmflags);
/* Find the vma */
if (!(fault.vma = find_vma(fault.address,
&fault.task->vm_area_head->list))) {
dprintf("%s: Invalid: No vma for given address. %d\n",
__FUNCTION__, -EINVAL);
return PTR_ERR(-EINVAL);
}
/* Flags may indicate multiple fault reasons. First do the read */
if (vmflags & VM_READ) {
fault.pte_flags = VM_NONE;
fault.reason = VM_READ;
if (IS_ERR(p = do_page_fault(&fault)))
return p;
}
/* Now write */
if (vmflags & VM_WRITE) {
fault.pte_flags = VM_READ;
fault.reason = VM_WRITE;
if (IS_ERR(p = do_page_fault(&fault)))
return p;
}
/* No exec or any other fault reason allowed. */
BUG_ON(vmflags & ~(VM_READ | VM_WRITE));
return p;
#endif
}
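/*
 * Usage sketch (illustrative, the address is made up): a pager setting up
 * another task's address space, e.g. before writing argument data into a
 * freshly mapped region, can force the page in ahead of time instead of
 * waiting for the task to fault on it:
 *
 *	struct page *p = task_prefault_page(task, 0x40001000, VM_READ | VM_WRITE);
 *
 *	if (IS_ERR(p))
 *		... handle the error ...
 *
 * On success the page is already mapped into the task with at least
 * read/write permissions, as worked out by task_prefault_smart() above.
 */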
int vm_compare_prot_flags(unsigned int current, unsigned int needed)
{
current &= VM_PROT_MASK;
needed &= VM_PROT_MASK;
if (needed & VM_READ)
if (current & (VM_READ | VM_WRITE))
return 1;
if (needed & VM_WRITE &&
(current & VM_WRITE))
return 1;
return 0;
}