diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs
index 3c57607..0bff758 100644
--- a/src/arch/x86_64/gdt.rs
+++ b/src/arch/x86_64/gdt.rs
@@ -1,12 +1,29 @@
 //! Global Descriptor Table (GDT) for x86-64
 //!
-//! This module provides a kernel-space GDT that can be used after
-//! identity mapping is removed.
+//! This module provides the GDT with kernel and user segments, plus TSS.
+//!
+//! GDT Layout:
+//! - 0x00: Null descriptor
+//! - 0x08: Kernel code segment (ring 0)
+//! - 0x10: Kernel data segment (ring 0)
+//! - 0x18: User data segment (ring 3)
+//! - 0x20: User code segment (ring 3)
+//! - 0x28: TSS descriptor (16 bytes, spans 0x28-0x37)
 
 use core::arch::asm;
 use core::mem::size_of;
 
-/// GDT entry (segment descriptor)
+/// Segment selectors
+pub mod selectors {
+    pub const NULL: u16 = 0x00;
+    pub const KERNEL_CODE: u16 = 0x08;
+    pub const KERNEL_DATA: u16 = 0x10;
+    pub const USER_DATA: u16 = 0x18 | 3; // RPL 3
+    pub const USER_CODE: u16 = 0x20 | 3; // RPL 3
+    pub const TSS: u16 = 0x28;
+}
+
+/// GDT entry (segment descriptor) - 8 bytes
 #[repr(C, packed)]
 #[derive(Clone, Copy)]
 pub struct GdtEntry {
@@ -31,29 +48,133 @@ impl GdtEntry {
         }
     }
 
-    /// Create a 64-bit code segment descriptor
-    pub const fn code64() -> Self {
+    /// Create a 64-bit kernel code segment
+    pub const fn kernel_code() -> Self {
         Self {
             limit_low: 0xFFFF,
             base_low: 0,
             base_mid: 0,
-            access: 0x9A, // Present, ring 0, code, exec/read
-            flags_limit_high: 0xAF, // 64-bit, limit high nibble
+            access: 0x9A, // Present, DPL 0, code, exec/read
+            flags_limit_high: 0xAF, // 64-bit, limit high
             base_high: 0,
         }
     }
 
-    /// Create a data segment descriptor
-    pub const fn data() -> Self {
+    /// Create a kernel data segment
+    pub const fn kernel_data() -> Self {
         Self {
             limit_low: 0xFFFF,
             base_low: 0,
             base_mid: 0,
-            access: 0x92, // Present, ring 0, data, read/write
+            access: 0x92, // Present, DPL 0, data, read/write
             flags_limit_high: 0xCF, // 32-bit, 4KB granularity
             base_high: 0,
         }
     }
+
+    /// Create a 64-bit user code segment
+    pub const fn user_code() -> Self {
+        Self {
+            limit_low: 0xFFFF,
+            base_low: 0,
+            base_mid: 0,
+            access: 0xFA, // Present, DPL 3, code, exec/read
+            flags_limit_high: 0xAF, // 64-bit, limit high
+            base_high: 0,
+        }
+    }
+
+    /// Create a user data segment
+    pub const fn user_data() -> Self {
+        Self {
+            limit_low: 0xFFFF,
+            base_low: 0,
+            base_mid: 0,
+            access: 0xF2, // Present, DPL 3, data, read/write
+            flags_limit_high: 0xCF, // 32-bit, 4KB granularity
+            base_high: 0,
+        }
+    }
+}
+
+/// TSS descriptor (16 bytes in 64-bit mode)
+#[repr(C, packed)]
+#[derive(Clone, Copy)]
+pub struct TssDescriptor {
+    limit_low: u16,
+    base_low: u16,
+    base_mid: u8,
+    access: u8,
+    flags_limit_high: u8,
+    base_high: u8,
+    base_upper: u32,
+    reserved: u32,
+}
+
+impl TssDescriptor {
+    pub const fn null() -> Self {
+        Self {
+            limit_low: 0,
+            base_low: 0,
+            base_mid: 0,
+            access: 0,
+            flags_limit_high: 0,
+            base_high: 0,
+            base_upper: 0,
+            reserved: 0,
+        }
+    }
+
+    /// Create a TSS descriptor for the given TSS address and size
+    pub fn new(base: u64, limit: u32) -> Self {
+        Self {
+            limit_low: limit as u16,
+            base_low: base as u16,
+            base_mid: (base >> 16) as u8,
+            access: 0x89, // Present, 64-bit TSS (available)
+            flags_limit_high: ((limit >> 16) as u8) & 0x0F,
+            base_high: (base >> 24) as u8,
+            base_upper: (base >> 32) as u32,
+            reserved: 0,
+        }
+    }
+}
+
+/// Task State Segment (TSS) for x86-64
+///
+/// The TSS is used primarily for:
+/// - RSP0: Stack to use when transitioning from ring 3 to ring 0
+/// - IST: Interrupt Stack Table for specific interrupts
+#[repr(C, packed)]
+pub struct Tss {
+    reserved0: u32,
+    /// Stack pointers for privilege levels 0-2
+    pub rsp0: u64,
+    pub rsp1: u64,
+    pub rsp2: u64,
+    reserved1: u64,
+    /// Interrupt Stack Table (IST) entries 1-7
+    pub ist: [u64; 7],
+    reserved2: u64,
+    reserved3: u16,
+    /// I/O map base address
+    pub iopb: u16,
+}
+
+impl Tss {
+    pub const fn new() -> Self {
+        Self {
+            reserved0: 0,
+            rsp0: 0,
+            rsp1: 0,
+            rsp2: 0,
+            reserved1: 0,
+            ist: [0; 7],
+            reserved2: 0,
+            reserved3: 0,
+            iopb: size_of::<Tss>() as u16,
+        }
+    }
 }
 
 /// GDT pointer for LGDT instruction
@@ -63,53 +184,83 @@ pub struct GdtPointer {
     base: u64,
 }
 
-/// Number of GDT entries
-const GDT_ENTRIES: usize = 3;
-
-/// Kernel GDT with null, code, and data segments
+/// Combined GDT structure with all entries
 #[repr(C, align(16))]
 pub struct Gdt {
-    entries: [GdtEntry; GDT_ENTRIES],
+    null: GdtEntry,
+    kernel_code: GdtEntry,
+    kernel_data: GdtEntry,
+    user_data: GdtEntry,
+    user_code: GdtEntry,
+    tss: TssDescriptor,
 }
 
 impl Gdt {
     pub const fn new() -> Self {
         Self {
-            entries: [
-                GdtEntry::null(),   // 0x00: Null descriptor
-                GdtEntry::code64(), // 0x08: Kernel code segment
-                GdtEntry::data(),   // 0x10: Kernel data segment
-            ],
+            null: GdtEntry::null(),
+            kernel_code: GdtEntry::kernel_code(),
+            kernel_data: GdtEntry::kernel_data(),
+            user_data: GdtEntry::user_data(),
+            user_code: GdtEntry::user_code(),
+            tss: TssDescriptor::null(),
         }
     }
+
+    /// Set the TSS descriptor
+    pub fn set_tss(&mut self, base: u64, limit: u32) {
+        self.tss = TssDescriptor::new(base, limit);
+    }
 }
 
-/// Static kernel GDT (in higher-half memory)
-static KERNEL_GDT: Gdt = Gdt::new();
+use core::cell::UnsafeCell;
 
-/// Reload the GDT with the kernel-space GDT
+struct SyncGdt(UnsafeCell<Gdt>);
+unsafe impl Sync for SyncGdt {}
+
+struct SyncTss(UnsafeCell<Tss>);
+unsafe impl Sync for SyncTss {}
+
+/// Static kernel GDT
+static KERNEL_GDT: SyncGdt = SyncGdt(UnsafeCell::new(Gdt::new()));
+
+/// Static kernel TSS
+static KERNEL_TSS: SyncTss = SyncTss(UnsafeCell::new(Tss::new()));
+
+/// Initialize the GDT with TSS
 ///
-/// This should be called before removing identity mapping to ensure
-/// the GDT is accessible after the low memory is unmapped.
-pub fn reload() {
+/// This sets up the full GDT including user segments and TSS,
+/// then loads it into the CPU.
+pub fn init(kernel_stack: u64) {
+    let gdt = unsafe { &mut *KERNEL_GDT.0.get() };
+    let tss = unsafe { &mut *KERNEL_TSS.0.get() };
+
+    // Set up TSS with kernel stack for ring 0
+    tss.rsp0 = kernel_stack;
+
+    // Update GDT with TSS descriptor
+    let tss_addr = tss as *const Tss as u64;
+    let tss_limit = (size_of::<Tss>() - 1) as u32;
+    gdt.set_tss(tss_addr, tss_limit);
+
+    // Load GDT
+    let gdt_size = size_of::<Gdt>();
     let pointer = GdtPointer {
-        limit: (size_of::<[GdtEntry; GDT_ENTRIES]>() - 1) as u16,
-        base: KERNEL_GDT.entries.as_ptr() as u64,
+        limit: (gdt_size - 1) as u16,
+        base: gdt as *const Gdt as u64,
     };
 
     unsafe {
-        // Load new GDT
         asm!("lgdt [{}]", in(reg) &pointer, options(nostack, preserves_flags));
 
-        // Reload code segment by doing a far return
-        // Push SS, RSP, RFLAGS, CS, RIP and do IRETQ
+        // Reload code segment
         asm!(
-            "push 0x10",              // SS
+            "push 0x10",              // SS (kernel data)
             "push rsp",               // RSP
-            "add qword ptr [rsp], 8", // Adjust for the push
+            "add qword ptr [rsp], 8",
             "pushfq",                 // RFLAGS
-            "push 0x08",              // CS
-            "lea rax, [rip + 2f]",    // RIP (address of label 2)
+            "push 0x08",              // CS (kernel code)
+            "lea rax, [rip + 2f]",
            "push rax",
            "iretq",
            "2:",
@@ -119,19 +270,90 @@ pub fn reload() {
             "mov es, ax",
             "mov fs, ax",
             "mov gs, ax",
-            // SS is already set by IRETQ
             out("rax") _,
             options(preserves_flags)
         );
+
+        // Load TSS
+        asm!(
+            "ltr {0:x}",
+            in(reg) selectors::TSS,
+            options(nostack, preserves_flags)
+        );
+    }
+}
+
+/// Reload the GDT (called before removing identity mapping)
+///
+/// This is the simpler reload that doesn't reinitialize TSS.
+pub fn reload() {
+    let gdt = unsafe { &*KERNEL_GDT.0.get() };
+
+    let gdt_size = size_of::<Gdt>();
+    let pointer = GdtPointer {
+        limit: (gdt_size - 1) as u16,
+        base: gdt as *const Gdt as u64,
+    };
+
+    unsafe {
+        asm!("lgdt [{}]", in(reg) &pointer, options(nostack, preserves_flags));
+
+        // Reload segments
+        asm!(
+            "push 0x10",
+            "push rsp",
+            "add qword ptr [rsp], 8",
+            "pushfq",
+            "push 0x08",
+            "lea rax, [rip + 2f]",
+            "push rax",
+            "iretq",
+            "2:",
+            "mov ax, 0x10",
+            "mov ds, ax",
+            "mov es, ax",
+            "mov fs, ax",
+            "mov gs, ax",
+            out("rax") _,
+            options(preserves_flags)
+        );
+    }
+}
+
+/// Update TSS RSP0 (kernel stack for ring transitions)
+pub fn set_kernel_stack(stack: u64) {
+    let tss = unsafe { &mut *KERNEL_TSS.0.get() };
+    tss.rsp0 = stack;
+}
+
+/// Set an IST (Interrupt Stack Table) entry
+///
+/// IST entries are numbered 1-7 (index 0-6 in the array).
+/// These provide dedicated stacks for specific interrupt handlers.
+pub fn set_ist(ist_index: u8, stack: u64) {
+    if ist_index == 0 || ist_index > 7 {
+        return; // Invalid index
+    }
+    let tss = unsafe { &mut *KERNEL_TSS.0.get() };
+    tss.ist[(ist_index - 1) as usize] = stack;
+}
+
 /// Get the kernel code segment selector
 pub const fn kernel_cs() -> u16 {
-    0x08
+    selectors::KERNEL_CODE
 }
 
 /// Get the kernel data segment selector
 pub const fn kernel_ds() -> u16 {
-    0x10
+    selectors::KERNEL_DATA
+}
+
+/// Get the user code segment selector
+pub const fn user_cs() -> u16 {
+    selectors::USER_CODE
+}
+
+/// Get the user data segment selector
+pub const fn user_ds() -> u16 {
+    selectors::USER_DATA
 }
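The selector constants in this file follow the standard x86 encoding: descriptor index shifted left by 3, OR'd with the requested privilege level (the table-indicator bit stays 0 for GDT selectors). A quick sanity sketch of that encoding; the `selector` helper is illustrative, not part of the patch:

    // Selector layout: | index (13 bits) | TI (1 bit) | RPL (2 bits) |
    const fn selector(index: u16, rpl: u16) -> u16 {
        (index << 3) | (rpl & 0b11)
    }

    // These reproduce the constants in `selectors`:
    const _: () = assert!(selector(1, 0) == 0x08);     // KERNEL_CODE
    const _: () = assert!(selector(3, 3) == 0x18 | 3); // USER_DATA = 0x1B
    const _: () = assert!(selector(4, 3) == 0x20 | 3); // USER_CODE = 0x23
    const _: () = assert!(selector(5, 0) == 0x28);     // TSS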
diff --git a/src/arch/x86_64/interrupts.rs b/src/arch/x86_64/interrupts.rs
index a6dd88a..febb22e 100644
--- a/src/arch/x86_64/interrupts.rs
+++ b/src/arch/x86_64/interrupts.rs
@@ -123,6 +123,17 @@ impl Idt {
             0, // No IST
         );
     }
+
+    /// Set an interrupt handler with custom DPL
+    pub fn set_handler_dpl(&mut self, vector: u8, handler: u64, gate_type: GateType, dpl: u8) {
+        self.entries[vector as usize] = IdtEntry::new(
+            handler,
+            0x08, // Kernel code segment
+            gate_type,
+            dpl,
+            0, // No IST
+        );
+    }
 }
 
 /// Interrupt stack frame pushed by CPU on interrupt/exception
@@ -271,6 +282,49 @@ unsafe fn load_idt(idt: *const Idt) {
     }
 }
 
+/// Set an interrupt handler (public interface)
+///
+/// # Arguments
+/// * `vector` - Interrupt vector number (0-255)
+/// * `handler` - Handler function address
+/// * `gate_type` - Type of gate (Interrupt or Trap)
+/// * `dpl` - Descriptor Privilege Level (0 = kernel only, 3 = user callable)
+pub fn set_handler(vector: u8, handler: u64, gate_type: GateType, dpl: u8) {
+    let idt = unsafe { &mut *STATIC_IDT.0.get() };
+    idt.set_handler_dpl(vector, handler, gate_type, dpl);
+}
+
+/// Set an interrupt handler with IST (Interrupt Stack Table) support
+///
+/// # Arguments
+/// * `vector` - Interrupt vector number (0-255)
+/// * `handler` - Handler function address
+/// * `gate_type` - Type of gate (Interrupt or Trap)
+/// * `ist` - IST index (1-7), or 0 for no IST
+pub fn set_handler_with_ist(vector: u8, handler: u64, gate_type: GateType, ist: u8) {
+    let idt = unsafe { &mut *STATIC_IDT.0.get() };
+    idt.entries[vector as usize] = IdtEntry::new(
+        handler,
+        0x08, // Kernel code segment
+        gate_type,
+        0, // DPL 0 (kernel)
+        ist,
+    );
+}
+
+/// Configure the double fault handler to use an IST entry
+///
+/// This must be called after init() and after the IST stack has been
+/// set up in the TSS via gdt::set_ist().
+pub fn set_double_fault_ist(ist: u8) {
+    set_handler_with_ist(
+        vectors::DOUBLE_FAULT,
+        double_fault_handler as *const () as u64,
+        GateType::Trap,
+        ist,
+    );
+}
+
 // ============================================================================
 // Exception Handlers
 // ============================================================================
@@ -404,7 +458,6 @@ extern "C" fn rust_exception_handler(state: &ExceptionState) {
              exception_name(state.vector as u8), state.vector).ok();
     writeln!(serial, "  Error code: {:#x}", state.error_code).ok();
     writeln!(serial, "  RIP: {:#x}  CS: {:#x}", state.rip, state.cs).ok();
-    writeln!(serial, "  RSP: {:#x}  SS: {:#x}", state.rsp, state.ss).ok();
     writeln!(serial, "  RFLAGS: {:#x}", state.rflags).ok();
     writeln!(serial, "  RAX: {:#018x}  RBX: {:#018x}", state.rax, state.rbx).ok();
     writeln!(serial, "  RCX: {:#018x}  RDX: {:#018x}", state.rcx, state.rdx).ok();
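The IST plumbing spans gdt.rs and interrupts.rs, so call order matters: the stack must be installed in the TSS before the gate starts referencing it. A minimal sketch of the intended order (the function name and stack-top address are illustrative; the stack must already be mapped):

    fn setup_double_fault_stack() {
        // 1. Point TSS.IST1 at a known-good, mapped stack top.
        gdt::set_ist(1, 0xFFFF_FE80_0002_4000);
        // 2. Rebuild the #DF gate so the CPU unconditionally switches to
        //    IST1, even when RSP is garbage at the moment the fault fires.
        interrupts::set_double_fault_ist(1);
    }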
diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs
index 753524f..6a8e862 100644
--- a/src/arch/x86_64/mod.rs
+++ b/src/arch/x86_64/mod.rs
@@ -1,7 +1,9 @@
 //! x86_64 architecture support
 
-pub mod interrupts;
 pub mod gdt;
+pub mod interrupts;
+pub mod pic;
+pub mod syscall;
 
 /// Halt the CPU until the next interrupt
 #[inline]
diff --git a/src/arch/x86_64/pic.rs b/src/arch/x86_64/pic.rs
new file mode 100644
index 0000000..7343dd8
--- /dev/null
+++ b/src/arch/x86_64/pic.rs
@@ -0,0 +1,244 @@
+//! 8259 Programmable Interrupt Controller (PIC) driver
+//!
+//! The legacy PIC must be properly configured even if we plan to use the APIC,
+//! because the master's default mapping (0x08-0x0F; the slave sits at 0x70-0x77)
+//! overlaps the CPU exception vectors:
+//!
+//! - IRQ0 (Timer)    -> Vector 0x08 (Double Fault!)
+//! - IRQ1 (Keyboard) -> Vector 0x09
+//! - IRQ7 (Spurious) -> Vector 0x0F
+//!
+//! This module remaps the PICs to vectors 0x20-0x2F and provides control
+//! over interrupt masking.
+
+use core::arch::asm;
+
+/// PIC1 (master) command port
+const PIC1_CMD: u16 = 0x20;
+/// PIC1 (master) data port
+const PIC1_DATA: u16 = 0x21;
+/// PIC2 (slave) command port
+const PIC2_CMD: u16 = 0xA0;
+/// PIC2 (slave) data port
+const PIC2_DATA: u16 = 0xA1;
+
+/// ICW1: Initialization Command Word 1
+const ICW1_INIT: u8 = 0x10;
+const ICW1_ICW4: u8 = 0x01; // ICW4 needed
+
+/// ICW4: Initialization Command Word 4
+const ICW4_8086: u8 = 0x01; // 8086/88 mode
+
+/// Vector offset for PIC1 IRQs (IRQ0-7 -> vectors 0x20-0x27)
+pub const PIC1_OFFSET: u8 = 0x20;
+/// Vector offset for PIC2 IRQs (IRQ8-15 -> vectors 0x28-0x2F)
+pub const PIC2_OFFSET: u8 = 0x28;
+
+/// IRQ numbers
+pub mod irq {
+    pub const TIMER: u8 = 0;
+    pub const KEYBOARD: u8 = 1;
+    pub const CASCADE: u8 = 2; // Used internally for PIC1-PIC2 cascade
+    pub const COM2: u8 = 3;
+    pub const COM1: u8 = 4;
+    pub const LPT2: u8 = 5;
+    pub const FLOPPY: u8 = 6;
+    pub const LPT1: u8 = 7; // Also spurious IRQ
+    pub const RTC: u8 = 8;
+    pub const FREE1: u8 = 9;
+    pub const FREE2: u8 = 10;
+    pub const FREE3: u8 = 11;
+    pub const MOUSE: u8 = 12;
+    pub const FPU: u8 = 13;
+    pub const ATA_PRIMARY: u8 = 14;
+    pub const ATA_SECONDARY: u8 = 15;
+}
+
+/// Write a byte to an I/O port
+#[inline]
+unsafe fn outb(port: u16, value: u8) {
+    unsafe {
+        asm!(
+            "out dx, al",
+            in("dx") port,
+            in("al") value,
+            options(nostack, nomem, preserves_flags)
+        );
+    }
+}
+
+/// Read a byte from an I/O port
+#[inline]
+unsafe fn inb(port: u16) -> u8 {
+    let value: u8;
+    unsafe {
+        asm!(
+            "in al, dx",
+            in("dx") port,
+            out("al") value,
+            options(nostack, nomem, preserves_flags)
+        );
+    }
+    value
+}
+
+/// Small I/O delay for PIC timing requirements
+#[inline]
+unsafe fn io_wait() {
+    // Write to an unused port to create a small delay
+    // Port 0x80 is used for POST codes and is safe to write to
+    unsafe { outb(0x80, 0); }
+}
+
+/// Initialize and remap both PICs
+///
+/// This remaps:
+/// - PIC1 (IRQ 0-7) to vectors 0x20-0x27
+/// - PIC2 (IRQ 8-15) to vectors 0x28-0x2F
+///
+/// After initialization, all IRQs are masked (disabled).
+pub fn init() {
+    unsafe {
+        // Save current masks
+        let mask1 = inb(PIC1_DATA);
+        let mask2 = inb(PIC2_DATA);
+
+        // ICW1: Start initialization sequence (cascade mode, ICW4 needed)
+        outb(PIC1_CMD, ICW1_INIT | ICW1_ICW4);
+        io_wait();
+        outb(PIC2_CMD, ICW1_INIT | ICW1_ICW4);
+        io_wait();
+
+        // ICW2: Set vector offsets
+        outb(PIC1_DATA, PIC1_OFFSET);
+        io_wait();
+        outb(PIC2_DATA, PIC2_OFFSET);
+        io_wait();
+
+        // ICW3: Configure cascade
+        // Tell PIC1 that PIC2 is at IRQ2 (bit 2 = 0x04)
+        outb(PIC1_DATA, 0x04);
+        io_wait();
+        // Tell PIC2 its cascade identity (IRQ2 = 2)
+        outb(PIC2_DATA, 0x02);
+        io_wait();
+
+        // ICW4: Set 8086 mode
+        outb(PIC1_DATA, ICW4_8086);
+        io_wait();
+        outb(PIC2_DATA, ICW4_8086);
+        io_wait();
+
+        // Mask all interrupts (we'll unmask specific ones as needed)
+        outb(PIC1_DATA, 0xFF);
+        outb(PIC2_DATA, 0xFF);
+
+        // Note: We intentionally mask all interrupts rather than restoring
+        // the old masks, since we want to start with a clean slate
+        let _ = (mask1, mask2); // Suppress unused warning
+    }
+}
+
+/// Disable the PIC entirely by masking all interrupts
+///
+/// This is useful when transitioning to APIC mode.
+pub fn disable() {
+    unsafe {
+        outb(PIC1_DATA, 0xFF);
+        outb(PIC2_DATA, 0xFF);
+    }
+}
+
+/// Mask (disable) a specific IRQ
+pub fn mask_irq(irq: u8) {
+    let port = if irq < 8 { PIC1_DATA } else { PIC2_DATA };
+    let irq_bit = if irq < 8 { irq } else { irq - 8 };
+
+    unsafe {
+        let mask = inb(port) | (1 << irq_bit);
+        outb(port, mask);
+    }
+}
+
+/// Unmask (enable) a specific IRQ
+pub fn unmask_irq(irq: u8) {
+    let port = if irq < 8 { PIC1_DATA } else { PIC2_DATA };
+    let irq_bit = if irq < 8 { irq } else { irq - 8 };
+
+    unsafe {
+        let mask = inb(port) & !(1 << irq_bit);
+        outb(port, mask);
+    }
+
+    // If unmasking an IRQ on PIC2, also unmask the cascade IRQ on PIC1
+    if irq >= 8 {
+        unsafe {
+            let mask = inb(PIC1_DATA) & !(1 << irq::CASCADE);
+            outb(PIC1_DATA, mask);
+        }
+    }
+}
+
+/// Send End-Of-Interrupt (EOI) signal
+///
+/// This must be called at the end of an IRQ handler to acknowledge
+/// the interrupt and allow further interrupts.
+pub fn send_eoi(irq: u8) {
+    const EOI: u8 = 0x20;
+
+    unsafe {
+        // If IRQ came from PIC2, send EOI to both PICs
+        if irq >= 8 {
+            outb(PIC2_CMD, EOI);
+        }
+        outb(PIC1_CMD, EOI);
+    }
+}
+
+/// Check if an IRQ is a spurious IRQ
+///
+/// Spurious IRQs (IRQ7 or IRQ15) can occur due to electrical noise
+/// or race conditions. They should be checked before handling.
+pub fn is_spurious(irq: u8) -> bool {
+    const ISR_READ: u8 = 0x0B;
+
+    if irq == 7 {
+        // Check PIC1's In-Service Register
+        unsafe {
+            outb(PIC1_CMD, ISR_READ);
+            let isr = inb(PIC1_CMD);
+            // If bit 7 is not set, it's spurious
+            return (isr & 0x80) == 0;
+        }
+    } else if irq == 15 {
+        // Check PIC2's In-Service Register
+        unsafe {
+            outb(PIC2_CMD, ISR_READ);
+            let isr = inb(PIC2_CMD);
+            // If bit 7 is not set, it's spurious
+            if (isr & 0x80) == 0 {
+                // Still need to send EOI to PIC1 (for cascade)
+                outb(PIC1_CMD, 0x20);
+                return true;
+            }
+        }
+    }
+    false
+}
+
+/// Get the current IRQ mask for both PICs
+pub fn get_mask() -> u16 {
+    unsafe {
+        let mask1 = inb(PIC1_DATA) as u16;
+        let mask2 = inb(PIC2_DATA) as u16;
+        mask1 | (mask2 << 8)
+    }
+}
+
+/// Set the IRQ mask for both PICs
+pub fn set_mask(mask: u16) {
+    unsafe {
+        outb(PIC1_DATA, mask as u8);
+        outb(PIC2_DATA, (mask >> 8) as u8);
+    }
+}
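Typical usage of this driver, sketched under the assumption that a handler for IRQ7 is wired up elsewhere (both function names are illustrative):

    fn enable_keyboard() {
        pic::init();                          // remap to 0x20-0x2F, all IRQs masked
        pic::unmask_irq(pic::irq::KEYBOARD);  // enable only the lines we handle
    }

    fn irq7_handler() {
        // IRQ7 may be spurious; a spurious master IRQ must not be EOI'd.
        if pic::is_spurious(pic::irq::LPT1) {
            return;
        }
        // ... service the device ...
        pic::send_eoi(pic::irq::LPT1);
    }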
diff --git a/src/arch/x86_64/syscall.rs b/src/arch/x86_64/syscall.rs
new file mode 100644
index 0000000..0aa1c3b
--- /dev/null
+++ b/src/arch/x86_64/syscall.rs
@@ -0,0 +1,232 @@
+//! System Call Interface for x86-64
+//!
+//! This module implements system calls using INT 0x80.
+//! This is simpler than SYSCALL/SYSRET and uses the existing IDT infrastructure.
+//!
+//! Syscall Convention:
+//! - RAX = syscall number
+//! - RDI, RSI, RDX, R10, R8, R9 = arguments
+//! - RAX = return value
+
+use core::arch::asm;
+use crate::arch::x86_64::gdt;
+
+/// System call interrupt vector
+pub const SYSCALL_VECTOR: u8 = 0x80;
+
+/// System call numbers
+pub mod numbers {
+    pub const EXIT: u64 = 0;
+    pub const WRITE: u64 = 1;
+    pub const YIELD: u64 = 2;
+    pub const GETPID: u64 = 3;
+}
+
+/// Initialize the syscall interface
+///
+/// This adds the INT 0x80 handler to the IDT.
+pub fn init() {
+    use crate::arch::x86_64::interrupts::{GateType, set_handler};
+
+    // Set up INT 0x80 as a trap gate with DPL 3 (user-callable)
+    set_handler(SYSCALL_VECTOR, syscall_entry as *const () as u64, GateType::Trap, 3);
+}
+
+/// Syscall entry point
+///
+/// This is registered as the INT 0x80 handler.
+/// Stack on entry (pushed by CPU):
+/// - SS, RSP, RFLAGS, CS, RIP (in 64-bit mode, SS:RSP is pushed even
+///   when there is no privilege change)
+/// - No error code for software interrupts
+#[unsafe(naked)]
+extern "C" fn syscall_entry() {
+    core::arch::naked_asm!(
+        // No error code for software interrupts
+        // Save all registers
+        "push rax",
+        "push rbx",
+        "push rcx",
+        "push rdx",
+        "push rsi",
+        "push rdi",
+        "push rbp",
+        "push r8",
+        "push r9",
+        "push r10",
+        "push r11",
+        "push r12",
+        "push r13",
+        "push r14",
+        "push r15",
+
+        // Call Rust syscall handler
+        // First arg (RDI) = pointer to saved state
+        "mov rdi, rsp",
+        "call {handler}",
+
+        // Return value is in RAX, save it to the stack frame
+        "mov [rsp + 14*8], rax", // Overwrite saved RAX
+
+        // Restore all registers
+        "pop r15",
+        "pop r14",
+        "pop r13",
+        "pop r12",
+        "pop r11",
+        "pop r10",
+        "pop r9",
+        "pop r8",
+        "pop rbp",
+        "pop rdi",
+        "pop rsi",
+        "pop rdx",
+        "pop rcx",
+        "pop rbx",
+        "pop rax",
+
+        // Return from interrupt
+        "iretq",
+
+        handler = sym syscall_handler_rust,
+    );
+}
+
+/// Saved register state for syscall
+#[repr(C)]
+pub struct SyscallFrame {
+    pub r15: u64,
+    pub r14: u64,
+    pub r13: u64,
+    pub r12: u64,
+    pub r11: u64,
+    pub r10: u64,
+    pub r9: u64,
+    pub r8: u64,
+    pub rbp: u64,
+    pub rdi: u64,
+    pub rsi: u64,
+    pub rdx: u64,
+    pub rcx: u64,
+    pub rbx: u64,
+    pub rax: u64,
+    // CPU-pushed
+    pub rip: u64,
+    pub cs: u64,
+    pub rflags: u64,
+    pub rsp: u64,
+    pub ss: u64,
+}
+
+/// Rust syscall handler
+extern "C" fn syscall_handler_rust(frame: &SyscallFrame) -> u64 {
+    let num = frame.rax;
+    let arg1 = frame.rdi;
+    let arg2 = frame.rsi;
+    let arg3 = frame.rdx;
+    let arg4 = frame.r10;
+    let arg5 = frame.r8;
+
+    match num {
+        numbers::EXIT => {
+            use crate::serial::SerialPort;
+            use core::fmt::Write;
+            let mut serial = unsafe { SerialPort::new(0x3F8) };
+            writeln!(serial, "\n[SYSCALL] exit({})", arg1).ok();
+
+            // For now, just halt. In a real OS, we'd terminate the process
+            // and schedule another one.
+            loop {
+                unsafe { asm!("cli; hlt", options(nostack, nomem)); }
+            }
+        }
+
+        numbers::WRITE => {
+            // write(fd, buf, len) -> bytes_written
+            if arg1 == 1 || arg1 == 2 {
+                // stdout or stderr -> serial
+                use crate::serial::SerialPort;
+                let mut serial = unsafe { SerialPort::new(0x3F8) };
+
+                let buf = arg2 as *const u8;
+                let len = arg3 as usize;
+
+                // Safety: we trust the user buffer for now
+                // In a real OS, we'd validate it's in user memory
+                for i in 0..len {
+                    let c = unsafe { *buf.add(i) };
+                    serial.write_byte(c);
+                }
+
+                len as u64
+            } else {
+                u64::MAX // -1 = error
+            }
+        }
+
+        numbers::YIELD => {
+            // No-op for single process kernel
+            0
+        }
+
+        numbers::GETPID => {
+            // Return current process ID
+            crate::process::current().pid as u64
+        }
+
+        _ => {
+            // Unknown syscall
+            use crate::serial::SerialPort;
+            use core::fmt::Write;
+            let mut serial = unsafe { SerialPort::new(0x3F8) };
+            writeln!(serial, "[SYSCALL] Unknown syscall: {}", num).ok();
+            u64::MAX
+        }
+    }
+}
+
+/// Make a syscall from user mode (for testing)
+#[inline]
+pub unsafe fn syscall0(num: u64) -> u64 {
+    let ret: u64;
+    unsafe {
+        asm!(
+            "int 0x80",
+            in("rax") num,
+            lateout("rax") ret,
+            options(nostack)
+        );
+    }
+    ret
+}
+
+#[inline]
+pub unsafe fn syscall1(num: u64, arg1: u64) -> u64 {
+    let ret: u64;
+    unsafe {
+        asm!(
+            "int 0x80",
+            in("rax") num,
+            in("rdi") arg1,
+            lateout("rax") ret,
+            options(nostack)
+        );
+    }
+    ret
+}
+
+#[inline]
+pub unsafe fn syscall3(num: u64, arg1: u64, arg2: u64, arg3: u64) -> u64 {
+    let ret: u64;
+    unsafe {
+        asm!(
+            "int 0x80",
+            in("rax") num,
+            in("rdi") arg1,
+            in("rsi") arg2,
+            in("rdx") arg3,
+            lateout("rax") ret,
+            options(nostack)
+        );
+    }
+    ret
+}
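From the caller's side, the wrappers map straight onto the register convention documented at the top of the file. A user-side sketch (the function name and buffer are illustrative):

    fn user_demo() {
        let msg = b"hi\n";
        // write(fd = 1, buf, len): number in RAX, args in RDI/RSI/RDX
        let written = unsafe {
            syscall3(numbers::WRITE, 1, msg.as_ptr() as u64, msg.len() as u64)
        };
        debug_assert_eq!(written, msg.len() as u64);
        // exit(0): the handler never returns control here
        unsafe { syscall1(numbers::EXIT, 0) };
    }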
diff --git a/src/lib.rs b/src/lib.rs
index 5105755..a0922a1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -109,8 +109,15 @@ pub fn kernel_init(info: &BootInfo) -> ! {
     // Get serial port for output
     let mut serial = unsafe { SerialPort::new(0x3F8) };
 
+    // CRITICAL: Initialize and remap the PIC first!
+    // The legacy PIC's default IRQ0 (timer) maps to vector 0x08, which conflicts
+    // with the Double Fault exception. This causes spurious "double faults" when
+    // the timer fires. We remap the PIC to vectors 0x20-0x2F and mask all IRQs.
+    arch::x86_64::pic::init();
+
     writeln!(serial, "").ok();
     writeln!(serial, ">>> Entering kernel_init()").ok();
+    writeln!(serial, "    PIC remapped and masked").ok();
     writeln!(serial, "    Boot method: {:?}", info.boot_method).ok();
 
     // Report memory information from boot
@@ -295,11 +302,52 @@ pub fn kernel_init(info: &BootInfo) -> ! {
         }
     }
 
-    // Reload GDT to higher-half address before removing identity mapping
+    // Initialize GDT with TSS for user mode support
+    // We need a kernel stack for ring 0 transitions from ring 3
     writeln!(serial, "").ok();
-    writeln!(serial, ">>> Reloading GDT to higher-half...").ok();
-    arch::x86_64::gdt::reload();
-    writeln!(serial, "    GDT reloaded").ok();
+    writeln!(serial, ">>> Initializing GDT with TSS...").ok();
+
+    // Allocate a kernel stack for syscall/interrupt handling from user mode
+    // We'll use 4 pages (16KB) for the kernel stack
+    let kernel_stack_base = VirtAddr::new(0xFFFFFE8000010000); // In temp region
+
+    // Allocate and map 4 pages for the kernel stack
+    for i in 0..4 {
+        let frame = memory::frame::allocate_frame().expect("Failed to allocate kernel stack");
+        let page_virt = VirtAddr::new(kernel_stack_base.as_u64() + (i * 0x1000) as u64);
+        memory::paging::map_4kb(page_virt, frame.start_address(), memory::paging::flags::KERNEL_DATA)
+            .expect("Failed to map kernel stack");
+    }
+
+    // Stack grows down, so point to top of the 4-page region
+    let kernel_stack_top = kernel_stack_base.as_u64() + 0x4000;
+    arch::x86_64::gdt::init(kernel_stack_top);
+    writeln!(serial, "    GDT with TSS initialized").ok();
+    writeln!(serial, "    Kernel stack at {:#x}", kernel_stack_top).ok();
+
+    // Initialize syscall interface (INT 0x80)
+    writeln!(serial, "").ok();
+    writeln!(serial, ">>> Initializing syscall interface...").ok();
+    arch::x86_64::syscall::init();
+    writeln!(serial, "    INT 0x80 syscall handler installed").ok();
+
+    // Set up a dedicated stack for double fault handling (IST1)
+    // This ensures the double fault handler has a known-good stack even if
+    // the main stack is corrupted (e.g., during failed privilege transitions)
+    writeln!(serial, "").ok();
+    writeln!(serial, ">>> Setting up IST for double fault...").ok();
+    let ist1_stack_base = VirtAddr::new(0xFFFFFE8000020000); // Separate from kernel stack
+    // Allocate 4 pages (16KB) - must be enough for exception frame + handler execution
+    for i in 0..4 {
+        let frame = memory::frame::allocate_frame().expect("Failed to allocate IST1 stack");
+        let page_virt = VirtAddr::new(ist1_stack_base.as_u64() + (i * 0x1000) as u64);
+        memory::paging::map_4kb(page_virt, frame.start_address(), memory::paging::flags::KERNEL_DATA)
+            .expect("Failed to map IST1 stack");
+    }
+    let ist1_stack_top = ist1_stack_base.as_u64() + 0x4000; // 16KB stack
+    arch::x86_64::gdt::set_ist(1, ist1_stack_top);
+    arch::x86_64::interrupts::set_double_fault_ist(1);
+    writeln!(serial, "    IST1 (double fault) stack at {:#x}", ist1_stack_top).ok();
 
     // Remove identity mapping - no longer needed now that we're in higher-half
     writeln!(serial, "").ok();
@@ -314,15 +362,162 @@ pub fn kernel_init(info: &BootInfo) -> ! {
         writeln!(serial, "    Identity mapping removed (PML4[0] cleared)").ok();
     }
 
+    // Test user-mode execution
     writeln!(serial, "").ok();
-    writeln!(serial, "Kernel initialization complete.").ok();
-    writeln!(serial, "Halting CPU.").ok();
+    writeln!(serial, ">>> Testing user-mode execution...").ok();
 
-    // Halt the CPU
-    loop {
-        unsafe {
-            core::arch::asm!("cli; hlt", options(nostack, nomem));
+    // Create a new process for user mode test
+    let user_pid = process::create().expect("Failed to create user process");
+    writeln!(serial, "    Created user process {}", user_pid).ok();
+
+    // Get the process's page table for mapping user pages
+    let user_process = process::get(user_pid).unwrap();
+    writeln!(serial, "    Process page table: {:#x}", user_process.page_table).ok();
+
+    // Allocate frames for user code and stack
+    let user_code_frame = memory::frame::allocate_frame().expect("Failed to allocate user code frame");
+    let user_stack_frame = memory::frame::allocate_frame().expect("Failed to allocate user stack frame");
+
+    // User virtual addresses (in low memory, user-accessible)
+    let user_code_virt = VirtAddr::new(0x400000); // 4MB - typical user code location
+    let user_stack_virt = VirtAddr::new(0x800000); // 8MB - user stack base
+
+    // First switch to the user process's address space to set up its mappings
+    unsafe { process::switch_address_space(user_pid).expect("Failed to switch to user address space"); }
+
+    // Verify kernel stacks are accessible in user address space
+    // (They should be, since we copy kernel PML4 entries during process creation)
+    writeln!(serial, "    Verifying kernel stack mappings...").ok();
+    if let Some(phys) = memory::paging::translate(VirtAddr::new(kernel_stack_top - 8)) {
+        writeln!(serial, "    Kernel stack: {:#x} -> {:#x}", kernel_stack_top - 8, phys).ok();
+    } else {
+        panic!("Kernel stack not mapped in user address space!");
+    }
+    if let Some(phys) = memory::paging::translate(VirtAddr::new(ist1_stack_top - 8)) {
+        writeln!(serial, "    IST1 stack: {:#x} -> {:#x}", ist1_stack_top - 8, phys).ok();
+    } else {
+        panic!("IST1 stack not mapped in user address space!");
+    }
+
+    // Map user code page (readable, executable, user-accessible)
+    memory::paging::map_4kb(user_code_virt, user_code_frame.start_address(), memory::paging::flags::USER_CODE)
+        .expect("Failed to map user code");
+    writeln!(serial, "    Mapped user code at {:#x}", user_code_virt).ok();
+
+    // Map user stack page (readable, writable, user-accessible)
+    memory::paging::map_4kb(user_stack_virt, user_stack_frame.start_address(), memory::paging::flags::USER_DATA)
+        .expect("Failed to map user stack");
+    writeln!(serial, "    Mapped user stack at {:#x}", user_stack_virt).ok();
+
+    // Write a simple user program that:
+    // 1. Calls write(1, "Hello from user mode!\n", 22)
+    // 2. Calls exit(0)
+    let user_code_ptr = user_code_virt.as_u64() as *mut u8;
+    let message = b"Hello from user mode!\n";
+    let message_offset = 64u64; // Place message after code
+
+    unsafe {
+        let code: &[u8] = &[
+            // mov rax, 1 (WRITE syscall)
+            0x48, 0xc7, 0xc0, 0x01, 0x00, 0x00, 0x00,
+            // mov rdi, 1 (fd = stdout)
+            0x48, 0xc7, 0xc7, 0x01, 0x00, 0x00, 0x00,
+            // lea rsi, [rip + message_offset] - we'll use absolute address instead
+            // mov rsi, 0x400040 (message address = code_base + 64)
+            0x48, 0xbe,
+            ((user_code_virt.as_u64() + message_offset) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 8) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 16) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 24) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 32) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 40) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 48) & 0xFF) as u8,
+            (((user_code_virt.as_u64() + message_offset) >> 56) & 0xFF) as u8,
+            // mov rdx, 22 (length)
+            0x48, 0xc7, 0xc2, 0x16, 0x00, 0x00, 0x00,
+            // int 0x80
+            0xcd, 0x80,
+            // mov rax, 0 (EXIT syscall)
+            0x48, 0xc7, 0xc0, 0x00, 0x00, 0x00, 0x00,
+            // mov rdi, 0 (exit code)
+            0x48, 0xc7, 0xc7, 0x00, 0x00, 0x00, 0x00,
+            // int 0x80
+            0xcd, 0x80,
+            // hlt (should never reach here)
+            0xf4,
+        ];
+
+        // Write the code
+        for (i, &byte) in code.iter().enumerate() {
+            core::ptr::write_volatile(user_code_ptr.add(i), byte);
+        }
+
+        // Write the message after the code
+        let message_ptr = user_code_ptr.add(message_offset as usize);
+        for (i, &byte) in message.iter().enumerate() {
+            core::ptr::write_volatile(message_ptr.add(i), byte);
+        }
+    }
+    writeln!(serial, "    Wrote user program ({} bytes code + {} bytes data)", 50, message.len()).ok();
+
+    // User stack pointer (top of stack page)
+    let user_stack_top = user_stack_virt.as_u64() + 0x1000;
+
+    // First, let's test that user mode works by running code in kernel
+    // that verifies the segments are correct
+    writeln!(serial, "").ok();
+    writeln!(serial, ">>> Testing IRETQ mechanism with kernel mode...").ok();
+
+    // Test: Do a simple kernel-to-kernel IRETQ to verify the mechanism
+    unsafe {
+        core::arch::asm!(
+            // Push a simple return frame for kernel mode
+            "push 0x10",              // SS (kernel data)
+            "push rsp",               // RSP (current stack)
+            "add qword ptr [rsp], 8", // Adjust for the push
+            "pushfq",                 // RFLAGS
+            "push 0x08",              // CS (kernel code)
+            "lea rax, [rip + 2f]",    // RIP (label 2)
+            "push rax",
+            "iretq",
+            "2:",
+            out("rax") _,
+            options(preserves_flags)
+        );
+    }
+    writeln!(serial, "    Kernel IRETQ test passed!").ok();
+
+    // Debug: Print the GDT segment descriptor values
+    writeln!(serial, "").ok();
+    writeln!(serial, ">>> Verifying GDT entries...").ok();
+    let user_cs = arch::x86_64::gdt::user_cs();
+    let user_ds = arch::x86_64::gdt::user_ds();
+    writeln!(serial, "    USER_CS selector: {:#x}", user_cs).ok();
+    writeln!(serial, "    USER_DS selector: {:#x}", user_ds).ok();
+
+    // Test loading user data segment while in kernel mode
+    // This should work: loading a DPL=3 data segment is legal regardless
+    // of CPL, since the check only requires DPL >= max(CPL, RPL)
+    writeln!(serial, "    Testing user segment load in kernel mode...").ok();
+    unsafe {
+        core::arch::asm!(
+            "mov ax, {0:x}",
+            "mov ds, ax",   // This might fail with GPF if segment is invalid
+            "mov ax, 0x10", // Restore kernel data segment
+            "mov ds, ax",
+            in(reg) user_ds as u64,
+            out("rax") _,
+            options(nostack, preserves_flags)
+        );
+    }
+    writeln!(serial, "    User segment load test passed!").ok();
+
+    writeln!(serial, "").ok();
"").ok(); + writeln!(serial, ">>> Jumping to user mode (ring 3)...").ok(); + writeln!(serial, " Entry: {:#x}, Stack: {:#x}", user_code_virt, user_stack_top).ok(); + + // Jump to user mode! (This won't return) + unsafe { + process::jump_to_user(user_code_virt.as_u64(), user_stack_top); } } diff --git a/src/memory/paging.rs b/src/memory/paging.rs index cd5f1fe..e78a398 100644 --- a/src/memory/paging.rs +++ b/src/memory/paging.rs @@ -444,15 +444,18 @@ pub fn remove_identity_mapping() { // ============================================================================ /// Ensure a PML4 entry exists, creating a PDPT if necessary -fn ensure_pml4_entry(pml4_idx: usize, _flags: u64) -> Result<(), PagingError> { +fn ensure_pml4_entry(pml4_idx: usize, page_flags: u64) -> Result<(), PagingError> { let entry = read_pml4(pml4_idx); if !entry.is_present() { let frame = allocate_frame()?; let phys = frame.start_address(); // Link the new PDPT into the PML4 first - // Use only table flags (PRESENT | WRITABLE) for intermediate entries - let table_flags = flags::PRESENT | flags::WRITABLE; + // For user pages, the USER bit must be set in all intermediate entries + let mut table_flags = flags::PRESENT | flags::WRITABLE; + if page_flags & flags::USER != 0 { + table_flags |= flags::USER; + } let new_entry = PageTableEntry::new(phys, table_flags); write_pml4(pml4_idx, new_entry); @@ -462,13 +465,18 @@ fn ensure_pml4_entry(pml4_idx: usize, _flags: u64) -> Result<(), PagingError> { // Zero the new page table via recursive mapping // Now that PML4[pml4_idx] is set, pdpt_table_addr gives us access zero_page_table(pdpt_table_addr(pml4_idx)); + } else if page_flags & flags::USER != 0 && !entry.is_user() { + // Existing entry needs USER bit added + let mut updated = entry; + updated.set_flags(entry.flags() | flags::USER); + write_pml4(pml4_idx, updated); } Ok(()) } /// Ensure a PDPT entry exists, creating a PD if necessary -fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(), PagingError> { - ensure_pml4_entry(pml4_idx, flags)?; +fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, page_flags: u64) -> Result<(), PagingError> { + ensure_pml4_entry(pml4_idx, page_flags)?; let entry = read_pdpt(pml4_idx, pdpt_idx); if entry.is_huge() { @@ -479,8 +487,11 @@ fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(), let phys = frame.start_address(); // Link the new PD into the PDPT first - // Use only table flags for intermediate entries - let table_flags = flags::PRESENT | flags::WRITABLE; + // For user pages, the USER bit must be set in all intermediate entries + let mut table_flags = flags::PRESENT | flags::WRITABLE; + if page_flags & flags::USER != 0 { + table_flags |= flags::USER; + } let new_entry = PageTableEntry::new(phys, table_flags); write_pdpt(pml4_idx, pdpt_idx, new_entry); @@ -489,13 +500,18 @@ fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(), // Zero the new page table via recursive mapping zero_page_table(pd_table_addr(pml4_idx, pdpt_idx)); + } else if page_flags & flags::USER != 0 && !entry.is_user() { + // Existing entry needs USER bit added + let mut updated = entry; + updated.set_flags(entry.flags() | flags::USER); + write_pdpt(pml4_idx, pdpt_idx, updated); } Ok(()) } /// Ensure a PD entry exists, creating a PT if necessary -fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: usize, pd_idx: usize, flags: u64) -> Result<(), PagingError> { - ensure_pdpt_entry(pml4_idx, pdpt_idx, flags)?; +fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: 
diff --git a/src/memory/paging.rs b/src/memory/paging.rs
index cd5f1fe..e78a398 100644
--- a/src/memory/paging.rs
+++ b/src/memory/paging.rs
@@ -444,15 +444,18 @@ pub fn remove_identity_mapping() {
 // ============================================================================
 
 /// Ensure a PML4 entry exists, creating a PDPT if necessary
-fn ensure_pml4_entry(pml4_idx: usize, _flags: u64) -> Result<(), PagingError> {
+fn ensure_pml4_entry(pml4_idx: usize, page_flags: u64) -> Result<(), PagingError> {
     let entry = read_pml4(pml4_idx);
 
     if !entry.is_present() {
         let frame = allocate_frame()?;
         let phys = frame.start_address();
 
         // Link the new PDPT into the PML4 first
-        // Use only table flags (PRESENT | WRITABLE) for intermediate entries
-        let table_flags = flags::PRESENT | flags::WRITABLE;
+        // For user pages, the USER bit must be set in all intermediate entries
+        let mut table_flags = flags::PRESENT | flags::WRITABLE;
+        if page_flags & flags::USER != 0 {
+            table_flags |= flags::USER;
+        }
         let new_entry = PageTableEntry::new(phys, table_flags);
         write_pml4(pml4_idx, new_entry);
 
@@ -462,13 +465,18 @@ fn ensure_pml4_entry(pml4_idx: usize, _flags: u64) -> Result<(), PagingError> {
         // Zero the new page table via recursive mapping
         // Now that PML4[pml4_idx] is set, pdpt_table_addr gives us access
         zero_page_table(pdpt_table_addr(pml4_idx));
+    } else if page_flags & flags::USER != 0 && !entry.is_user() {
+        // Existing entry needs USER bit added
+        let mut updated = entry;
+        updated.set_flags(entry.flags() | flags::USER);
+        write_pml4(pml4_idx, updated);
     }
 
     Ok(())
 }
 
 /// Ensure a PDPT entry exists, creating a PD if necessary
-fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(), PagingError> {
-    ensure_pml4_entry(pml4_idx, flags)?;
+fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, page_flags: u64) -> Result<(), PagingError> {
+    ensure_pml4_entry(pml4_idx, page_flags)?;
 
     let entry = read_pdpt(pml4_idx, pdpt_idx);
     if entry.is_huge() {
@@ -479,8 +487,11 @@ fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(),
         let phys = frame.start_address();
 
         // Link the new PD into the PDPT first
-        // Use only table flags for intermediate entries
-        let table_flags = flags::PRESENT | flags::WRITABLE;
+        // For user pages, the USER bit must be set in all intermediate entries
+        let mut table_flags = flags::PRESENT | flags::WRITABLE;
+        if page_flags & flags::USER != 0 {
+            table_flags |= flags::USER;
+        }
         let new_entry = PageTableEntry::new(phys, table_flags);
         write_pdpt(pml4_idx, pdpt_idx, new_entry);
 
@@ -489,13 +500,18 @@ fn ensure_pdpt_entry(pml4_idx: usize, pdpt_idx: usize, flags: u64) -> Result<(),
         // Zero the new page table via recursive mapping
         zero_page_table(pd_table_addr(pml4_idx, pdpt_idx));
+    } else if page_flags & flags::USER != 0 && !entry.is_user() {
+        // Existing entry needs USER bit added
+        let mut updated = entry;
+        updated.set_flags(entry.flags() | flags::USER);
+        write_pdpt(pml4_idx, pdpt_idx, updated);
     }
 
     Ok(())
 }
 
 /// Ensure a PD entry exists, creating a PT if necessary
-fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: usize, pd_idx: usize, flags: u64) -> Result<(), PagingError> {
-    ensure_pdpt_entry(pml4_idx, pdpt_idx, flags)?;
+fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: usize, pd_idx: usize, page_flags: u64) -> Result<(), PagingError> {
+    ensure_pdpt_entry(pml4_idx, pdpt_idx, page_flags)?;
 
     let entry = read_pd(pml4_idx, pdpt_idx, pd_idx);
     if entry.is_huge() {
@@ -506,8 +522,11 @@ fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: usize, pd_idx: usize, flags: u64)
         let phys = frame.start_address();
 
         // Link the new PT into the PD first
-        // Use only table flags for intermediate entries
-        let table_flags = flags::PRESENT | flags::WRITABLE;
+        // For user pages, the USER bit must be set in all intermediate entries
+        let mut table_flags = flags::PRESENT | flags::WRITABLE;
+        if page_flags & flags::USER != 0 {
+            table_flags |= flags::USER;
+        }
         let new_entry = PageTableEntry::new(phys, table_flags);
         write_pd(pml4_idx, pdpt_idx, pd_idx, new_entry);
 
@@ -516,6 +535,11 @@ fn ensure_pd_entry(pml4_idx: usize, pdpt_idx: usize, pd_idx: usize, flags: u64)
         // Zero the new page table via recursive mapping
         zero_page_table(pt_table_addr(pml4_idx, pdpt_idx, pd_idx));
+    } else if page_flags & flags::USER != 0 && !entry.is_user() {
+        // Existing entry needs USER bit added
+        let mut updated = entry;
+        updated.set_flags(entry.flags() | flags::USER);
+        write_pd(pml4_idx, pdpt_idx, pd_idx, updated);
     }
 
     Ok(())
 }
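The rule the three `ensure_*` helpers now implement: a user-mode access is permitted only if the USER bit is set at every level of the page walk, so intermediate tables may be upgraded to USER but are never downgraded. A sketch of the effective check (USER is bit 2 of each entry; the function is illustrative):

    // Effective user accessibility of a 4 KiB mapping:
    fn user_accessible(pml4e: u64, pdpte: u64, pde: u64, pte: u64) -> bool {
        const USER: u64 = 1 << 2;
        // ANDing the bit across all levels mirrors what the MMU requires.
        (pml4e & pdpte & pde & pte & USER) != 0
    }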
diff --git a/src/process/mod.rs b/src/process/mod.rs
index 19532e8..ba3d782 100644
--- a/src/process/mod.rs
+++ b/src/process/mod.rs
@@ -219,10 +219,17 @@ fn init_page_table(pml4_phys: PhysAddr) -> Result<(), ProcessError> {
     }
 
     // Copy kernel-space entries (256-511) from current PML4
-    // These include the recursive mapping (510) and kernel mapping (511)
+    // EXCEPT for entry 510 (recursive mapping) which needs to point to THIS PML4
     for i in 256..512 {
-        let entry = paging::read_pml4(i);
-        core::ptr::write_volatile(pml4_ptr.add(i), entry.bits());
+        if i == 510 {
+            // Set recursive mapping to point to this new PML4 itself
+            // Use PRESENT | WRITABLE flags (same as kernel recursive mapping)
+            let self_ref_entry = pml4_phys.as_u64() | flags::PRESENT | flags::WRITABLE;
+            core::ptr::write_volatile(pml4_ptr.add(i), self_ref_entry);
+        } else {
+            let entry = paging::read_pml4(i);
+            core::ptr::write_volatile(pml4_ptr.add(i), entry.bits());
+        }
     }
 }
 
@@ -321,3 +328,93 @@ pub fn switch_to_kernel() {
         );
     }
 }
+
+/// Jump to user mode (ring 3)
+///
+/// This uses IRETQ to transition from ring 0 to ring 3.
+///
+/// # Arguments
+/// * `entry` - User code entry point
+/// * `stack` - User stack pointer
+///
+/// # Safety
+/// The entry point and stack must be valid mapped addresses in user space.
+/// The current process's page table must have proper user mappings.
+pub unsafe fn jump_to_user(entry: u64, stack: u64) -> ! {
+    use crate::arch::x86_64::gdt;
+    use crate::serial::SerialPort;
+    use core::fmt::Write;
+
+    let user_cs = gdt::user_cs() as u64;
+    let user_ds = gdt::user_ds() as u64;
+
+    // Debug: Print what we're about to push
+    let mut serial = unsafe { SerialPort::new(0x3F8) };
+    writeln!(serial, "    IRETQ frame: SS={:#x} RSP={:#x} RFLAGS=0x202 CS={:#x} RIP={:#x}",
+             user_ds, stack, user_cs, entry).ok();
+
+    // Verify the entry point is mapped and accessible
+    use crate::memory::paging;
+    use crate::memory::VirtAddr;
+    let entry_virt = VirtAddr::new(entry);
+    if let Some((phys, size, flags)) = paging::get_mapping_info(entry_virt) {
+        writeln!(serial, "    Entry mapping: phys={:#x} size={:?} flags={:#x}", phys, size, flags).ok();
+    } else {
+        writeln!(serial, "    WARNING: Entry point {:#x} is NOT MAPPED!", entry).ok();
+    }
+
+    // Verify the stack is mapped
+    let stack_virt = VirtAddr::new(stack - 8); // Stack will be decremented
+    if let Some((phys, size, flags)) = paging::get_mapping_info(stack_virt) {
+        writeln!(serial, "    Stack mapping: phys={:#x} size={:?} flags={:#x}", phys, size, flags).ok();
+    } else {
+        writeln!(serial, "    WARNING: Stack {:#x} is NOT MAPPED!", stack).ok();
+    }
+
+    // Flush TLB to ensure all page table changes are visible
+    // This reloads CR3 which flushes the entire TLB
+    unsafe {
+        let cr3: u64;
+        core::arch::asm!("mov {}, cr3", out(reg) cr3, options(nostack, preserves_flags));
+        core::arch::asm!("mov cr3, {}", in(reg) cr3, options(nostack, preserves_flags));
+    }
+
+    // IRETQ expects the stack to contain (top to bottom):
+    //   [RSP+0]  RIP    - last pushed, first popped
+    //   [RSP+8]  CS
+    //   [RSP+16] RFLAGS
+    //   [RSP+24] RSP
+    //   [RSP+32] SS     - first pushed, last popped
+    //
+    // Note: DS/ES/FS/GS must not carry ring-0 selectors into ring 3;
+    // the null selector (0) is valid in 64-bit mode, so we load that.
+    unsafe {
+        core::arch::asm!(
+            // Set DS/ES/FS/GS to null using r11 - avoid clobbering input registers
+            "xor r11d, r11d",
+            "mov ds, r11w",
+            "mov es, r11w",
+            "mov fs, r11w",
+            "mov gs, r11w",
+
+            // Memory barrier to ensure all stores are complete
+            "mfence",
+
+            // Build IRETQ frame on stack
+            "push {user_ss}", // SS
+            "push {stack}",   // RSP
+            "push 0x202",     // RFLAGS (IF=1, reserved bit 1 = 1)
+            "push {user_cs}", // CS
+            "push {entry}",   // RIP
+
+            // Jump to user mode
+            "iretq",
+
+            user_cs = in(reg) user_cs,
+            user_ss = in(reg) user_ds, // SS same as DS for user mode
+            entry = in(reg) entry,
+            stack = in(reg) stack,
+            options(noreturn)
+        );
+    }
+}
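The entry-510 special case in init_page_table is what keeps the recursive mapping coherent per process: slot 510 of each PML4 must point at that PML4's own frame, not the kernel's. With that in place, every address space sees its own PML4 at the same fixed virtual address. A sketch of the arithmetic, assuming recursive slot 510 (the constant names are illustrative):

    // Walking through slot 510 four times lands on the PML4 itself.
    const R: u64 = 510;
    const PML4_VIRT: u64 = 0xFFFF_0000_0000_0000 // sign extension for indices >= 256
        | (R << 39) | (R << 30) | (R << 21) | (R << 12);
    const _: () = assert!(PML4_VIRT == 0xFFFF_FF7F_BFDF_E000);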