Replace INT 0x80 with native SYSCALL/SYSRET

Implement fast system calls using the x86-64 SYSCALL/SYSRET instructions:

- Configure MSRs: EFER (enable SCE), STAR (segment selectors),
  LSTAR (entry point), SFMASK (disable interrupts on entry)
- Add per-CPU data structure accessed via GS segment for kernel
  stack pointer storage during privilege transitions
- Use SWAPGS in syscall entry to access per-CPU data and switch
  from user to kernel stack
- Add SWAPGS before IRETQ to user mode to set up correct GS state
  for subsequent syscalls

SYSCALL/SYSRET is significantly faster than INT 0x80 as it avoids
IDT lookup and uses direct MSR-configured entry.
This commit is contained in:
wilkie
2025-12-28 02:18:33 -05:00
parent 4e76831fb0
commit 1842896ee9
3 changed files with 302 additions and 82 deletions

View File

@@ -1,20 +1,119 @@
//! System Call Interface for x86-64
//! System Call Interface for x86-64 using SYSCALL/SYSRET
//!
//! This module implements system calls using INT 0x80.
//! This is simpler than SYSCALL/SYSRET and uses the existing IDT infrastructure.
//! This module implements fast system calls using the native x86-64
//! SYSCALL and SYSRET instructions.
//!
//! Syscall Convention:
//! Syscall Convention (Linux-compatible):
//! - RAX = syscall number
//! - RDI, RSI, RDX, R10, R8, R9 = arguments
//! - RAX = return value
//! - RCX and R11 are clobbered (used by SYSCALL for RIP and RFLAGS)
//!
//! On SYSCALL entry:
//! - RCX = return RIP
//! - R11 = return RFLAGS
//! - CS = kernel code segment (from STAR)
//! - SS = kernel data segment (from STAR)
//! - RSP = unchanged (still user stack!)
//!
//! We use SWAPGS to access per-CPU data containing the kernel stack pointer.
use core::arch::asm;
use crate::arch::x86_64::gdt;
/// System call interrupt vector
pub const SYSCALL_VECTOR: u8 = 0x80;
// ============================================================================
// MSR Definitions
// ============================================================================
/// Extended Feature Enable Register
const IA32_EFER: u32 = 0xC0000080;
/// System Call Extensions enable bit in EFER
const EFER_SCE: u64 = 1 << 0;
/// STAR: Segment selectors for SYSCALL/SYSRET
/// Bits 47:32 = SYSCALL CS (SS = CS + 8)
/// Bits 63:48 = SYSRET CS base (CS = base + 16, SS = base + 8)
const IA32_STAR: u32 = 0xC0000081;
/// LSTAR: Target RIP for SYSCALL (Long Mode)
const IA32_LSTAR: u32 = 0xC0000082;
/// SFMASK: RFLAGS mask for SYSCALL (bits set here are cleared in RFLAGS)
const IA32_SFMASK: u32 = 0xC0000084;
/// GS base for current privilege level
const IA32_GS_BASE: u32 = 0xC0000101;
/// GS base swapped by SWAPGS
const IA32_KERNEL_GS_BASE: u32 = 0xC0000102;
// ============================================================================
// MSR Access
// ============================================================================
/// Read a Model-Specific Register
#[inline]
unsafe fn rdmsr(msr: u32) -> u64 {
let low: u32;
let high: u32;
unsafe {
asm!(
"rdmsr",
in("ecx") msr,
out("eax") low,
out("edx") high,
options(nostack, nomem, preserves_flags)
);
}
((high as u64) << 32) | (low as u64)
}
/// Write a Model-Specific Register
#[inline]
unsafe fn wrmsr(msr: u32, value: u64) {
let low = value as u32;
let high = (value >> 32) as u32;
unsafe {
asm!(
"wrmsr",
in("ecx") msr,
in("eax") low,
in("edx") high,
options(nostack, nomem, preserves_flags)
);
}
}
// ============================================================================
// Per-CPU Data
// ============================================================================
/// Per-CPU data structure accessed via GS segment
///
/// This is used by the SYSCALL entry point to get the kernel stack.
/// Fields are at fixed offsets used by assembly code.
#[repr(C)]
pub struct PerCpuData {
/// Kernel stack pointer (offset 0)
/// Set to top of kernel stack for this CPU
pub kernel_rsp: u64,
/// User stack pointer (offset 8)
/// Saved here during SYSCALL, restored on SYSRET
pub user_rsp: u64,
/// Current process pointer (offset 16)
pub current_process: u64,
}
/// Static per-CPU data (for single-CPU system)
static mut PER_CPU: PerCpuData = PerCpuData {
kernel_rsp: 0,
user_rsp: 0,
current_process: 0,
};
// ============================================================================
// System Call Numbers
// ============================================================================
/// System call numbers
pub mod numbers {
pub const EXIT: u64 = 0;
pub const WRITE: u64 = 1;
@@ -22,99 +121,200 @@ pub mod numbers {
pub const GETPID: u64 = 3;
}
/// Initialize the syscall interface
///
/// This adds the INT 0x80 handler to the IDT.
pub fn init() {
use crate::arch::x86_64::interrupts::{GateType, set_handler};
// ============================================================================
// RFLAGS bits to mask on SYSCALL
// ============================================================================
// Set up INT 0x80 as a trap gate with DPL 3 (user-callable)
set_handler(SYSCALL_VECTOR, syscall_entry as *const () as u64, GateType::Trap, 3);
const RFLAGS_IF: u64 = 1 << 9; // Interrupt enable
const RFLAGS_TF: u64 = 1 << 8; // Trap flag (single-step)
const RFLAGS_DF: u64 = 1 << 10; // Direction flag
const RFLAGS_AC: u64 = 1 << 18; // Alignment check
const RFLAGS_NT: u64 = 1 << 14; // Nested task
/// Flags to clear on SYSCALL entry
/// We clear IF (disable interrupts), TF (no single-step), DF (clear direction)
const SFMASK_VALUE: u64 = RFLAGS_IF | RFLAGS_TF | RFLAGS_DF | RFLAGS_AC | RFLAGS_NT;
// ============================================================================
// Initialization
// ============================================================================
/// Initialize the SYSCALL/SYSRET mechanism
///
/// This configures the MSRs and sets up per-CPU data.
/// Must be called after GDT is set up with user segments.
pub fn init(kernel_stack: u64) {
unsafe {
// Set up per-CPU data via raw pointer
let per_cpu = &raw mut PER_CPU;
(*per_cpu).kernel_rsp = kernel_stack;
(*per_cpu).user_rsp = 0;
(*per_cpu).current_process = 0;
// Set GS bases for SWAPGS
// When in user mode: GS_BASE = user value, KERNEL_GS_BASE = &PER_CPU
// When in kernel mode: GS_BASE = &PER_CPU, KERNEL_GS_BASE = user value
// We start in kernel mode, so set GS_BASE to PER_CPU
let per_cpu_addr = per_cpu as u64;
wrmsr(IA32_GS_BASE, per_cpu_addr);
wrmsr(IA32_KERNEL_GS_BASE, 0); // User's GS base (0 for now)
// Enable System Call Extensions in EFER
let efer = rdmsr(IA32_EFER);
wrmsr(IA32_EFER, efer | EFER_SCE);
// Set up STAR: segment selectors
// Bits 47:32 = SYSCALL CS and SS (kernel segments)
// Bits 63:48 = SYSRET CS and SS base (user segments)
//
// Our GDT layout:
// 0x08: Kernel code
// 0x10: Kernel data
// 0x18: User data
// 0x20: User code
//
// For SYSCALL: CS = STAR[47:32], SS = STAR[47:32] + 8
// We want CS = 0x08, SS = 0x10, so STAR[47:32] = 0x08
//
// For SYSRET (64-bit): CS = STAR[63:48] + 16, SS = STAR[63:48] + 8
// We want CS = 0x20, SS = 0x18, so STAR[63:48] = 0x10
let star = (0x10u64 << 48) | (0x08u64 << 32);
wrmsr(IA32_STAR, star);
// Set up LSTAR: syscall entry point
wrmsr(IA32_LSTAR, syscall_entry as *const () as u64);
// Set up SFMASK: flags to clear on SYSCALL
wrmsr(IA32_SFMASK, SFMASK_VALUE);
}
}
/// Syscall entry point
/// Update the kernel stack pointer in per-CPU data
pub fn set_kernel_stack(stack: u64) {
unsafe {
let per_cpu = &raw mut PER_CPU;
(*per_cpu).kernel_rsp = stack;
}
}
// ============================================================================
// SYSCALL Entry Point
// ============================================================================
/// SYSCALL entry point
///
/// This is registered as the INT 0x80 handler.
/// Stack on entry (pushed by CPU):
/// - SS, RSP, RFLAGS, CS, RIP (if from ring 3)
/// - Error code (none for INT)
/// On entry:
/// - RCX = user RIP (return address)
/// - R11 = user RFLAGS
/// - RAX = syscall number
/// - RDI, RSI, RDX, R10, R8, R9 = arguments
/// - RSP = user stack (not switched!)
/// - Interrupts are disabled (IF cleared by SFMASK)
///
/// We must:
/// 1. SWAPGS to get access to per-CPU data
/// 2. Save user RSP and load kernel RSP
/// 3. Save registers and call handler
/// 4. Restore registers
/// 5. Load user RSP
/// 6. SWAPGS back
/// 7. SYSRET
#[unsafe(naked)]
extern "C" fn syscall_entry() {
core::arch::naked_asm!(
// No error code for software interrupts
// Save all registers
"push rax",
// SWAPGS: now GS points to per-CPU data
"swapgs",
// Save user stack pointer to per-CPU data
"mov gs:[8], rsp",
// Load kernel stack pointer from per-CPU data
"mov rsp, gs:[0]",
// Now we're on the kernel stack. Build a stack frame.
// We need to save enough state to call the handler and return.
// Push user context (for potential inspection and SYSRET)
"push gs:[8]", // User RSP (from per-CPU)
"push r11", // User RFLAGS
"push rcx", // User RIP
// Push syscall arguments (callee-saved perspective)
"push rax", // Syscall number
"push rdi", // arg1
"push rsi", // arg2
"push rdx", // arg3
"push r10", // arg4 (note: r10 instead of rcx which is clobbered)
"push r8", // arg5
"push r9", // arg6
// Save remaining callee-saved registers
"push rbx",
"push rcx",
"push rdx",
"push rsi",
"push rdi",
"push rbp",
"push r8",
"push r9",
"push r10",
"push r11",
"push r12",
"push r13",
"push r14",
"push r15",
// Call Rust syscall handler
// First arg (RDI) = pointer to saved state
// Call Rust handler with pointer to stack frame
"mov rdi, rsp",
"call {handler}",
// Return value is in RAX, save it to the stack frame
"mov [rsp + 14*8], rax", // Overwrite saved RAX
// RAX now contains return value
// Restore all registers
// Restore callee-saved registers
"pop r15",
"pop r14",
"pop r13",
"pop r12",
"pop r11",
"pop r10",
"pop r9",
"pop r8",
"pop rbp",
"pop rdi",
"pop rsi",
"pop rdx",
"pop rcx",
"pop rbx",
"pop rax",
// Return from interrupt
"iretq",
// Skip over saved arguments (7 values: rax, rdi, rsi, rdx, r10, r8, r9)
"add rsp, 7 * 8",
// Pop user context for SYSRET
"pop rcx", // User RIP -> RCX for SYSRET
"pop r11", // User RFLAGS -> R11 for SYSRET
"pop rsp", // User RSP (direct pop since we're switching stacks)
// SWAPGS back: restore user's GS base
"swapgs",
// Return to user mode
// SYSRETQ: RIP = RCX, RFLAGS = R11, CS = STAR[63:48]+16, SS = STAR[63:48]+8
"sysretq",
handler = sym syscall_handler_rust,
);
}
/// Saved register state for syscall
// ============================================================================
// Syscall Handler
// ============================================================================
/// Stack frame layout for syscall
#[repr(C)]
pub struct SyscallFrame {
// Callee-saved (pushed last, at lower addresses)
pub r15: u64,
pub r14: u64,
pub r13: u64,
pub r12: u64,
pub r11: u64,
pub r10: u64,
pub r9: u64,
pub r8: u64,
pub rbp: u64,
pub rdi: u64,
pub rsi: u64,
pub rdx: u64,
pub rcx: u64,
pub rbx: u64,
pub rax: u64,
// CPU-pushed
pub rip: u64,
pub cs: u64,
pub rflags: u64,
pub rsp: u64,
pub ss: u64,
// Arguments
pub r9: u64, // arg6
pub r8: u64, // arg5
pub r10: u64, // arg4
pub rdx: u64, // arg3
pub rsi: u64, // arg2
pub rdi: u64, // arg1
pub rax: u64, // syscall number
// User context (for SYSRET)
pub user_rip: u64,
pub user_rflags: u64,
pub user_rsp: u64,
}
/// Rust syscall handler
@@ -123,8 +323,8 @@ extern "C" fn syscall_handler_rust(frame: &SyscallFrame) -> u64 {
let arg1 = frame.rdi;
let arg2 = frame.rsi;
let arg3 = frame.rdx;
let arg4 = frame.r10;
let arg5 = frame.r8;
let _arg4 = frame.r10;
let _arg5 = frame.r8;
match num {
numbers::EXIT => {
@@ -184,47 +384,59 @@ extern "C" fn syscall_handler_rust(frame: &SyscallFrame) -> u64 {
}
}
/// Make a syscall from user mode (for testing)
// ============================================================================
// User-space syscall wrappers (using SYSCALL instruction)
// ============================================================================
/// Make a syscall with no arguments
#[inline]
pub unsafe fn syscall0(num: u64) -> u64 {
let ret: u64;
unsafe {
asm!(
"int 0x80",
"syscall",
in("rax") num,
lateout("rax") ret,
out("rcx") _, // clobbered by SYSCALL
out("r11") _, // clobbered by SYSCALL
options(nostack)
);
}
ret
}
/// Make a syscall with one argument
#[inline]
pub unsafe fn syscall1(num: u64, arg1: u64) -> u64 {
let ret: u64;
unsafe {
asm!(
"int 0x80",
"syscall",
in("rax") num,
in("rdi") arg1,
lateout("rax") ret,
out("rcx") _,
out("r11") _,
options(nostack)
);
}
ret
}
/// Make a syscall with three arguments
#[inline]
pub unsafe fn syscall3(num: u64, arg1: u64, arg2: u64, arg3: u64) -> u64 {
let ret: u64;
unsafe {
asm!(
"int 0x80",
"syscall",
in("rax") num,
in("rdi") arg1,
in("rsi") arg2,
in("rdx") arg3,
lateout("rax") ret,
out("rcx") _,
out("r11") _,
options(nostack)
);
}

View File

@@ -325,11 +325,11 @@ pub fn kernel_init(info: &BootInfo) -> ! {
writeln!(serial, " GDT with TSS initialized").ok();
writeln!(serial, " Kernel stack at {:#x}", kernel_stack_top).ok();
// Initialize syscall interface (INT 0x80)
// Initialize SYSCALL/SYSRET interface
writeln!(serial, "").ok();
writeln!(serial, ">>> Initializing syscall interface...").ok();
arch::x86_64::syscall::init();
writeln!(serial, " INT 0x80 syscall handler installed").ok();
writeln!(serial, ">>> Initializing SYSCALL/SYSRET...").ok();
arch::x86_64::syscall::init(kernel_stack_top);
writeln!(serial, " SYSCALL/SYSRET configured").ok();
// Set up a dedicated stack for double fault handling (IST1)
// This ensures the double fault handler has a known-good stack even if
@@ -412,6 +412,7 @@ pub fn kernel_init(info: &BootInfo) -> ! {
// Write a simple user program that:
// 1. Calls write(1, "Hello from user mode!\n", 22)
// 2. Calls exit(0)
// Uses native SYSCALL instruction (0x0f 0x05) instead of int 0x80
let user_code_ptr = user_code_virt.as_u64() as *mut u8;
let message = b"Hello from user mode!\n";
let message_offset = 64u64; // Place message after code
@@ -422,8 +423,7 @@ pub fn kernel_init(info: &BootInfo) -> ! {
0x48, 0xc7, 0xc0, 0x01, 0x00, 0x00, 0x00,
// mov rdi, 1 (fd = stdout)
0x48, 0xc7, 0xc7, 0x01, 0x00, 0x00, 0x00,
// lea rsi, [rip + message_offset] - we'll use absolute address instead
// mov rsi, 0x400040 (message address = code_base + 64)
// mov rsi, message_address (code_base + 64)
0x48, 0xbe,
((user_code_virt.as_u64() + message_offset) & 0xFF) as u8,
(((user_code_virt.as_u64() + message_offset) >> 8) & 0xFF) as u8,
@@ -435,14 +435,14 @@ pub fn kernel_init(info: &BootInfo) -> ! {
(((user_code_virt.as_u64() + message_offset) >> 56) & 0xFF) as u8,
// mov rdx, 22 (length)
0x48, 0xc7, 0xc2, 0x16, 0x00, 0x00, 0x00,
// int 0x80
0xcd, 0x80,
// syscall
0x0f, 0x05,
// mov rax, 0 (EXIT syscall)
0x48, 0xc7, 0xc0, 0x00, 0x00, 0x00, 0x00,
// mov rdi, 0 (exit code)
0x48, 0xc7, 0xc7, 0x00, 0x00, 0x00, 0x00,
// int 0x80
0xcd, 0x80,
// syscall
0x0f, 0x05,
// hlt (should never reach here)
0xf4,
];

View File

@@ -388,18 +388,26 @@ pub unsafe fn jump_to_user(entry: u64, stack: u64) -> ! {
//
// Note: DS/ES/FS/GS must be set to valid user selectors before IRETQ
// when transitioning to ring 3. Using null (0) is valid in 64-bit mode.
//
// IMPORTANT: We must SWAPGS before going to user mode so that:
// - User mode has GS_BASE = user's TLS (0 for now)
// - KERNEL_GS_BASE = per-CPU data pointer
// This way, when SYSCALL happens, SWAPGS in the entry point will
// give the kernel access to per-CPU data.
unsafe {
core::arch::asm!(
// Set DS/ES/FS/GS to null using r11 - avoid clobbering input registers
// Set DS/ES to null using r11 - avoid clobbering input registers
"xor r11d, r11d",
"mov ds, r11w",
"mov es, r11w",
"mov fs, r11w",
"mov gs, r11w",
// Memory barrier to ensure all stores are complete
"mfence",
// SWAPGS: swap kernel GS with user GS before entering user mode
// After this: GS_BASE = user's GS (0), KERNEL_GS_BASE = per-CPU data
"swapgs",
// Build IRETQ frame on stack
"push {user_ss}", // SS
"push {stack}", // RSP