diff --git a/ports/stm8/README b/ports/stm8/README
index 2f6c198..b4c7366 100644
--- a/ports/stm8/README
+++ b/ports/stm8/README
@@ -436,4 +436,71 @@ included in any STM8 products. If it is included in future products then
 you will need to put the device in the stack compatible mode described.
 
 
+---------------------------------------------------------------------------
+
+WRITING NEW INTERRUPT HANDLERS
+
+All interrupt handlers which will call out to the OS kernel and potentially
+cause a thread switch must call atomIntEnter() and atomIntExit(). An
+example of this can be seen in the timer tick ISR in atomport.c.
+
+With the Cosmic compiler port it is also necessary to add the @svlreg
+modifier to any interrupt handlers which call out to the OS kernel.
+Alternatively you may use the INTERRUPT macro from atomport-private.h which
+always adds the @svlreg modifier. This modifier ensures that the c_lreg
+virtual register is saved on the interrupted thread's stack for any
+preemptive context switches. It also ensures that longs are available for
+use within any OS kernel code called as part of the interrupt handling. 
+
+You may also implement fast interrupt handlers in the system which do not
+call atomIntEnter()/atomIntExit() and which do not need the @svlreg
+modifier. However, these ISRs cannot perform OS functions such as posting
+semaphores or effecting a thread switch.
+
+
+---------------------------------------------------------------------------
+
+COSMIC COMPILER VIRTUAL REGISTERS
+
+The STM8 has very few CPU registers, so the Cosmic compiler augments
+them with three "virtual" registers, which are simply locations in fast
+memory. These registers are called c_x, c_y and c_lreg.
+
+The Atomthreads context switch for Cosmic/STM8 takes advantage of the fact
+that the compiler automatically saves on the stack any CPU and virtual
+registers which must be preserved when calling out to C functions.
+
+For cooperative context switches (where a thread calls an OS kernel
+function to schedule itself out), any of these registers which should be
+preserved across the function call are automatically saved on the stack by
+the compiler before the context switch is even called. This means that no
+CPU or virtual registers actually have to be saved in the context switch
+routine, making cooperative switches potentially very cheap if few
+registers must be preserved.
+
+For preemptive switches (where an ISR has interrupted a thread and wishes
+to switch to a new thread), the interrupt handler prologue also saves all
+CPU and virtual registers. In this case all registers must always be saved
+because the ISR has no knowledge of what registers the interrupted thread
+was using, so we cannot take advantage of the potential for saving fewer
+than the full set of registers that we achieve with cooperative switches.
+With the Cosmic compiler, interrupt handlers that call out to C functions
+(as would happen on a thread switch) always save the CPU registers and
+the virtual registers c_x and c_y. For the Atomthreads port we force
+interrupt handlers to also save the virtual register c_lreg. This is to
+ensure that the interrupted thread's c_lreg value is preserved across a
+thread switch, but also ensures that longs can be used within the OS
+kernel code called by interrupt handlers (c_lreg is used by the compiler
+for handling longs and floats).
+
+An alternative scheme would be to not save c_lreg in all interrupt
+handlers and instead save it in the context-switch function. This would
+allow interrupt handlers to avoid saving the 4-byte c_lreg on the stack,
+but it would mean that any OS kernel code called by interrupt handlers
+could not deal with longs, which would be an unfortunate burden on the
+core portable OS code just for the benefit of this one architecture and
+compiler. It would also mean that c_lreg is always saved unnecessarily
+for every cooperative context switch. 
+
+
 ---------------------------------------------------------------------------
diff --git a/ports/stm8/atomport-asm-cosmic.s b/ports/stm8/atomport-asm-cosmic.s
index d67e731..7e9e2c2 100644
--- a/ports/stm8/atomport-asm-cosmic.s
+++ b/ports/stm8/atomport-asm-cosmic.s
@@ -36,19 +36,19 @@ xdef _archContextSwitch, _archFirstThreadRestore
 
 
 ;  \b archContextSwitch
-; 
+;
 ;  Architecture-specific context switch routine.
-; 
+;
 ;  Note that interrupts are always locked out when this routine is
 ;  called. For cooperative switches, the scheduler will have entered
 ;  a critical region. For preemptions (called from an ISR), the
 ;  ISR will have disabled interrupts on entry.
-; 
+;
 ;  @param[in] old_tcb_ptr Pointer to the thread being scheduled out
 ;  @param[in] new_tcb_ptr Pointer to the thread being scheduled in
-; 
+;
 ;  @return None
-; 
+;
 ;  void archContextSwitch (ATOM_TCB *old_tcb_ptr, ATOM_TCB *new_tcb_ptr)
 _archContextSwitch:
 
@@ -63,7 +63,11 @@ _archContextSwitch:
     ; PC: Program counter
     ; CC: Code condition register
     ;
-    ; 
+    ; Cosmic compiler virtual registers:
+    ;
+    ; c_x, c_y: Scratch memory areas saved by ISRs
+    ; c_lreg: Scratch memory area only saved by ISRs with @svlreg
+    ;
     ; If this is a cooperative context switch (a thread has called us
     ; to schedule itself out), the Cosmic compiler will have saved any
     ; of the registers which it does not want us to clobber. There are
@@ -77,15 +81,19 @@ _archContextSwitch:
     ; similarly saved all registers which it needs us not to clobber
     ; which in the case of this compiler is all registers. Again, we
     ; do not need to save any registers because no registers are
-	; expected to be unclobbered by a subroutine.
+	; expected to be unclobbered by a subroutine. Note that it is
+	; necessary to add the @svlreg modifier to ISRs which call out to
+	; the OS in order to force a save of c_lreg. The CPU registers
+	; and the c_x and c_y virtual registers are, however, always
+    ; saved by ISRs which call out to C subroutines.
     ;
     ; This is an unusual context switch routine, because it does not
 	; need to actually save any registers. Instead, the act of
 	; calling this function causes all registers which must not be
-	; clobbered to be saved on the stack anyway in the case of 
+	; clobbered to be saved on the stack anyway in the case of
 	; cooperative context switches. For preemptive switches, the
-	; interrupt service routine which calls out to here causes all
-	; registers to be saved in a similar fashion.
+	; interrupt service routine which calls out to here also causes
+	; all registers to be saved in a similar fashion.
 
     ; We do have to do some work in here though: we need to store
     ; the current stack pointer to the current thread's TCB, and
@@ -109,7 +117,7 @@ _archContextSwitch:
     ; Our stack frame now contains all registers (if this is a preemptive
     ; switch due to an interrupt handler) or those registers which the
     ; calling function did not wish to be clobbered (if this is a
-    ; cooperative context switch). It also contains the return address 
+    ; cooperative context switch). It also contains the return address
     ; which will be either a function called via an ISR (for preemptive
     ; switches) or a function called from thread context (for cooperative
     ; switches). Finally, the stack also contains the aforementioned
@@ -118,7 +126,7 @@ _archContextSwitch:
     ; In addition, the thread's stack pointer (after context-save) is
     ; stored in the thread's TCB.
 
-    ; We are now ready to restore the new thread's context. In most 
+    ; We are now ready to restore the new thread's context. In most
     ; architecture ports we would typically switch our stack pointer
     ; to the new thread's stack pointer, and pop all of its context
     ; off the stack, before returning to the caller (the original
@@ -135,11 +143,11 @@ _archContextSwitch:
 
     ; Pull the new_tcb_ptr parameter from the stack into X register
     ldw X,($3,SP)
-    
-    ; Pull the first entry out of new_tcb_ptr (the new thread's 
+
+    ; Pull the first entry out of new_tcb_ptr (the new thread's
     ; stack pointer) into X register.
     ldw X,(X)
-    
+
     ; Switch our current stack pointer to that of the new thread.
     ldw SP,X
 
@@ -161,7 +169,7 @@ _archContextSwitch:
     ; because this is a subroutine regardless of whether we were called
     ; during an ISR or by a thread cooperatively switching out. The
     ; difference between RET and IRET on the STM8 architecture is that
-    ; RET only pops the return address off the stack, while IRET also 
+    ; RET only pops the return address off the stack, while IRET also
     ; pops from the stack all of the CPU registers saved when the ISR
     ; started, including restoring the interrupt-enable bits from the CC
     ; register.
@@ -243,8 +251,8 @@ _archContextSwitch:
     ;    different thread's stack. Because the stack pointer is
     ;    switched, it does not matter that on entry via ISRs more
     ;    registers are saved on the original thread's stack than entries
-    ;    via non-ISRs. Those extra registers will be restored properly 
-    ;    by an IRET when the thread is eventually scheduled back in 
+    ;    via non-ISRs. Those extra registers will be restored properly
+    ;    by an IRET when the thread is eventually scheduled back in
     ;    (which could be a long way off). This assumes that the CPU does
     ;    not have hidden behaviour that occurs on interrupts, and we can
     ;    in fact trick it into leaving via another thread's call stack,
@@ -269,7 +277,7 @@ _archContextSwitch:
 ; data for being restored by either this function or the normal
 ; function used for scheduling threads in, archContextSwitch(). Only
 ; the first thread run by the system is launched via this function,
-; after which all other new threads will first be run by 
+; after which all other new threads will first be run by
 ; archContextSwitch().
 ;
 ; Typically ports will implement something similar here to the
@@ -277,21 +285,21 @@ _archContextSwitch:
 ; switch does not restore many registers, and instead relies on the
 ; fact that returning from any function which called
 ; archContextSwitch() will restore any of the necessary registers.
-; For new threads which have never been run there is no calling 
+; For new threads which have never been run there is no calling
 ; function which will restore context on return, therefore we
 ; do not restore many register values here. It is not necessary
 ; for the new threads to have initialised values for the scratch
-; registers A, X and Y or the code condition register CC which 
+; registers A, X and Y or the code condition register CC which
 ; leaves SP and PC. SP is restored because this is always needed to
-; switch to a new thread's stack context. It is not necessary to 
-; restore PC, because the thread's entry point is in the stack 
-; context (when this function returns using RET the PC is 
+; switch to a new thread's stack context. It is not necessary to
+; restore PC, because the thread's entry point is in the stack
+; context (when this function returns using RET the PC is
 ; automatically changed to the thread's entry point because the
-; entry point is stored in the preinitialised stack). 
+; entry point is stored in the preinitialised stack).
 ;
 ; When new threads are started interrupts must be enabled, so there
 ; is some scope for enabling interrupts in the CC here. It must be
-; done for all new threads, however, not just the first thread, so 
+; done for all new threads, however, not just the first thread, so
 ; we use a different system. We instead use a thread shell routine
 ; which all functions run when they are first started, and
 ; interrupts are enabled in there. This allows us to avoid having
@@ -327,7 +335,7 @@ _archFirstThreadRestore:
     ; As described above, first thread restores in this port do not
     ; expect any initial register context to be pre-initialised in
     ; the thread's stack area. The thread's initial stack need only
-    ; contain the thread's initial entry point, and we do not even 
+    ; contain the thread's initial entry point, and we do not even
     ; "restore" that within this function. We leave the thread's entry
     ; point in the stack, and RET at the end of the function pops it
     ; off and "returns" to the entry point as if we were called from
@@ -342,7 +350,7 @@ _archFirstThreadRestore:
     ; pointer it conveniently located at the top of the TCB so no
     ; indexing is required to pull it out.
     ldw X,(X)
-    
+
     ; Switch our current stack pointer to that of the new thread.
     ldw SP,X
 
@@ -353,4 +361,4 @@ _archFirstThreadRestore:
     ret
 
 
-    end
\ No newline at end of file
+    end
diff --git a/ports/stm8/atomport-asm-iar.s b/ports/stm8/atomport-asm-iar.s
index 5c99850..c6e6977 100644
--- a/ports/stm8/atomport-asm-iar.s
+++ b/ports/stm8/atomport-asm-iar.s
@@ -39,19 +39,19 @@
 
 
 ;  \b archContextSwitch
-; 
+;
 ;  Architecture-specific context switch routine.
-; 
+;
 ;  Note that interrupts are always locked out when this routine is
 ;  called. For cooperative switches, the scheduler will have entered
 ;  a critical region. For preemptions (called from an ISR), the
 ;  ISR will have disabled interrupts on entry.
-; 
+;
 ;  @param[in] old_tcb_ptr Pointer to the thread being scheduled out
 ;  @param[in] new_tcb_ptr Pointer to the thread being scheduled in
-; 
+;
 ;  @return None
-; 
+;
 ;  void archContextSwitch (ATOM_TCB *old_tcb_ptr, ATOM_TCB *new_tcb_ptr)
   PUBLIC archContextSwitch
 archContextSwitch:
@@ -190,7 +190,7 @@ archContextSwitch:
     ; because this is a subroutine regardless of whether we were called
     ; during an ISR or by a thread cooperatively switching out. The
     ; difference between RET and IRET on the STM8 architecture is that
-    ; RET only pops the return address off the stack, while IRET also 
+    ; RET only pops the return address off the stack, while IRET also
     ; pops from the stack all of the CPU registers saved when the ISR
     ; started, including restoring the interrupt-enable bits from the CC
     ; register.
@@ -272,8 +272,8 @@ archContextSwitch:
     ;    different thread's stack. Because the stack pointer is
     ;    switched, it does not matter that on entry via ISRs more
     ;    registers are saved on the original thread's stack than entries
-    ;    via non-ISRs. Those extra registers will be restored properly 
-    ;    by an IRET when the thread is eventually scheduled back in 
+    ;    via non-ISRs. Those extra registers will be restored properly
+    ;    by an IRET when the thread is eventually scheduled back in
     ;    (which could be a long way off). This assumes that the CPU does
     ;    not have hidden behaviour that occurs on interrupts, and we can
     ;    in fact trick it into leaving via another thread's call stack,
@@ -298,7 +298,7 @@ archContextSwitch:
 ; data for being restored by either this function or the normal
 ; function used for scheduling threads in, archContextSwitch(). Only
 ; the first thread run by the system is launched via this function,
-; after which all other new threads will first be run by 
+; after which all other new threads will first be run by
 ; archContextSwitch().
 ;
 ; Typically ports will implement something similar here to the
@@ -306,21 +306,21 @@ archContextSwitch:
 ; switch does not restore many registers, and instead relies on the
 ; fact that returning from any function which called
 ; archContextSwitch() will restore any of the necessary registers.
-; For new threads which have never been run there is no calling 
+; For new threads which have never been run there is no calling
 ; function which will restore context on return, therefore we
 ; do not restore many register values here. It is not necessary
 ; for the new threads to have initialised values for the scratch
-; registers A, X and Y or the code condition register CC which 
+; registers A, X and Y or the code condition register CC which
 ; leaves SP and PC. SP is restored because this is always needed to
-; switch to a new thread's stack context. It is not necessary to 
-; restore PC, because the thread's entry point is in the stack 
-; context (when this function returns using RET the PC is 
+; switch to a new thread's stack context. It is not necessary to
+; restore PC, because the thread's entry point is in the stack
+; context (when this function returns using RET the PC is
 ; automatically changed to the thread's entry point because the
-; entry point is stored in the preinitialised stack). 
+; entry point is stored in the preinitialised stack).
 ;
 ; When new threads are started interrupts must be enabled, so there
 ; is some scope for enabling interrupts in the CC here. It must be
-; done for all new threads, however, not just the first thread, so 
+; done for all new threads, however, not just the first thread, so
 ; we use a different system. We instead use a thread shell routine
 ; which all functions run when they are first started, and
 ; interrupts are enabled in there. This allows us to avoid having
@@ -356,7 +356,7 @@ archFirstThreadRestore:
     ; As described above, first thread restores in this port do not
     ; expect any initial register context to be pre-initialised in
     ; the thread's stack area. The thread's initial stack need only
-    ; contain the thread's initial entry point, and we do not even 
+    ; contain the thread's initial entry point, and we do not even
     ; "restore" that within this function. We leave the thread's entry
     ; point in the stack, and RET at the end of the function pops it
     ; off and "returns" to the entry point as if we were called from
@@ -371,7 +371,7 @@ archFirstThreadRestore:
     ; pointer it conveniently located at the top of the TCB so no
     ; indexing is required to pull it out.
     ldw X,(X)
-    
+
     ; Switch our current stack pointer to that of the new thread.
     ldw SP,X
 
@@ -392,4 +392,4 @@ archFirstThreadRestore:
     ret
 
 
-    end
\ No newline at end of file
+    end
diff --git a/ports/stm8/atomport-private.h b/ports/stm8/atomport-private.h
index 58709ee..5a013ab 100644
--- a/ports/stm8/atomport-private.h
+++ b/ports/stm8/atomport-private.h
@@ -31,12 +31,40 @@
 #define __ATOM_PORT_PRIVATE_H
 
 
-/* Function prototypes */
-void archInitSystemTickTimer (void);
-#ifdef __CSMC__
-@far @interrupt void TIM1_SystemTickISR (void);
-#elif __IAR_SYSTEMS_ICC__
-__interrupt void TIM1_SystemTickISR (void);
+/**
+ * Compiler-specific modifier to prevent some functions from saving
+ * and restoring registers on entry and exit, if the function is
+ * known to never return (e.g. thread entry points).
+ * Reduces stack usage on supporting compilers.
+ */
+#ifdef __IAR_SYSTEMS_ICC__
+#define NO_REG_SAVE __task
+#else
+#define NO_REG_SAVE
 #endif
 
+
+/**
+ * Compiler-specific modifiers for interrupt handler functions.
+ *
+ * COSMIC: Uses @interrupt modifier for interrupt handlers. We
+ * also force all interrupt handlers to save c_lreg, a separate
+ * memory area which Cosmic uses for longs and floats. This memory
+ * area must be saved by interrupt handlers for context
+ * switch purposes, and to avoid making it impossible to use
+ * longs in any OS kernel code accessed by interrupt handlers.
+ *
+ * IAR: Uses __interrupt modifier for interrupt handlers.
+ */
+#ifdef __CSMC__
+#define INTERRUPT @far @interrupt @svlreg
+#else
+#define INTERRUPT __interrupt
+#endif
+
+
+/* Function prototypes */
+void archInitSystemTickTimer (void);
+INTERRUPT void TIM1_SystemTickISR (void);
+
 #endif /* __ATOM_PORT_PRIVATE_H */
diff --git a/ports/stm8/atomport.c b/ports/stm8/atomport.c
index 686d686..8987dfc 100644
--- a/ports/stm8/atomport.c
+++ b/ports/stm8/atomport.c
@@ -34,7 +34,7 @@
 
 
 /** Forward declarations */
-static void thread_shell (void);
+static NO_REG_SAVE void thread_shell (void);
 
 
 /**
@@ -66,9 +66,16 @@ static void thread_shell (void);
  * first time because you can preinitialise the stack context with
  * a suitable register value that will enable interrupts.
  *
+ * If the compiler supports it, stack space can be saved by preventing
+ * the function from saving registers on entry. This is because we
+ * are called directly by the context-switch assembler, and know that
+ * threads cannot return from here. The NO_REG_SAVE macro is used to
+ * denote such functions in a compiler-agnostic way, though not all
+ * compilers support it.
+ *
  * @return None
  */
-static void thread_shell (void)
+static NO_REG_SAVE void thread_shell (void)
 {
     ATOM_TCB *curr_tcb;
 
@@ -207,7 +214,7 @@ void archThreadContextInit (ATOM_TCB *tcb_ptr, void *stack_top, void (*entry_poi
 
     /**
      * All thread context has now been initialised. All that is left
-     * is to save the current stack pointer to the thread's TCB so 
+     * is to save the current stack pointer to the thread's TCB so
      * that it knows where to start looking when the thread is started.
      */
     tcb_ptr->sp_save_ptr = stack_ptr;
@@ -277,9 +284,8 @@ void archInitSystemTickTimer ( void )
  */
 #if defined(__IAR_SYSTEMS_ICC__)
 #pragma vector = 13
-__interrupt 
 #endif
-void TIM1_SystemTickISR (void)
+INTERRUPT void TIM1_SystemTickISR (void)
 {
     /* Call the interrupt entry routine */
     atomIntEnter();
diff --git a/ports/stm8/tests-main.c b/ports/stm8/tests-main.c
index 8e4ff97..dc2e372 100644
--- a/ports/stm8/tests-main.c
+++ b/ports/stm8/tests-main.c
@@ -122,8 +122,15 @@ static void main_thread_func (uint32_t data);
  *
  * Sets up the STM8 hardware resources (system tick timer interrupt) necessary
  * for the OS to be started. Creates an application thread and starts the OS.
+ *
+ * If the compiler supports it, stack space can be saved by preventing
+ * the function from saving registers on entry. This is because we
+ * are called directly by the C startup assembler, and know that we will
+ * never return from here. The NO_REG_SAVE macro is used to denote such 
+ * functions in a compiler-agnostic way, though not all compilers support it.
+ *
  */
-void main ( void )
+NO_REG_SAVE void main ( void )
 {
     int8_t status;
 
@@ -164,7 +171,10 @@ void main ( void )
     }
 
     /* There was an error starting the OS if we reach here */
-    return;
+    while (1)
+    {
+    }
+
 }
 
 
diff --git a/tests/kern2.c b/tests/kern2.c
index 3eaf951..f0b5adf 100644
--- a/tests/kern2.c
+++ b/tests/kern2.c
@@ -63,16 +63,16 @@ uint32_t test_start (void)
     uint8_t eight = 8;
     uint8_t nine = 9;
     uint8_t ten = 10;
-    uint8_t eleven = 11;
-    uint8_t twelve = 12;
-    uint8_t thirteen = 13;
-    uint8_t fourteen = 14;
-    uint8_t fifteen = 15;
-    uint8_t sixteen = 16;
-    uint8_t seventeen = 17;
-    uint8_t eighteen = 18;
-    uint8_t nineteen = 19;
-    uint8_t twenty = 20;
+    uint16_t eleven = 11;
+    uint16_t twelve = 12;
+    uint16_t thirteen = 13;
+    uint16_t fourteen = 14;
+    uint16_t fifteen = 15;
+    uint32_t sixteen = 16;
+    uint32_t seventeen = 17;
+    uint32_t eighteen = 18;
+    uint32_t nineteen = 19;
+    uint32_t twenty = 20;
 
     /* Default to zero failures */
     failures = 0;
@@ -185,4 +185,4 @@ uint32_t test_start (void)
     /* Quit */
     return failures;
 
-}
\ No newline at end of file
+}