diff --git a/.hgignore b/.hgignore
index d79da1a2..2c48dc63 100644
--- a/.hgignore
+++ b/.hgignore
@@ -16,8 +16,10 @@ CMakeFiles
 CMakeCache.txt
 cmake_install.cmake
 .DS_Store
-CMakeLists.txt.user
+CMakeLists.txt.user*
 .directory
+druntime-orig
+phobos-orig
 
 syntax: regexp
 ^obj/
@@ -25,6 +27,7 @@ syntax: regexp
 ^tests/reference/
 ^tango/
 ^druntime/
+^phobos/
 ^import/
 ^bin/ldc2?$
 ^bin/ldc2?\.conf$
diff --git a/druntime.patch b/druntime.patch
index 51b50e58..8935fe38 100644
--- a/druntime.patch
+++ b/druntime.patch
@@ -1,525 +1,7 @@
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/cstdarg.di druntime/import/ldc/cstdarg.di
---- druntime-old/import/ldc/cstdarg.di	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/import/ldc/cstdarg.di	2010-09-30 22:10:37.000000000 +0400
-@@ -0,0 +1,29 @@
-+/*
-+ * vararg support for extern(C) functions
-+ */
-+
-+module ldc.cstdarg;
-+
-+// Check for the right compiler
-+version(LDC)
-+{
-+    // OK
-+}
-+else
-+{
-+    static assert(false, "This module is only valid for LDC");
-+}
-+
-+alias void* va_list;
-+
-+pragma(va_start)
-+    void va_start(T)(va_list ap, ref T);
-+
-+pragma(va_arg)
-+    T va_arg(T)(va_list ap);
-+
-+pragma(va_end)
-+    void va_end(va_list args);
-+
-+pragma(va_copy)
-+    void va_copy(va_list dst, va_list src);
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/intrinsics.di druntime/import/ldc/intrinsics.di
---- druntime-old/import/ldc/intrinsics.di	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/import/ldc/intrinsics.di	2010-10-02 14:01:02.975890001 +0400
-@@ -0,0 +1,413 @@
-+/*
-+ * This module holds declarations to LLVM intrinsics.
-+ *
-+ * See the LLVM language reference for more information:
-+ *
-+ * - http://llvm.org/docs/LangRef.html#intrinsics
-+ *
-+ */
-+
-+module ldc.intrinsics;
-+
-+// Check for the right compiler
-+version(LDC)
-+{
-+    // OK
-+}
-+else
-+{
-+    static assert(false, "This module is only valid for LDC");
-+}
-+
-+//
-+// CODE GENERATOR INTRINSICS
-+//
-+
-+
-+// The 'llvm.returnaddress' intrinsic attempts to compute a target-specific
-+// value indicating the return address of the current function or one of its
-+// callers.
-+
-+pragma(intrinsic, "llvm.returnaddress")
-+    void* llvm_returnaddress(uint level);
-+
-+
-+// The 'llvm.frameaddress' intrinsic attempts to return the target-specific
-+// frame pointer value for the specified stack frame.
-+
-+pragma(intrinsic, "llvm.frameaddress")
-+    void* llvm_frameaddress(uint level);
-+
-+
-+// The 'llvm.stacksave' intrinsic is used to remember the current state of the
-+// function stack, for use with llvm.stackrestore. This is useful for
-+// implementing language features like scoped automatic variable sized arrays
-+// in C99.
-+
-+pragma(intrinsic, "llvm.stacksave")
-+    void* llvm_stacksave();
-+
-+
-+// The 'llvm.stackrestore' intrinsic is used to restore the state of the
-+// function stack to the state it was in when the corresponding llvm.stacksave
-+// intrinsic executed. This is useful for implementing language features like
-+// scoped automatic variable sized arrays in C99.
-+
-+pragma(intrinsic, "llvm.stackrestore")
-+    void llvm_stackrestore(void* ptr);
-+
-+
-+// The 'llvm.prefetch' intrinsic is a hint to the code generator to insert a
-+// prefetch instruction if supported; otherwise, it is a noop. Prefetches have
-+// no effect on the behavior of the program but can change its performance
-+// characteristics.
-+
-+pragma(intrinsic, "llvm.prefetch")
-+    void llvm_prefetch(void* ptr, uint rw, uint locality);
-+
-+
-+// The 'llvm.pcmarker' intrinsic is a method to export a Program Counter (PC)
-+// in a region of code to simulators and other tools. The method is target
-+// specific, but it is expected that the marker will use exported symbols to
-+// transmit the PC of the marker. The marker makes no guarantees that it will
-+// remain with any specific instruction after optimizations. It is possible
-+// that the presence of a marker will inhibit optimizations. The intended use
-+// is to be inserted after optimizations to allow correlations of simulation
-+// runs.
-+
-+pragma(intrinsic, "llvm.pcmarker")
-+    void llvm_pcmarker(uint id);
-+
-+
-+// The 'llvm.readcyclecounter' intrinsic provides access to the cycle counter
-+// register (or similar low latency, high accuracy clocks) on those targets that
-+// support it. On X86, it should map to RDTSC. On Alpha, it should map to RPCC.
-+// As the backing counters overflow quickly (on the order of 9 seconds on
-+// alpha), this should only be used for small timings.
-+
-+pragma(intrinsic, "llvm.readcyclecounter")
-+    ulong readcyclecounter();
-+
-+
-+
-+
-+//
-+// STANDARD C LIBRARY INTRINSICS
-+//
-+
-+
-+// The 'llvm.memcpy.*' intrinsics copy a block of memory from the source
-+// location to the destination location.
-+// Note that, unlike the standard libc function, the llvm.memcpy.* intrinsics do
-+// not return a value, and takes an extra alignment argument.
-+
-+pragma(intrinsic, "llvm.memcpy.i#")
-+    void llvm_memcpy(T)(void* dst, void* src, T len, uint alignment);
-+
-+deprecated {
-+    alias llvm_memcpy!(uint)  llvm_memcpy_i32;
-+    alias llvm_memcpy!(ulong) llvm_memcpy_i64;
-+}
-+
-+
-+// The 'llvm.memmove.*' intrinsics move a block of memory from the source
-+// location to the destination location. It is similar to the 'llvm.memcpy'
-+// intrinsic but allows the two memory locations to overlap.
-+// Note that, unlike the standard libc function, the llvm.memmove.* intrinsics
-+// do not return a value, and takes an extra alignment argument.
-+
-+pragma(intrinsic, "llvm.memmove.i#")
-+    void llvm_memmove(T)(void* dst, void* src, T len, uint alignment);
-+
-+deprecated {
-+    alias llvm_memmove!(uint)  llvm_memmove_i32;
-+    alias llvm_memmove!(ulong) llvm_memmove_i64;
-+}
-+
-+
-+// The 'llvm.memset.*' intrinsics fill a block of memory with a particular byte
-+// value.
-+// Note that, unlike the standard libc function, the llvm.memset intrinsic does
-+// not return a value, and takes an extra alignment argument.
-+
-+pragma(intrinsic, "llvm.memset.i#")
-+    void llvm_memset(T)(void* dst, ubyte val, T len, uint alignment);
-+
-+deprecated {
-+    alias llvm_memset!(uint)  llvm_memset_i32;
-+    alias llvm_memset!(ulong) llvm_memset_i64;
-+}
-+
-+
-+// The 'llvm.sqrt' intrinsics return the sqrt of the specified operand,
-+// returning the same value as the libm 'sqrt' functions would. Unlike sqrt in
-+// libm, however, llvm.sqrt has undefined behavior for negative numbers other
-+// than -0.0 (which allows for better optimization, because there is no need to
-+// worry about errno being set). llvm.sqrt(-0.0) is defined to return -0.0 like
-+// IEEE sqrt.
-+
-+pragma(intrinsic, "llvm.sqrt.f#")
-+    T llvm_sqrt(T)(T val);
-+
-+deprecated {
-+    alias llvm_sqrt!(float)  llvm_sqrt_f32;
-+    alias llvm_sqrt!(double) llvm_sqrt_f64;
-+    alias llvm_sqrt!(real)   llvm_sqrt_f80;     // may not actually be .f80
-+}
-+
-+
-+// The 'llvm.sin.*' intrinsics return the sine of the operand.
-+
-+pragma(intrinsic, "llvm.sin.f#")
-+    T llvm_sin(T)(T val);
-+
-+deprecated {
-+    alias llvm_sin!(float)  llvm_sin_f32;
-+    alias llvm_sin!(double) llvm_sin_f64;
-+    alias llvm_sin!(real)   llvm_sin_f80;       // may not actually be .f80
-+}
-+
-+
-+// The 'llvm.cos.*' intrinsics return the cosine of the operand.
-+
-+pragma(intrinsic, "llvm.cos.f#")
-+    T llvm_cos(T)(T val);
-+
-+deprecated {
-+    alias llvm_cos!(float)  llvm_cos_f32;
-+    alias llvm_cos!(double) llvm_cos_f64;
-+    alias llvm_cos!(real)   llvm_cos_f80;       // may not actually be .f80
-+}
-+
-+
-+// The 'llvm.powi.*' intrinsics return the first operand raised to the specified
-+// (positive or negative) power. The order of evaluation of multiplications is
-+// not defined. When a vector of floating point type is used, the second
-+// argument remains a scalar integer value.
-+
-+pragma(intrinsic, "llvm.powi.f#")
-+    T llvm_powi(T)(T val, int power);
-+
-+deprecated {
-+    alias llvm_powi!(float)  llvm_powi_f32;
-+    alias llvm_powi!(double) llvm_powi_f64;
-+    alias llvm_powi!(real)   llvm_powi_f80;     // may not actually be .f80
-+}
-+
-+
-+// The 'llvm.pow.*' intrinsics return the first operand raised to the specified
-+// (positive or negative) power.
-+
-+pragma(intrinsic, "llvm.pow.f#")
-+    T llvm_pow(T)(T val, T power);
-+
-+deprecated {
-+    alias llvm_pow!(float)  llvm_pow_f32;
-+    alias llvm_pow!(double) llvm_pow_f64;
-+    alias llvm_pow!(real)   llvm_pow_f80;       // may not actually be .f80
-+}
-+
-+
-+//
-+// BIT MANIPULATION INTRINSICS
-+//
-+
-+// The 'llvm.bswap' family of intrinsics is used to byte swap integer values
-+// with an even number of bytes (positive multiple of 16 bits). These are
-+// useful for performing operations on data that is not in the target's native
-+// byte order.
-+
-+pragma(intrinsic, "llvm.bswap.i#.i#")
-+    T llvm_bswap(T)(T val);
-+
-+deprecated {
-+    alias llvm_bswap!(ushort) llvm_bswap_i16;
-+    alias llvm_bswap!(uint)   llvm_bswap_i32;
-+    alias llvm_bswap!(ulong)  llvm_bswap_i64;
-+}
-+
-+
-+// The 'llvm.ctpop' family of intrinsics counts the number of bits set in a
-+// value.
-+
-+pragma(intrinsic, "llvm.ctpop.i#")
-+    T llvm_ctpop(T)(T src);
-+
-+deprecated {
-+    alias llvm_ctpop!(ubyte)  llvm_ctpop_i8;
-+    alias llvm_ctpop!(ushort) llvm_ctpop_i16;
-+    alias llvm_ctpop!(uint)   llvm_ctpop_i32;
-+    alias llvm_ctpop!(ulong)  llvm_ctpop_i64;
-+}
-+
-+
-+// The 'llvm.ctlz' family of intrinsic functions counts the number of leading
-+// zeros in a variable.
-+
-+pragma(intrinsic, "llvm.ctlz.i#")
-+    T llvm_ctlz(T)(T src);
-+
-+deprecated {
-+    alias llvm_ctlz!(ubyte)  llvm_ctlz_i8;
-+    alias llvm_ctlz!(ushort) llvm_ctlz_i16;
-+    alias llvm_ctlz!(uint)   llvm_ctlz_i32;
-+    alias llvm_ctlz!(ulong)  llvm_ctlz_i64;
-+}
-+
-+
-+// The 'llvm.cttz' family of intrinsic functions counts the number of trailing
-+// zeros.
-+
-+pragma(intrinsic, "llvm.cttz.i#")
-+    T llvm_cttz(T)(T src);
-+
-+deprecated {
-+    alias llvm_cttz!(ubyte)  llvm_cttz_i8;
-+    alias llvm_cttz!(ushort) llvm_cttz_i16;
-+    alias llvm_cttz!(uint)   llvm_cttz_i32;
-+    alias llvm_cttz!(ulong)  llvm_cttz_i64;
-+}
-+
-+
-+// The 'llvm.part.select' family of intrinsic functions selects a range of bits
-+// from an integer value and returns them in the same bit width as the original
-+// value.
-+
-+pragma(intrinsic, "llvm.part.select.i#")
-+    T llvm_part_select(T)(T val, uint loBit, uint hiBit);
-+
-+deprecated {
-+    alias llvm_part_select!(ubyte)  llvm_part_select_i;
-+    alias llvm_part_select!(ushort) llvm_part_select_i;
-+    alias llvm_part_select!(uint)   llvm_part_select_i;
-+    alias llvm_part_select!(ulong)  llvm_part_select_i;
-+}
-+
-+
-+// The 'llvm.part.set' family of intrinsic functions replaces a range of bits
-+// in an integer value with another integer value. It returns the integer with
-+// the replaced bits.
-+
-+// TODO
-+// declare i17 @llvm.part.set.i17.i9 (i17 %val, i9 %repl, i32 %lo, i32 %hi)
-+// declare i29 @llvm.part.set.i29.i9 (i29 %val, i9 %repl, i32 %lo, i32 %hi)
-+
-+
-+
-+
-+//
-+// ATOMIC OPERATIONS AND SYNCHRONIZATION INTRINSICS
-+//
-+
-+// The llvm.memory.barrier intrinsic guarantees ordering between specific
-+// pairs of memory access types.
-+
-+pragma(intrinsic, "llvm.memory.barrier")
-+    void llvm_memory_barrier(bool ll, bool ls, bool sl, bool ss, bool device);
-+
-+// This loads a value in memory and compares it to a given value. If they are
-+// equal, it stores a new value into the memory.
-+
-+pragma(intrinsic, "llvm.atomic.cmp.swap.i#.p0i#")
-+    T llvm_atomic_cmp_swap(T)(shared T* ptr, T cmp, T val);
-+
-+// This intrinsic loads the value stored in memory at ptr and yields the value
-+// from memory. It then stores the value in val in the memory at ptr.
-+
-+pragma(intrinsic, "llvm.atomic.swap.i#.p0i#")
-+    T llvm_atomic_swap(T)(T* ptr, T val);
-+
-+// This intrinsic adds delta to the value stored in memory at ptr. It yields
-+// the original value at ptr.
-+
-+pragma(intrinsic, "llvm.atomic.load.add.i#.p0i#")
-+    T llvm_atomic_load_add(T)(shared const T* ptr, T val);
-+
-+// This intrinsic subtracts delta to the value stored in memory at ptr. It
-+// yields the original value at ptr.
-+
-+pragma(intrinsic, "llvm.atomic.load.sub.i#.p0i#")
-+    T llvm_atomic_load_sub(T)(T* ptr, T val);
-+
-+// These intrinsics bitwise the operation (and, nand, or, xor) delta to the
-+// value stored in memory at ptr. It yields the original value at ptr.
-+
-+pragma(intrinsic, "llvm.atomic.load.and.i#.p0i#")
-+    T llvm_atomic_load_and(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.nand.i#.p0i#")
-+    T llvm_atomic_load_nand(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.or.i#.p0i#")
-+    T llvm_atomic_load_or(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.xor.i#.p0i#")
-+    T llvm_atomic_load_xor(T)(T* ptr, T val);
-+
-+// These intrinsics takes the signed or unsigned minimum or maximum of delta
-+// and the value stored in memory at ptr. It yields the original value at ptr.
-+
-+pragma(intrinsic, "llvm.atomic.load.max.i#.p0i#")
-+    T llvm_atomic_load_max(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.min.i#.p0i#")
-+    T llvm_atomic_load_min(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.umax.i#.p0i#")
-+    T llvm_atomic_load_umax(T)(T* ptr, T val);
-+
-+pragma(intrinsic, "llvm.atomic.load.umin.i#.p0i#")
-+    T llvm_atomic_load_umin(T)(T* ptr, T val);
-+
-+
-+//
-+// ARITHMETIC-WITH-OVERFLOW INTRINSICS
-+//
-+
-+struct OverflowRet(T) {
-+    static assert(is(T : int), T.stringof ~ " is not an integer type!");
-+    T result;
-+    bool overflow;
-+}
-+
-+// Signed and unsigned addition
-+pragma(intrinsic, "llvm.sadd.with.overflow.i#")
-+    OverflowRet!(T) llvm_sadd_with_overflow(T)(T lhs, T rhs);
-+
-+pragma(intrinsic, "llvm.uadd.with.overflow.i#")
-+    OverflowRet!(T) llvm_uadd_with_overflow(T)(T lhs, T rhs);
-+
-+
-+// Signed and unsigned subtraction
-+pragma(intrinsic, "llvm.ssub.with.overflow.i#")
-+    OverflowRet!(T) llvm_ssub_with_overflow(T)(T lhs, T rhs);
-+
-+pragma(intrinsic, "llvm.usub.with.overflow.i#")
-+    OverflowRet!(T) llvm_usub_with_overflow(T)(T lhs, T rhs);
-+
-+
-+// Signed and unsigned multiplication
-+pragma(intrinsic, "llvm.smul.with.overflow.i#")
-+    OverflowRet!(T) llvm_smul_with_overflow(T)(T lhs, T rhs);
-+
-+/* Note: LLVM documentations says:
-+ *  Warning: 'llvm.umul.with.overflow' is badly broken.
-+ *  It is actively being fixed, but it should not currently be used!
-+ *
-+ * See: http://llvm.org/docs/LangRef.html#int_umul_overflow
-+ */
-+//pragma(intrinsic, "llvm.umul.with.overflow.i#")
-+//    OverflowRet!(T) llvm_umul_with_overflow(T)(T lhs, T rhs);
-+
-+
-+//
-+// GENERAL INTRINSICS
-+//
-+
-+
-+// This intrinsics is lowered to the target dependent trap instruction. If the
-+// target does not have a trap instruction, this intrinsic will be lowered to
-+// the call of the abort() function.
-+
-+pragma(intrinsic, "llvm.trap")
-+    void llvm_trap();
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/llvmasm.di druntime/import/ldc/llvmasm.di
---- druntime-old/import/ldc/llvmasm.di	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/import/ldc/llvmasm.di	2010-09-30 22:10:37.000000000 +0400
-@@ -0,0 +1,17 @@
-+module ldc.llvmasm;
-+
-+struct __asmtuple_t(T...)
-+{
-+    T v;
-+}
-+
-+pragma(llvm_inline_asm)
-+{
-+    void __asm( )(char[] asmcode, char[] constraints, ...);
-+    T    __asm(T)(char[] asmcode, char[] constraints, ...);
-+
-+    template __asmtuple(T...)
-+    {
-+        __asmtuple_t!(T) __asmtuple(char[] asmcode, char[] constraints, ...);
-+    }
-+}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/vararg.d druntime/import/ldc/vararg.d
---- druntime-old/import/ldc/vararg.d	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/import/ldc/vararg.d	2010-09-30 22:10:37.000000000 +0400
-@@ -0,0 +1,43 @@
-+/*
-+ * This module holds the implementation of special vararg templates for D style var args.
-+ *
-+ * Provides the functions tango.core.Vararg expects to be present!
-+ */
-+
-+module ldc.Vararg;
-+
-+// Check for the right compiler
-+version(LDC)
-+{
-+    // OK
-+}
-+else
-+{
-+    static assert(false, "This module is only valid for LDC");
-+}
-+
-+alias void* va_list;
-+
-+void va_start(T) ( out va_list ap, inout T parmn )
-+{
-+    // not needed !
-+}
-+
-+T va_arg(T)(ref va_list vp)
-+{
-+    T* arg = cast(T*) vp;
-+    // ldc always aligns to size_t.sizeof in vararg lists
-+    vp = cast(va_list) ( cast(void*) vp + ( ( T.sizeof + size_t.sizeof - 1 ) & ~( size_t.sizeof - 1 ) ) );
-+    return *arg;
-+}
-+
-+void va_end( va_list ap )
-+{
-+    // not needed !
-+}
-+
-+void va_copy( out va_list dst, va_list src )
-+{
-+    // seems pretty useless !
-+    dst = src;
-+}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/object.di druntime/import/object.di
---- druntime-old/import/object.di	2010-09-03 12:28:52.000000000 +0400
-+++ druntime/import/object.di	2010-10-05 12:47:24.873150000 +0400
-@@ -130,7 +130,7 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/import/object.di druntime/import/object.di
+--- druntime-orig/import/object.di	2010-09-03 12:28:52.000000000 +0400
++++ druntime/import/object.di	2010-10-27 00:22:27.444925001 +0400
+@@ -130,7 +130,7 @@
      Interface[] interfaces;
      TypeInfo_Class   base;
      void*       destructor;
@@ -528,7 +10,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
      uint        m_flags;
      //  1:      // is IUnknown or is derived from IUnknown
      //  2:      // has no possible pointers into GC memory
-@@ -140,7 +140,7 @@
+@@ -140,7 +140,7 @@
      // 32:      // has typeinfo member
      void*       deallocator;
      OffsetTypeInfo[] m_offTi;
@@ -537,7 +19,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
      const(MemberInfo[]) function(string) xgetMembers;
  
      static TypeInfo_Class find(in char[] classname);
-@@ -179,7 +179,7 @@
+@@ -179,7 +179,7 @@
  
  class TypeInfo_Const : TypeInfo
  {
@@ -546,7 +28,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  class TypeInfo_Invariant : TypeInfo_Const
-@@ -288,7 +288,6 @@
+@@ -288,7 +288,6 @@
      interface TraceInfo
      {
          int opApply(scope int delegate(ref char[]));
@@ -554,190 +36,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
      }
  
      string      msg;
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/std/intrinsic.di druntime/import/std/intrinsic.di
---- druntime-old/import/std/intrinsic.di	2010-08-05 05:39:08.000000000 +0400
-+++ druntime/import/std/intrinsic.di	1970-01-01 03:00:00.000000000 +0300
-@@ -1,176 +0,0 @@
--/**
-- * These functions are built-in intrinsics to the compiler.
-- *
-- * Intrinsic functions are functions built in to the compiler, usually to take
-- * advantage of specific CPU features that are inefficient to handle via
-- * external functions.  The compiler's optimizer and code generator are fully
-- * integrated in with intrinsic functions, bringing to bear their full power on
-- * them. This can result in some surprising speedups.
-- *
-- * Copyright: Public Domain
-- * License:   Public Domain
-- * Authors:   Walter Bright
-- */
--module std.intrinsic;
--
--
--/**
-- * Scans the bits in v starting with bit 0, looking
-- * for the first set bit.
-- * Returns:
-- *      The bit number of the first bit set.
-- *      The return value is undefined if v is zero.
-- */
--pure nothrow int bsf( uint v );
--
--
--/**
-- * Scans the bits in v from the most significant bit
-- * to the least significant bit, looking
-- * for the first set bit.
-- * Returns:
-- *      The bit number of the first bit set.
-- *      The return value is undefined if v is zero.
-- * Example:
-- * ---
-- * import std.intrinsic;
-- *
-- * int main()
-- * {
-- *     uint v;
-- *     int x;
-- *
-- *     v = 0x21;
-- *     x = bsf(v);
-- *     printf("bsf(x%x) = %d\n", v, x);
-- *     x = bsr(v);
-- *     printf("bsr(x%x) = %d\n", v, x);
-- *     return 0;
-- * }
-- * ---
-- * Output:
-- *  bsf(x21) = 0<br>
-- *  bsr(x21) = 5
-- */
--pure nothrow int bsr( uint v );
--
--
--/**
-- * Tests the bit.
-- */
--pure nothrow int bt( in uint* p, uint bitnum );
--
--
--/**
-- * Tests and complements the bit.
-- */
--nothrow int btc( uint* p, uint bitnum );
--
--
--/**
-- * Tests and resets (sets to 0) the bit.
-- */
--nothrow int btr( uint* p, uint bitnum );
--
--
--/**
-- * Tests and sets the bit.
-- * Params:
-- * p = a non-NULL pointer to an array of uints.
-- * index = a bit number, starting with bit 0 of p[0],
-- * and progressing. It addresses bits like the expression:
-----
--p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1)))
-----
-- * Returns:
-- *      A non-zero value if the bit was set, and a zero
-- *      if it was clear.
-- *
-- * Example:
-- * ---
--import std.intrinsic;
--
--int main()
--{
--    uint array[2];
--
--    array[0] = 2;
--    array[1] = 0x100;
--
--    printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
--    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
--
--    printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
--    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
--
--    printf("bts(array, 35) = %d\n", <b>bts</b>(array, 35));
--    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
--
--    printf("btr(array, 35) = %d\n", <b>btr</b>(array, 35));
--    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
--
--    printf("bt(array, 1) = %d\n", <b>bt</b>(array, 1));
--    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
--
--    return 0;
--}
-- * ---
-- * Output:
--<pre>
--btc(array, 35) = 0
--array = [0]:x2, [1]:x108
--btc(array, 35) = -1
--array = [0]:x2, [1]:x100
--bts(array, 35) = 0
--array = [0]:x2, [1]:x108
--btr(array, 35) = -1
--array = [0]:x2, [1]:x100
--bt(array, 1) = -1
--array = [0]:x2, [1]:x100
--</pre>
-- */
--nothrow int bts( uint* p, uint bitnum );
--
--
--/**
-- * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes
-- * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3
-- * becomes byte 0.
-- */
--pure nothrow uint bswap( uint v );
--
--
--/**
-- * Reads I/O port at port_address.
-- */
--nothrow ubyte inp( uint port_address );
--
--
--/**
-- * ditto
-- */
--nothrow ushort inpw( uint port_address );
--
--
--/**
-- * ditto
-- */
--nothrow uint inpl( uint port_address );
--
--
--/**
-- * Writes and returns value to I/O port at port_address.
-- */
--nothrow ubyte outp( uint port_address, ubyte value );
--
--
--/**
-- * ditto
-- */
--nothrow ushort outpw( uint port_address, ushort value );
--
--
--/**
-- * ditto
-- */
--nothrow uint outpl( uint port_address, uint value );
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/core/atomic.d druntime/src/core/atomic.d
---- druntime-old/src/core/atomic.d	2010-09-03 12:28:52.000000000 +0400
-+++ druntime/src/core/atomic.d	2010-10-05 15:55:10.893150001 +0400
-@@ -89,6 +89,117 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/atomic.d druntime/src/core/atomic.d
+--- druntime-orig/src/core/atomic.d	2010-09-03 12:28:52.000000000 +0400
++++ druntime/src/core/atomic.d	2010-10-05 15:55:10.893150001 +0400
+@@ -89,6 +89,117 @@
           return false;
       }
  }
@@ -855,7 +157,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  else version( AsmX86_32 )
  {
      T atomicOp(string op, T, V1)( ref shared T val, V1 mod )
-@@ -396,6 +507,12 @@
+@@ -396,6 +507,12 @@
          }
      }
  }
@@ -868,22 +170,38 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  else version( AsmX86_64 )
  {
      T atomicOp(string op, T, V1)( ref shared T val, V1 mod )
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gc.d druntime/src/gc/gc.d
---- druntime-old/src/gc/gc.d	2010-08-05 05:39:08.000000000 +0400
-+++ druntime/src/gc/gc.d	2010-10-04 16:54:06.837685001 +0400
-@@ -100,7 +100,7 @@
-     version (GCCLASS)
-     {   void* p;
-         ClassInfo ci = GC.classinfo;
--
-+        
-         p = malloc(ci.init.length);
-         (cast(byte*)p)[0 .. ci.init.length] = ci.init[];
-         _gc = cast(GC)p;
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcbits.d druntime/src/gc/gcbits.d
---- druntime-old/src/gc/gcbits.d	2010-08-08 04:10:24.000000000 +0400
-+++ druntime/src/gc/gcbits.d	2010-10-01 20:49:51.268892001 +0400
-@@ -26,6 +26,10 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/stdc/math.d druntime/src/core/stdc/math.d
+--- druntime-orig/src/core/stdc/math.d	2010-09-03 12:28:52.000000000 +0400
++++ druntime/src/core/stdc/math.d	2010-10-26 16:47:04.036925000 +0400
+@@ -17,6 +17,7 @@
+ 
+ extern (C):
+ nothrow:
++pure: // LDC
+ 
+ alias float  float_t;
+ alias double double_t;
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/stdc/stdlib.d druntime/src/core/stdc/stdlib.d
+--- druntime-orig/src/core/stdc/stdlib.d	2010-08-05 05:39:08.000000000 +0400
++++ druntime/src/core/stdc/stdlib.d	2010-10-26 19:26:03.996925001 +0400
+@@ -92,3 +92,13 @@
+ {
+     void* alloca(size_t size); // non-standard
+ }
++else version( LDC )
++{
++    pragma(alloca)
++        void* alloca(size_t size);
++}
++else version( GNU )
++{
++    private import gcc.builtins;
++    alias gcc.builtins.__builtin_alloca alloca;
++}
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/gc/gcbits.d druntime/src/gc/gcbits.d
+--- druntime-orig/src/gc/gcbits.d	2010-08-08 04:10:24.000000000 +0400
++++ druntime/src/gc/gcbits.d	2010-10-01 20:49:51.268892001 +0400
+@@ -26,6 +26,10 @@
  {
      version = bitops;
  }
@@ -894,10 +212,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  else version (GNU)
  {
      // use the unoptimized version
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcx.d druntime/src/gc/gcx.d
---- druntime-old/src/gc/gcx.d	2010-08-27 01:23:26.000000000 +0400
-+++ druntime/src/gc/gcx.d	2010-10-07 22:27:41.879253001 +0400
-@@ -1464,7 +1464,8 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/gc/gcx.d druntime/src/gc/gcx.d
+--- druntime-orig/src/gc/gcx.d	2010-08-27 01:23:26.000000000 +0400
++++ druntime/src/gc/gcx.d	2010-10-07 22:27:41.879253001 +0400
+@@ -1464,7 +1464,8 @@
  
  
      void initialize()
@@ -907,7 +225,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  
          (cast(byte*)&this)[0 .. Gcx.sizeof] = 0;
          stackBottom = cast(char*)&dummy;
-@@ -2200,7 +2201,7 @@
+@@ -2200,7 +2201,7 @@
                  if ((cast(size_t)p & ~(PAGESIZE-1)) == pcache)
                      continue;
  
@@ -916,7 +234,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
                  if (pool)
                  {
                      size_t offset = cast(size_t)(p - pool.baseAddr);
-@@ -2270,80 +2271,129 @@
+@@ -2270,80 +2271,129 @@
              __builtin_unwind_init();
              sp = & sp;
          }
@@ -1114,7 +432,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
          return result;
      }
  
-@@ -2357,7 +2407,7 @@
+@@ -2357,7 +2407,7 @@
          Pool*  pool;
  
          debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n");
@@ -1123,10 +441,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  
          thread_suspendAll();
  
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/object_.d druntime/src/object_.d
---- druntime-old/src/object_.d	2010-09-03 12:28:52.000000000 +0400
-+++ druntime/src/object_.d	2010-10-05 14:50:34.733150002 +0400
-@@ -1073,7 +1073,7 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/object_.d druntime/src/object_.d
+--- druntime-orig/src/object_.d	2010-10-26 18:47:41.840925001 +0400
++++ druntime/src/object_.d	2010-10-26 19:27:09.224925000 +0400
+@@ -1073,7 +1073,7 @@
  
  abstract class MemberInfo
  {
@@ -1135,7 +453,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  class MemberInfo_field : MemberInfo
-@@ -1663,7 +1663,6 @@
+@@ -1663,7 +1663,6 @@
      {
          int len = 0;
          ModuleReference *mr;
@@ -1143,19 +461,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
          for (mr = _Dmodule_ref; mr; mr = mr.next)
              len++;
          _moduleinfo_array = new ModuleInfo*[len];
-@@ -1802,7 +1801,10 @@
- {
-     debug(PRINTF) printf("_moduleTlsCtor()\n");
- 
--    void* p = alloca(_moduleinfo_array.length * ubyte.sizeof);
-+    version( DMD )
-+        void* p = alloca(_moduleinfo_array.length * ubyte.sizeof);
-+    else
-+        void* p = malloc(_moduleinfo_array.length * ubyte.sizeof);
-     auto flags = cast(ubyte[])p[0 .. _moduleinfo_array.length];
-     flags[] = 0;
- 
-@@ -2025,7 +2027,6 @@
+@@ -2025,7 +2024,6 @@
          _d_monitor_create(h);
          m = getMonitor(h);
      }
@@ -1163,7 +469,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
      IMonitor i = m.impl;
  
      if (i is null)
-@@ -2124,7 +2125,7 @@
+@@ -2124,7 +2122,7 @@
      size_t _aaLen(void* p);
      void* _aaGet(void** pp, TypeInfo keyti, size_t valuesize, ...);
      void* _aaGetRvalue(void* p, TypeInfo keyti, size_t valuesize, ...);
@@ -1172,25 +478,92 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
      void _aaDel(void* p, TypeInfo keyti, ...);
      void[] _aaValues(void* p, size_t keysize, size_t valuesize);
      void[] _aaKeys(void* p, size_t keysize, size_t valuesize);
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/adi.d druntime/src/rt/adi.d
---- druntime-old/src/rt/adi.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/adi.d	2010-10-07 14:32:52.911253001 +0400
-@@ -35,6 +35,14 @@
+@@ -2169,7 +2167,7 @@
+         return *cast(Key[]*) &a;
+     }
+ 
+-    int opApply(scope int delegate(ref Key, ref Value) dg)
++    int opApply(scope int delegate(ref Key, ref const Value) dg)
+     {
+         return _aaApply2(p, aligntsize(Key.sizeof), cast(_dg2_t)dg);
+     }
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/aaA.d druntime/src/rt/aaA.d
+--- druntime-orig/src/rt/aaA.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/aaA.d	2010-10-29 10:48:36.165035001 +0400
+@@ -204,7 +204,7 @@
+  * Add entry for key if it is not already there.
+  */
+ 
+-void* _aaGet(AA* aa, TypeInfo keyti, size_t valuesize, ...)
++void* _aaGet(AA* aa, TypeInfo keyti, size_t valuesize, void *pkey)
+ in
+ {
+     assert(aa);
+@@ -218,7 +218,6 @@
+ }
+ body
+ {
+-    auto pkey = cast(void *)(&valuesize + 1);
+     size_t i;
+     aaA *e;
+     //printf("keyti = %p\n", keyti);
+@@ -274,13 +273,12 @@
+  * Returns null if it is not already there.
+  */
+ 
+-void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, ...)
++void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, void *pkey)
+ {
+     //printf("_aaGetRvalue(valuesize = %u)\n", valuesize);
+     if (!aa.a)
+         return null;
+ 
+-    auto pkey = cast(void *)(&valuesize + 1);
+     auto keysize = aligntsize(keyti.tsize());
+     auto len = aa.a.b.length;
+ 
+@@ -312,7 +310,7 @@
+  *      !=null  in aa, return pointer to value
+  */
+ 
+-void* _aaIn(AA aa, TypeInfo keyti, ...)
++void* _aaIn(AA aa, TypeInfo keyti, void *pkey)
+ in
+ {
+ }
+@@ -324,8 +322,6 @@
+ {
+     if (aa.a)
+     {
+-        auto pkey = cast(void *)(&keyti + 1);
+-
+         //printf("_aaIn(), .length = %d, .ptr = %x\n", aa.a.length, cast(uint)aa.a.ptr);
+         auto len = aa.a.b.length;
+ 
+@@ -357,9 +353,8 @@
+  * If key is not in aa[], do nothing.
+  */
+ 
+-void _aaDel(AA aa, TypeInfo keyti, ...)
++void _aaDel(AA aa, TypeInfo keyti, void *pkey)
+ {
+-    auto pkey = cast(void *)(&keyti + 1);
+     aaA *e;
+ 
+     if (aa.a && aa.a.b.length)
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/adi.d druntime/src/rt/adi.d
+--- druntime-orig/src/rt/adi.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/adi.d	2010-10-29 11:49:52.065035002 +0400
+@@ -35,6 +35,8 @@
      extern (C) void  gc_free( void* p );
  }
  
-+version (DMD)
-+{  
-+    version (X86)
-+    {
-+        version = DMD_X86;
-+    }
-+}
-+
++version (DMD) version (X86)
++    version = DMD_X86;
  
  struct Array
  {
-@@ -48,7 +56,7 @@
+@@ -48,7 +50,7 @@
   * reversed.
   */
  
@@ -1199,7 +572,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      if (a.length > 1)
      {
-@@ -108,7 +116,7 @@
+@@ -108,7 +110,7 @@
              hi = hi - 1 + (stridehi - stridelo);
          }
      }
@@ -1208,7 +581,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  unittest
-@@ -143,7 +151,7 @@
+@@ -143,7 +145,7 @@
   * reversed.
   */
  
@@ -1217,7 +590,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      if (a.length > 1)
      {
-@@ -201,7 +209,7 @@
+@@ -201,7 +203,7 @@
              hi = hi - 1 + (stridehi - stridelo);
          }
      }
@@ -1226,7 +599,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  unittest
-@@ -225,10 +233,10 @@
+@@ -225,10 +227,10 @@
   * Support for array.reverse property.
   */
  
@@ -1239,21 +612,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  body
  {
-@@ -243,10 +251,10 @@
-         tmp = buffer.ptr;
-         if (szelem > 16)
-         {
--            //version (Windows)
-+            version (Windows)
-                 tmp = cast(byte*) alloca(szelem);
--            //else
--                //tmp = gc_malloc(szelem);
-+            else
-+                tmp = cast(byte*) gc_malloc(szelem);
-         }
- 
-         for (; lo < hi; lo += szelem, hi -= szelem)
-@@ -267,7 +275,7 @@
+@@ -267,7 +269,7 @@
                  //gc_free(tmp);
          }
      }
@@ -1262,7 +621,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  unittest
-@@ -311,7 +319,7 @@
+@@ -311,7 +313,7 @@
   * Sort array of chars.
   */
  
@@ -1271,7 +630,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      if (a.length > 1)
      {
-@@ -326,14 +334,14 @@
+@@ -326,14 +328,14 @@
          }
          delete da;
      }
@@ -1288,7 +647,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      if (a.length > 1)
      {
-@@ -348,7 +356,7 @@
+@@ -348,7 +350,7 @@
          }
          delete da;
      }
@@ -1297,7 +656,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  /***************************************
-@@ -358,7 +366,7 @@
+@@ -358,7 +360,7 @@
   *      0       not equal
   */
  
@@ -1306,7 +665,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
      if (a1.length != a2.length)
-@@ -379,7 +387,7 @@
+@@ -379,7 +381,7 @@
      return 1; // equal
  }
  
@@ -1315,7 +674,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
      if (a1.length != a2.length)
-@@ -405,7 +413,7 @@
+@@ -405,7 +407,7 @@
   * Support for array compare test.
   */
  
@@ -1324,7 +683,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      debug(adi) printf("adCmp()\n");
      auto len = a1.length;
-@@ -435,7 +443,7 @@
+@@ -435,7 +437,7 @@
      return (a1.length > a2.length) ? 1 : -1;
  }
  
@@ -1333,7 +692,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
      return ti.compare(&a1, &a2);
-@@ -461,9 +469,9 @@
+@@ -461,9 +463,9 @@
   * Support for array compare test.
   */
  
@@ -1345,7 +704,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
    {
      asm
      {   naked                   ;
-@@ -569,8 +577,8 @@
+@@ -569,8 +571,8 @@
  
          ret                     ;
      }
@@ -1356,12077 +715,35 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
    {
      int len;
      int c;
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayInit.d druntime/src/rt/arrayInit.d
---- druntime-old/src/rt/arrayInit.d	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/src/rt/arrayInit.d	2010-10-03 20:41:52.223624001 +0400
-@@ -0,0 +1,155 @@
-+private import ldc.intrinsics;
-+
-+extern(C):
-+
-+int memcmp(void*,void*,size_t);
-+size_t strlen(char*);
-+
-+version(LLVM64)
-+alias llvm_memcpy_i64 llvm_memcpy;
-+else
-+alias llvm_memcpy_i32 llvm_memcpy;
-+
-+// per-element array init routines
-+
-+void _d_array_init_i16(ushort* a, size_t n, ushort v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_i32(uint* a, size_t n, uint v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_i64(ulong* a, size_t n, ulong v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_float(float* a, size_t n, float v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_double(double* a, size_t n, double v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_real(real* a, size_t n, real v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_cfloat(cfloat* a, size_t n, cfloat v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_cdouble(cdouble* a, size_t n, cdouble v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_creal(creal* a, size_t n, creal v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_pointer(void** a, size_t n, void* v)
-+{
-+    auto p = a;
-+    auto end = a+n;
-+    while (p !is end)
-+        *p++ = v;
-+}
-+
-+void _d_array_init_mem(void* a, size_t na, void* v, size_t nv)
-+{
-+    auto p = a;
-+    auto end = a + na*nv;
-+    while (p !is end) {
-+        llvm_memcpy(p,v,nv,0);
-+        p += nv;
-+    }
-+}
-+
-+/*
-+void _d_array_init(TypeInfo ti, void* a)
-+{
-+    auto initializer = ti.next.init();
-+    auto isize = initializer.length;
-+    auto q = initializer.ptr;
-+
-+    if (isize == 1)
-+        memset(p, *cast(ubyte*)q, size);
-+    else if (isize == int.sizeof)
-+    {
-+        int init = *cast(int*)q;
-+        size /= int.sizeof;
-+        for (size_t u = 0; u < size; u++)
-+        {
-+            (cast(int*)p)[u] = init;
-+        }
-+    }
-+    else
-+    {
-+        for (size_t u = 0; u < size; u += isize)
-+        {
-+            memcpy(p + u, q, isize);
-+        }
-+    }
-+}*/
-+
-+// for array cast
-+size_t _d_array_cast_len(size_t len, size_t elemsz, size_t newelemsz)
-+{
-+    if (newelemsz == 1) {
-+        return len*elemsz;
-+    }
-+    else if ((len*elemsz) % newelemsz) {
-+        throw new Exception("Bad array cast");
-+    }
-+    return (len*elemsz)/newelemsz;
-+}
-+
-+// slice copy when assertions are enabled
-+void _d_array_slice_copy(void* dst, size_t dstlen, void* src, size_t srclen)
-+{
-+    assert(dst);
-+    assert(src);
-+    if (dstlen != srclen)
-+        throw new Exception("lengths don't match for array copy");
-+    else if (dst+dstlen <= src || src+srclen <= dst)
-+        llvm_memcpy(dst, src, dstlen, 0);
-+    else
-+        throw new Exception("overlapping array copy");
-+}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayassign.d druntime/src/rt/arrayassign.d
---- druntime-old/src/rt/arrayassign.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arrayassign.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,186 +0,0 @@
--/**
-- * Implementation of array assignment support routines.
-- *
-- * Copyright: Copyright Digital Mars 2000 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright
-- *
-- *          Copyright Digital Mars 2000 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arrayassign;
--
--private
--{
--    import rt.util.string;
--    import core.stdc.string;
--    import core.stdc.stdlib;
--    debug(PRINTF) import core.stdc.stdio;
--}
--
--/**
-- * Does array assignment (not construction) from another
-- * array of the same element type.
-- * ti is the element type.
-- * Handles overlapping copies.
-- */
--extern (C) void[] _d_arrayassign(TypeInfo ti, void[] from, void[] to)
--{
--    debug(PRINTF) printf("_d_arrayassign(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize());
--
--    if (to.length != from.length)
--    {
--        char[10] tmp = void;
--        string msg = "lengths don't match for array copy,"c;
--        msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length);
--        throw new Exception(msg);
--    }
--
--    auto element_size = ti.tsize();
--
--    /* Need a temporary buffer tmp[] big enough to hold one element
--     */
--    void[16] buf = void;
--    void[] tmp;
--    if (element_size > buf.sizeof)
--        tmp = alloca(element_size)[0 .. element_size];
--    else
--        tmp = buf;
--
--
--    if (to.ptr <= from.ptr)
--    {
--        foreach (i; 0 .. to.length)
--        {
--            void* pto   = to.ptr   + i * element_size;
--            void* pfrom = from.ptr + i * element_size;
--            memcpy(tmp.ptr, pto, element_size);
--            memcpy(pto, pfrom, element_size);
--            ti.postblit(pto);
--            ti.destroy(tmp.ptr);
--        }
--    }
--    else
--    {
--        for (int i = to.length; i--; )
--        {
--            void* pto   = to.ptr   + i * element_size;
--            void* pfrom = from.ptr + i * element_size;
--            memcpy(tmp.ptr, pto, element_size);
--            memcpy(pto, pfrom, element_size);
--            ti.postblit(pto);
--            ti.destroy(tmp.ptr);
--        }
--    }
--    return to;
--}
--
--/**
-- * Does array initialization (not assignment) from another
-- * array of the same element type.
-- * ti is the element type.
-- */
--extern (C) void[] _d_arrayctor(TypeInfo ti, void[] from, void[] to)
--{
--    debug(PRINTF) printf("_d_arrayctor(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize());
--
--    if (to.length != from.length)
--    {
--        char[10] tmp = void;
--        string msg = "lengths don't match for array initialization,"c;
--        msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length);
--        throw new Exception(msg);
--    }
--
--    auto element_size = ti.tsize();
--
--    int i;
--    try
--    {
--        for (i = 0; i < to.length; i++)
--        {
--            // Copy construction is defined as bit copy followed by postblit.
--            memcpy(to.ptr + i * element_size, from.ptr + i * element_size, element_size);
--            ti.postblit(to.ptr + i * element_size);
--        }
--    }
--    catch (Object o)
--    {
--        /* Destroy, in reverse order, what we've constructed so far
--         */
--        while (i--)
--        {
--            ti.destroy(to.ptr + i * element_size);
--        }
--
--        throw o;
--    }
--    return to;
--}
--
--
--/**
-- * Do assignment to an array.
-- *      p[0 .. count] = value;
-- */
--extern (C) void* _d_arraysetassign(void* p, void* value, int count, TypeInfo ti)
--{
--    void* pstart = p;
--
--    auto element_size = ti.tsize();
--
--    //Need a temporary buffer tmp[] big enough to hold one element
--    void[16] buf = void;
--    void[] tmp;
--    if (element_size > buf.sizeof)
--    {
--        tmp = alloca(element_size)[0 .. element_size];
--    }
--    else
--        tmp = buf;
--
--    foreach (i; 0 .. count)
--    {
--        memcpy(tmp.ptr, p, element_size);
--        memcpy(p, value, element_size);
--        ti.postblit(p);
--        ti.destroy(tmp.ptr);
--        p += element_size;
--    }
--    return pstart;
--}
--
--/**
-- * Do construction of an array.
-- *      ti[count] p = value;
-- */
--extern (C) void* _d_arraysetctor(void* p, void* value, int count, TypeInfo ti)
--{
--    void* pstart = p;
--    auto element_size = ti.tsize();
--
--    try
--    {
--        foreach (i; 0 .. count)
--        {
--            // Copy construction is defined as bit copy followed by postblit.
--            memcpy(p, value, element_size);
--            ti.postblit(p);
--            p += element_size;
--        }
--    }
--    catch (Object o)
--    {
--        // Destroy, in reverse order, what we've constructed so far
--        while (p > pstart)
--        {
--            p -= element_size;
--            ti.destroy(p);
--        }
--
--        throw o;
--    }
--    return pstart;
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraybyte.d druntime/src/rt/arraybyte.d
---- druntime-old/src/rt/arraybyte.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arraybyte.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,1893 +0,0 @@
--/**
-- * Contains SSE2 and MMX versions of certain operations for char, byte, and
-- * ubyte ('a', 'g' and 'h' suffixes).
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arraybyte;
--
--import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 4;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.amd3dnow amd3dnow;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--alias byte T;
--
--extern (C):
--
--/* ======================================================================== */
--
--
--/***********************
-- * Computes:
-- *      a[] = b[] + value
-- */
--
--T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_g(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_g(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpAddSliceAssign_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1088% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startaddsse2u:
--                    add ESI, 64;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM2, [EAX+32];
--                    movdqu XMM3, [EAX+48];
--                    add EAX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM4;
--                    paddb XMM2, XMM4;
--                    paddb XMM3, XMM4;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startaddsse2a:
--                    add ESI, 64;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM2, [EAX+32];
--                    movdqa XMM3, [EAX+48];
--                    add EAX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM4;
--                    paddb XMM2, XMM4;
--                    paddb XMM3, XMM4;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 1000% faster
--        if (mmx() && a.length >= 32)
--        {
--            auto n = aptr + (a.length & ~31);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 4;
--            startaddmmx:
--                add ESI, 32;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                add EAX, 32;
--                paddb MM0, MM4;
--                paddb MM1, MM4;
--                paddb MM2, MM4;
--                paddb MM3, MM4;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startaddmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        /* trying to be fair and treat normal 32-bit cpu the same way as we do
--         * the SIMD units, with unrolled asm.  There's not enough registers,
--         * really.
--         */
--        else
--        if (a.length >= 4)
--        {
--
--            auto n = aptr + (a.length & ~3);
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov CL, value;
--
--                align 4;
--            startadd386:
--                add ESI, 4;
--                mov DX, [EAX];
--                mov BX, [EAX+2];
--                add EAX, 4;
--                add BL, CL;
--                add BH, CL;
--                add DL, CL;
--                add DH, CL;
--                mov [ESI   -4], DX;
--                mov [ESI+2 -4], BX;
--                cmp ESI, EDI;
--                jb startadd386;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ + value);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpAddSliceAssign_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_g(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_g(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceAddSliceAssign_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 5739% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                version (log) printf("\tsse2 unaligned\n");
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 8;
--                startaddlsse2u:
--                    add ESI, 64;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM2, [EAX+32];
--                    movdqu XMM3, [EAX+48];
--                    add EAX, 64;
--                    movdqu XMM4, [ECX];
--                    movdqu XMM5, [ECX+16];
--                    movdqu XMM6, [ECX+32];
--                    movdqu XMM7, [ECX+48];
--                    add ECX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM5;
--                    paddb XMM2, XMM6;
--                    paddb XMM3, XMM7;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddlsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                version (log) printf("\tsse2 aligned\n");
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 8;
--                startaddlsse2a:
--                    add ESI, 64;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM2, [EAX+32];
--                    movdqa XMM3, [EAX+48];
--                    add EAX, 64;
--                    movdqa XMM4, [ECX];
--                    movdqa XMM5, [ECX+16];
--                    movdqa XMM6, [ECX+32];
--                    movdqa XMM7, [ECX+48];
--                    add ECX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM5;
--                    paddb XMM2, XMM6;
--                    paddb XMM3, XMM7;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddlsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 4428% faster
--        if (mmx() && a.length >= 32)
--        {
--            version (log) printf("\tmmx\n");
--            auto n = aptr + (a.length & ~31);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startaddlmmx:
--                add ESI, 32;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                add EAX, 32;
--                movq MM4, [ECX];
--                movq MM5, [ECX+8];
--                movq MM6, [ECX+16];
--                movq MM7, [ECX+24];
--                add ECX, 32;
--                paddb MM0, MM4;
--                paddb MM1, MM5;
--                paddb MM2, MM6;
--                paddb MM3, MM7;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startaddlmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    version (log) if (aptr < aend) printf("\tbase\n");
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ + *cptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += value
-- */
--
--T[] _arrayExpSliceAddass_a(T[] a, T value)
--{
--    return _arrayExpSliceAddass_g(a, value);
--}
--
--T[] _arrayExpSliceAddass_h(T[] a, T value)
--{
--    return _arrayExpSliceAddass_g(a, value);
--}
--
--T[] _arrayExpSliceAddass_g(T[] a, T value)
--{
--    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1578% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--            l |= (l << 16);
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startaddasssse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM2, [ESI+32];
--                    movdqu XMM3, [ESI+48];
--                    add ESI, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM4;
--                    paddb XMM2, XMM4;
--                    paddb XMM3, XMM4;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddasssse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startaddasssse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM2, [ESI+32];
--                    movdqa XMM3, [ESI+48];
--                    add ESI, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM4;
--                    paddb XMM2, XMM4;
--                    paddb XMM3, XMM4;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddasssse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 1721% faster
--        if (mmx() && a.length >= 32)
--        {
--
--            auto n = aptr + (a.length & ~31);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 8;
--            startaddassmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                add ESI, 32;
--                paddb MM0, MM4;
--                paddb MM1, MM4;
--                paddb MM2, MM4;
--                paddb MM3, MM4;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startaddassmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceAddass_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += b[]
-- */
--
--T[] _arraySliceSliceAddass_a(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_g(a, b);
--}
--
--T[] _arraySliceSliceAddass_h(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_g(a, b);
--}
--
--T[] _arraySliceSliceAddass_g(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceAddass_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 4727% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 8;
--                startaddasslsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM2, [ESI+32];
--                    movdqu XMM3, [ESI+48];
--                    add ESI, 64;
--                    movdqu XMM4, [ECX];
--                    movdqu XMM5, [ECX+16];
--                    movdqu XMM6, [ECX+32];
--                    movdqu XMM7, [ECX+48];
--                    add ECX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM5;
--                    paddb XMM2, XMM6;
--                    paddb XMM3, XMM7;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddasslsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 8;
--                startaddasslsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM2, [ESI+32];
--                    movdqa XMM3, [ESI+48];
--                    add ESI, 64;
--                    movdqa XMM4, [ECX];
--                    movdqa XMM5, [ECX+16];
--                    movdqa XMM6, [ECX+32];
--                    movdqa XMM7, [ECX+48];
--                    add ECX, 64;
--                    paddb XMM0, XMM4;
--                    paddb XMM1, XMM5;
--                    paddb XMM2, XMM6;
--                    paddb XMM3, XMM7;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddasslsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 3059% faster
--        if (mmx() && a.length >= 32)
--        {
--
--            auto n = aptr + (a.length & ~31);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 8;
--            startaddasslmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                add ESI, 32;
--                movq MM4, [ECX];
--                movq MM5, [ECX+8];
--                movq MM6, [ECX+16];
--                movq MM7, [ECX+24];
--                add ECX, 32;
--                paddb MM0, MM4;
--                paddb MM1, MM5;
--                paddb MM2, MM6;
--                paddb MM3, MM7;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startaddasslmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddass_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--
--/***********************
-- * Computes:
-- *      a[] = b[] - value
-- */
--
--T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_g(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_g(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMinSliceAssign_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1189% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubsse2u:
--                    add ESI, 64;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM2, [EAX+32];
--                    movdqu XMM3, [EAX+48];
--                    add EAX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM4;
--                    psubb XMM2, XMM4;
--                    psubb XMM3, XMM4;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubsse2a:
--                    add ESI, 64;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM2, [EAX+32];
--                    movdqa XMM3, [EAX+48];
--                    add EAX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM4;
--                    psubb XMM2, XMM4;
--                    psubb XMM3, XMM4;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 1079% faster
--        if (mmx() && a.length >= 32)
--        {
--            auto n = aptr + (a.length & ~31);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 4;
--            startsubmmx:
--                add ESI, 32;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                add EAX, 32;
--                psubb MM0, MM4;
--                psubb MM1, MM4;
--                psubb MM2, MM4;
--                psubb MM3, MM4;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startsubmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm.  There's not enough registers, really.
--        else
--        if (a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov CL, value;
--
--                align 4;
--            startsub386:
--                add ESI, 4;
--                mov DX, [EAX];
--                mov BX, [EAX+2];
--                add EAX, 4;
--                sub BL, CL;
--                sub BH, CL;
--                sub DL, CL;
--                sub DH, CL;
--                mov [ESI   -4], DX;
--                mov [ESI+2 -4], BX;
--                cmp ESI, EDI;
--                jb startsub386;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ - value);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMinSliceAssign_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] = b[] - 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(b[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = value - b[]
-- */
--
--T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_g(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_g(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arrayExpSliceMinSliceAssign_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 8748% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubrsse2u:
--                    add ESI, 64;
--                    movdqa XMM5, XMM4;
--                    movdqa XMM6, XMM4;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    psubb XMM5, XMM0;
--                    psubb XMM6, XMM1;
--                    movdqu [ESI   -64], XMM5;
--                    movdqu [ESI+16-64], XMM6;
--                    movdqa XMM5, XMM4;
--                    movdqa XMM6, XMM4;
--                    movdqu XMM2, [EAX+32];
--                    movdqu XMM3, [EAX+48];
--                    add EAX, 64;
--                    psubb XMM5, XMM2;
--                    psubb XMM6, XMM3;
--                    movdqu [ESI+32-64], XMM5;
--                    movdqu [ESI+48-64], XMM6;
--                    cmp ESI, EDI;
--                    jb startsubrsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubrsse2a:
--                    add ESI, 64;
--                    movdqa XMM5, XMM4;
--                    movdqa XMM6, XMM4;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    psubb XMM5, XMM0;
--                    psubb XMM6, XMM1;
--                    movdqa [ESI   -64], XMM5;
--                    movdqa [ESI+16-64], XMM6;
--                    movdqa XMM5, XMM4;
--                    movdqa XMM6, XMM4;
--                    movdqa XMM2, [EAX+32];
--                    movdqa XMM3, [EAX+48];
--                    add EAX, 64;
--                    psubb XMM5, XMM2;
--                    psubb XMM6, XMM3;
--                    movdqa [ESI+32-64], XMM5;
--                    movdqa [ESI+48-64], XMM6;
--                    cmp ESI, EDI;
--                    jb startsubrsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 7397% faster
--        if (mmx() && a.length >= 32)
--        {
--            auto n = aptr + (a.length & ~31);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 4;
--            startsubrmmx:
--                add ESI, 32;
--                movq MM5, MM4;
--                movq MM6, MM4;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                psubb MM5, MM0;
--                psubb MM6, MM1;
--                movq [ESI   -32], MM5;
--                movq [ESI+8 -32], MM6;
--                movq MM5, MM4;
--                movq MM6, MM4;
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                add EAX, 32;
--                psubb MM5, MM2;
--                psubb MM6, MM3;
--                movq [ESI+16-32], MM5;
--                movq [ESI+24-32], MM6;
--                cmp ESI, EDI;
--                jb startsubrmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(value - *bptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinSliceAssign_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] = 6 - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(6 - b[i]))
--                {
--                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_g(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_g(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 5756% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 8;
--                startsublsse2u:
--                    add ESI, 64;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM2, [EAX+32];
--                    movdqu XMM3, [EAX+48];
--                    add EAX, 64;
--                    movdqu XMM4, [ECX];
--                    movdqu XMM5, [ECX+16];
--                    movdqu XMM6, [ECX+32];
--                    movdqu XMM7, [ECX+48];
--                    add ECX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM5;
--                    psubb XMM2, XMM6;
--                    psubb XMM3, XMM7;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsublsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 8;
--                startsublsse2a:
--                    add ESI, 64;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM2, [EAX+32];
--                    movdqa XMM3, [EAX+48];
--                    add EAX, 64;
--                    movdqa XMM4, [ECX];
--                    movdqa XMM5, [ECX+16];
--                    movdqa XMM6, [ECX+32];
--                    movdqa XMM7, [ECX+48];
--                    add ECX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM5;
--                    psubb XMM2, XMM6;
--                    psubb XMM3, XMM7;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsublsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 4428% faster
--        if (mmx() && a.length >= 32)
--        {
--            auto n = aptr + (a.length & ~31);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 8;
--            startsublmmx:
--                add ESI, 32;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                add EAX, 32;
--                movq MM4, [ECX];
--                movq MM5, [ECX+8];
--                movq MM6, [ECX+16];
--                movq MM7, [ECX+24];
--                add ECX, 32;
--                psubb MM0, MM4;
--                psubb MM1, MM5;
--                psubb MM2, MM6;
--                psubb MM3, MM7;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startsublmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ - *cptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= value
-- */
--
--T[] _arrayExpSliceMinass_a(T[] a, T value)
--{
--    return _arrayExpSliceMinass_g(a, value);
--}
--
--T[] _arrayExpSliceMinass_h(T[] a, T value)
--{
--    return _arrayExpSliceMinass_g(a, value);
--}
--
--T[] _arrayExpSliceMinass_g(T[] a, T value)
--{
--    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1577% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--            l |= (l << 16);
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubasssse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM2, [ESI+32];
--                    movdqu XMM3, [ESI+48];
--                    add ESI, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM4;
--                    psubb XMM2, XMM4;
--                    psubb XMM3, XMM4;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubasssse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 8;
--                startsubasssse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM2, [ESI+32];
--                    movdqa XMM3, [ESI+48];
--                    add ESI, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM4;
--                    psubb XMM2, XMM4;
--                    psubb XMM3, XMM4;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubasssse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 1577% faster
--        if (mmx() && a.length >= 32)
--        {
--
--            auto n = aptr + (a.length & ~31);
--
--            uint l = cast(ubyte) value;
--            l |= (l << 8);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 8;
--            startsubassmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                add ESI, 32;
--                psubb MM0, MM4;
--                psubb MM1, MM4;
--                psubb MM2, MM4;
--                psubb MM3, MM4;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startsubassmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[]
-- */
--
--T[] _arraySliceSliceMinass_a(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_g(a, b);
--}
--
--T[] _arraySliceSliceMinass_h(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_g(a, b);
--}
--
--T[] _arraySliceSliceMinass_g(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMinass_g()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 4800% faster
--        if (sse2() && a.length >= 64)
--        {
--            auto n = aptr + (a.length & ~63);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 8;
--                startsubasslsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM2, [ESI+32];
--                    movdqu XMM3, [ESI+48];
--                    add ESI, 64;
--                    movdqu XMM4, [ECX];
--                    movdqu XMM5, [ECX+16];
--                    movdqu XMM6, [ECX+32];
--                    movdqu XMM7, [ECX+48];
--                    add ECX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM5;
--                    psubb XMM2, XMM6;
--                    psubb XMM3, XMM7;
--                    movdqu [ESI   -64], XMM0;
--                    movdqu [ESI+16-64], XMM1;
--                    movdqu [ESI+32-64], XMM2;
--                    movdqu [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubasslsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 8;
--                startsubasslsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM2, [ESI+32];
--                    movdqa XMM3, [ESI+48];
--                    add ESI, 64;
--                    movdqa XMM4, [ECX];
--                    movdqa XMM5, [ECX+16];
--                    movdqa XMM6, [ECX+32];
--                    movdqa XMM7, [ECX+48];
--                    add ECX, 64;
--                    psubb XMM0, XMM4;
--                    psubb XMM1, XMM5;
--                    psubb XMM2, XMM6;
--                    psubb XMM3, XMM7;
--                    movdqa [ESI   -64], XMM0;
--                    movdqa [ESI+16-64], XMM1;
--                    movdqa [ESI+32-64], XMM2;
--                    movdqa [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsubasslsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 3107% faster
--        if (mmx() && a.length >= 32)
--        {
--
--            auto n = aptr + (a.length & ~31);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 8;
--            startsubasslmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                add ESI, 32;
--                movq MM4, [ECX];
--                movq MM5, [ECX+8];
--                movq MM6, [ECX+16];
--                movq MM7, [ECX+24];
--                add ECX, 32;
--                psubb MM0, MM4;
--                psubb MM1, MM5;
--                psubb MM2, MM6;
--                psubb MM3, MM7;
--                movq [ESI   -32], MM0;
--                movq [ESI+8 -32], MM1;
--                movq [ESI+16-32], MM2;
--                movq [ESI+24-32], MM3;
--                cmp ESI, EDI;
--                jb startsubasslmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinass_g unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycast.d druntime/src/rt/arraycast.d
---- druntime-old/src/rt/arraycast.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arraycast.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,94 +0,0 @@
--/**
-- * Implementation of array cast support routines.
-- *
-- * Copyright: Copyright Digital Mars 2004 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, Sean Kelly
-- *
-- *          Copyright Digital Mars 2004 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arraycast;
--
--/******************************************
-- * Runtime helper to convert dynamic array of one
-- * type to dynamic array of another.
-- * Adjusts the length of the array.
-- * Throws exception if new length is not aligned.
-- */
--
--extern (C)
--
--void[] _d_arraycast(size_t tsize, size_t fsize, void[] a)
--{
--    auto length = a.length;
--
--    auto nbytes = length * fsize;
--    if (nbytes % tsize != 0)
--    {
--    throw new Exception("array cast misalignment");
--    }
--    length = nbytes / tsize;
--    *cast(size_t *)&a = length; // jam new length
--    return a;
--}
--
--unittest
--{
--    byte[int.sizeof * 3] b;
--    int[] i;
--    short[] s;
--
--    i = cast(int[])b;
--    assert(i.length == 3);
--
--    s = cast(short[])b;
--    assert(s.length == 6);
--
--    s = cast(short[])i;
--    assert(s.length == 6);
--}
--
--/******************************************
-- * Runtime helper to convert dynamic array of bits
-- * dynamic array of another.
-- * Adjusts the length of the array.
-- * Throws exception if new length is not aligned.
-- */
--
--version (none)
--{
--extern (C)
--
--void[] _d_arraycast_frombit(uint tsize, void[] a)
--{
--    uint length = a.length;
--
--    if (length & 7)
--    {
--    throw new Exception("bit[] array cast misalignment");
--    }
--    length /= 8 * tsize;
--    *cast(size_t *)&a = length; // jam new length
--    return a;
--}
--
--unittest
--{
--    version (D_Bits)
--    {
--    bit[int.sizeof * 3 * 8] b;
--    int[] i;
--    short[] s;
--
--    i = cast(int[])b;
--    assert(i.length == 3);
--
--    s = cast(short[])b;
--    assert(s.length == 6);
--    }
--}
--
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycat.d druntime/src/rt/arraycat.d
---- druntime-old/src/rt/arraycat.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arraycat.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,42 +0,0 @@
--/**
-- * Implementation of array copy support routines.
-- *
-- * Copyright: Copyright Digital Mars 2004 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, Sean Kelly
-- *
-- *          Copyright Digital Mars 2004 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arraycat;
--
--private
--{
--    import core.stdc.string;
--    debug import core.stdc.stdio;
--}
--
--extern (C):
--
--byte[] _d_arraycopy(size_t size, byte[] from, byte[] to)
--{
--    debug printf("f = %p,%d, t = %p,%d, size = %d\n",
--                 from.ptr, from.length, to.ptr, to.length, size);
--
--    if (to.length != from.length)
--    {
--        throw new Exception("lengths don't match for array copy");
--    }
--    else if (to.ptr + to.length * size <= from.ptr ||
--             from.ptr + from.length * size <= to.ptr)
--    {
--        memcpy(to.ptr, from.ptr, to.length * size);
--    }
--    else
--    {
--        throw new Exception("overlapping array copy");
--    }
--    return to;
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraydouble.d druntime/src/rt/arraydouble.d
---- druntime-old/src/rt/arraydouble.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arraydouble.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,1720 +0,0 @@
--/**
-- * Contains SSE2 and MMX versions of certain operations for double.
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arraydouble;
--
--private import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 5;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.amd3dnow amd3dnow;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--/* Performance figures measured by Burton Radons
-- */
--
--alias double T;
--
--extern (C):
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 333% faster
--        if (sse2() && b.length >= 16)
--        {
--            auto n = aptr + (b.length & ~15);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr; // left operand
--                mov ECX, cptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n;    // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add ESI, 64;
--                addpd XMM0, XMM4;
--                addpd XMM1, XMM5;
--                addpd XMM2, XMM6;
--                addpd XMM3, XMM7;
--                add ECX, 64;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ = *bptr++ + *cptr++;
--
--    return a;
--}
--
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 324% faster
--        if (sse2() && b.length >= 8)
--        {
--            auto n = aptr + (b.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr; // left operand
--                mov ECX, cptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n;    // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add ESI, 64;
--                subpd XMM0, XMM4;
--                subpd XMM1, XMM5;
--                subpd XMM2, XMM6;
--                subpd XMM3, XMM7;
--                add ECX, 64;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ = *bptr++ - *cptr++;
--
--    return a;
--}
--
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + value
-- */
--
--T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpAddSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 305% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                addpd XMM0, XMM4;
--                addpd XMM1, XMM4;
--                addpd XMM2, XMM4;
--                addpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ + value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpAddSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += value
-- */
--
--T[] _arrayExpSliceAddass_d(T[] a, T value)
--{
--    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 114% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--            if (aptr < n)
--
--            // Unaligned case
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloopa:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                addpd XMM0, XMM4;
--                addpd XMM1, XMM4;
--                addpd XMM2, XMM4;
--                addpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopa;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceAddass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += b[]
-- */
--
--T[] _arraySliceSliceAddass_d(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceAddass_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 183% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov ECX, bptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n; // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add ECX, 64;
--                addpd XMM0, XMM4;
--                addpd XMM1, XMM5;
--                addpd XMM2, XMM6;
--                addpd XMM3, XMM7;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - value
-- */
--
--T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMinSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 305% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                subpd XMM0, XMM4;
--                subpd XMM1, XMM4;
--                subpd XMM2, XMM4;
--                subpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ - value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMinSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = value - b[]
-- */
--
--T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arrayExpSliceMinSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 66% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movapd XMM5, XMM4;
--                movapd XMM6, XMM4;
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                subpd XMM5, XMM0;
--                subpd XMM6, XMM1;
--                movupd [ESI+ 0-64], XMM5;
--                movupd [ESI+16-64], XMM6;
--                movapd XMM5, XMM4;
--                movapd XMM6, XMM4;
--                subpd XMM5, XMM2;
--                subpd XMM6, XMM3;
--                movupd [ESI+32-64], XMM5;
--                movupd [ESI+48-64], XMM6;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = value - *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = 6 - a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(6 - a[i]))
--                {
--                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= value
-- */
--
--T[] _arrayExpSliceMinass_d(T[] a, T value)
--{
--    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 115% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--            if (aptr < n)
--
--            // Unaligned case
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloopa:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                subpd XMM0, XMM4;
--                subpd XMM1, XMM4;
--                subpd XMM2, XMM4;
--                subpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopa;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[]
-- */
--
--T[] _arraySliceSliceMinass_d(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMinass_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 183% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov ECX, bptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n; // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add ECX, 64;
--                subpd XMM0, XMM4;
--                subpd XMM1, XMM5;
--                subpd XMM2, XMM6;
--                subpd XMM3, XMM7;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMulSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 304% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM4;
--                mulpd XMM2, XMM4;
--                mulpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ * value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * c[]
-- */
--
--T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceMulSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 329% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr; // left operand
--                mov ECX, cptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n; // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add ESI, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add EAX, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM5;
--                mulpd XMM2, XMM6;
--                mulpd XMM3, XMM7;
--                add ECX, 64;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ * *cptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * b[i]))
--                {
--                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= value
-- */
--
--T[] _arrayExpSliceMulass_d(T[] a, T value)
--{
--    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 109% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--            if (aptr < n)
--
--            // Unaligned case
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, value;
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloopa:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM4;
--                mulpd XMM2, XMM4;
--                mulpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopa;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ *= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= b[]
-- */
--
--T[] _arraySliceSliceMulass_d(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMulass_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 205% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov ECX, bptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n; // end comparison
--
--                align 8;
--            startsseloopb:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                movupd XMM4, [ECX];
--                movupd XMM5, [ECX+16];
--                movupd XMM6, [ECX+32];
--                movupd XMM7, [ECX+48];
--                add ECX, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM5;
--                mulpd XMM2, XMM6;
--                mulpd XMM3, XMM7;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ *= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] / value
-- */
--
--T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpDivSliceAssign_d()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    /* Multiplying by the reciprocal is faster, but does
--     * not produce as accurate an answer.
--     */
--    T recip = cast(T)1 / value;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 299% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, recip;
--                //movsd XMM4, value
--                //rcpsd XMM4, XMM4
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movupd XMM0, [EAX];
--                movupd XMM1, [EAX+16];
--                movupd XMM2, [EAX+32];
--                movupd XMM3, [EAX+48];
--                add EAX, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM4;
--                mulpd XMM2, XMM4;
--                mulpd XMM3, XMM4;
--                //divpd XMM0, XMM4;
--                //divpd XMM1, XMM4;
--                //divpd XMM2, XMM4;
--                //divpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--    {
--        *aptr++ = *bptr++ / value;
--        //*aptr++ = *bptr++ * recip;
--    }
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpDivSliceAssign_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] / 8;
--
--            for (int i = 0; i < dim; i++)
--            {
--                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
--                if (c[i] != cast(T)(a[i] / 8))
--                {
--                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] /= value
-- */
--
--T[] _arrayExpSliceDivass_d(T[] a, T value)
--{
--    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    /* Multiplying by the reciprocal is faster, but does
--     * not produce as accurate an answer.
--     */
--    T recip = cast(T)1 / value;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 version is 65% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            // Unaligned case
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movsd XMM4, recip;
--                //movsd XMM4, value
--                //rcpsd XMM4, XMM4
--                shufpd XMM4, XMM4, 0;
--
--                align 8;
--            startsseloopa:
--                movupd XMM0, [ESI];
--                movupd XMM1, [ESI+16];
--                movupd XMM2, [ESI+32];
--                movupd XMM3, [ESI+48];
--                add ESI, 64;
--                mulpd XMM0, XMM4;
--                mulpd XMM1, XMM4;
--                mulpd XMM2, XMM4;
--                mulpd XMM3, XMM4;
--                //divpd XMM0, XMM4;
--                //divpd XMM1, XMM4;
--                //divpd XMM2, XMM4;
--                //divpd XMM3, XMM4;
--                movupd [ESI+ 0-64], XMM0;
--                movupd [ESI+16-64], XMM1;
--                movupd [ESI+32-64], XMM2;
--                movupd [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopa;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ *= recip;
--
--    return a;
--}
--
--
--unittest
--{
--    printf("_arrayExpSliceDivass_d unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] /= 8;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] / 8))
--                {
--                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[] * value
-- */
--
--T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAddass_d(a, -value, b);
--}
--
--/***********************
-- * Computes:
-- *      a[] += b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
--in
--{
--        assert(a.length == b.length);
--        assert(disjoint(a, b));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ += *bptr++ * value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAddass_d unittest\n");
--
--    cpuid = 1;
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 1; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] += a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
--                if (c[i] != cast(T)(b[i] + a[i] * 6))
--                {
--                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayfloat.d druntime/src/rt/arrayfloat.d
---- druntime-old/src/rt/arrayfloat.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arrayfloat.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,1435 +0,0 @@
--/**
-- * Contains SSE2 and MMX versions of certain operations for float.
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arrayfloat;
--
--private import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 5;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.amd3dnow amd3dnow;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--alias float T;
--
--extern (C):
--
--/* ======================================================================== */
--/* ======================================================================== */
--
--/* template for the case
-- *   a[] = b[] ? c[]
-- * with some binary operator ?
-- */
--private template CodeGenSliceSliceOp(string opD, string opSSE, string op3DNow)
--{
--    const CodeGenSliceSliceOp = `
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE version is 834% faster
--        if (sse() && b.length >= 16)
--        {
--            auto n = aptr + (b.length & ~15);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr; // left operand
--                mov ECX, cptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n;    // end comparison
--
--                align 8;
--            startsseloopb:
--                movups XMM0, [EAX];
--                movups XMM1, [EAX+16];
--                movups XMM2, [EAX+32];
--                movups XMM3, [EAX+48];
--                add EAX, 64;
--                movups XMM4, [ECX];
--                movups XMM5, [ECX+16];
--                movups XMM6, [ECX+32];
--                movups XMM7, [ECX+48];
--                add ESI, 64;
--                ` ~ opSSE ~ ` XMM0, XMM4;
--                ` ~ opSSE ~ ` XMM1, XMM5;
--                ` ~ opSSE ~ ` XMM2, XMM6;
--                ` ~ opSSE ~ ` XMM3, XMM7;
--                add ECX, 64;
--                movups [ESI+ 0-64], XMM0;
--                movups [ESI+16-64], XMM1;
--                movups [ESI+32-64], XMM2;
--                movups [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--        else
--        // 3DNow! version is only 13% faster
--        if (amd3dnow() && b.length >= 8)
--        {
--            auto n = aptr + (b.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr; // destination operand
--                mov EDI, n;    // end comparison
--                mov EAX, bptr; // left operand
--                mov ECX, cptr; // right operand
--
--                align 4;
--            start3dnow:
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                ` ~ op3DNow ~ ` MM0, [ECX];
--                ` ~ op3DNow ~ ` MM1, [ECX+8];
--                ` ~ op3DNow ~ ` MM2, [ECX+16];
--                ` ~ op3DNow ~ ` MM3, [ECX+24];
--                movq [ESI], MM0;
--                movq [ESI+8], MM1;
--                movq [ESI+16], MM2;
--                movq [ESI+24], MM3;
--                add ECX, 32;
--                add ESI, 32;
--                add EAX, 32;
--                cmp ESI, EDI;
--                jb start3dnow;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++;
--
--    return a;`;
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd"));
--}
--
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub"));
--}
--
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * c[]
-- */
--
--T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul"));
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * b[i]))
--                {
--                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/* template for the case
-- *   a[] ?= value
-- * with some binary operator ?
-- */
--private template CodeGenExpSliceOpAssign(string opD, string opSSE, string op3DNow)
--{
--    const CodeGenExpSliceOpAssign = `
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        if (sse() && a.length >= 16)
--        {
--            auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a
--            auto aaend = cast(T*)((cast(uint)aend) & ~15);      // end of paragraph-aligned slice of a
--
--            int numAligned = cast(int)(aaend - aabeg);          // how many floats are in the aligned slice?
--
--            // are there at least 16 floats in the paragraph-aligned slice?
--            // otherwise we can't do anything with SSE.
--            if (numAligned >= 16)
--            {
--                aaend = aabeg + (numAligned & ~15);     // make sure the slice is actually a multiple of 16 floats long
--
--                // process values up to aligned slice one by one
--                while (aptr < aabeg)
--                    *aptr++ ` ~ opD ~ ` value;
--
--                // process aligned slice with fast SSE operations
--                asm
--                {
--                    mov ESI, aabeg;
--                    mov EDI, aaend;
--                    movss XMM4, value;
--                    shufps XMM4, XMM4, 0;
--
--                    align 8;
--                startsseloopa:
--                    movaps XMM0, [ESI];
--                    movaps XMM1, [ESI+16];
--                    movaps XMM2, [ESI+32];
--                    movaps XMM3, [ESI+48];
--                    add ESI, 64;
--                    ` ~ opSSE ~ ` XMM0, XMM4;
--                    ` ~ opSSE ~ ` XMM1, XMM4;
--                    ` ~ opSSE ~ ` XMM2, XMM4;
--                    ` ~ opSSE ~ ` XMM3, XMM4;
--                    movaps [ESI+ 0-64], XMM0;
--                    movaps [ESI+16-64], XMM1;
--                    movaps [ESI+32-64], XMM2;
--                    movaps [ESI+48-64], XMM3;
--                    cmp ESI, EDI;
--                    jb startsseloopa;
--                }
--                aptr = aaend;
--            }
--        }
--        else
--        // 3DNow! version is 63% faster
--        if (amd3dnow() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            ulong w = *cast(uint *) &value;
--            ulong v = w | (w << 32L);
--
--            asm
--            {
--                mov ESI, dword ptr [aptr];
--                mov EDI, dword ptr [n];
--                movq MM4, qword ptr [v];
--
--                align 8;
--            start:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                ` ~ op3DNow ~ ` MM0, MM4;
--                ` ~ op3DNow ~ ` MM1, MM4;
--                ` ~ op3DNow ~ ` MM2, MM4;
--                ` ~ op3DNow ~ ` MM3, MM4;
--                movq [ESI], MM0;
--                movq [ESI+8], MM1;
--                movq [ESI+16], MM2;
--                movq [ESI+24], MM3;
--                add ESI, 32;
--                cmp ESI, EDI;
--                jb start;
--
--                emms;
--                mov dword ptr [aptr], ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ ` ~ opD ~ ` value;
--
--    return a;`;
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += value
-- */
--
--T[] _arrayExpSliceAddass_f(T[] a, T value)
--{
--    mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd"));
--}
--
--unittest
--{
--    printf("_arrayExpSliceAddass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= value
-- */
--
--T[] _arrayExpSliceMinass_f(T[] a, T value)
--{
--    mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub"));
--}
--
--unittest
--{
--    printf("_arrayExpSliceminass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= value
-- */
--
--T[] _arrayExpSliceMulass_f(T[] a, T value)
--{
--    mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul"));
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] /= value
-- */
--
--T[] _arrayExpSliceDivass_f(T[] a, T value)
--{
--    return _arrayExpSliceMulass_f(a, 1f / value);
--}
--
--unittest
--{
--    printf("_arrayExpSliceDivass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] /= 8;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] / 8))
--                {
--                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--/* ======================================================================== */
--
--/* template for the case
-- *   a[] = b[] ? value
-- * with some binary operator ?
-- */
--private template CodeGenSliceExpOp(string opD, string opSSE, string op3DNow)
--{
--    const CodeGenSliceExpOp = `
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE version is 665% faster
--        if (sse() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movss XMM4, value;
--                shufps XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movups XMM0, [EAX];
--                movups XMM1, [EAX+16];
--                movups XMM2, [EAX+32];
--                movups XMM3, [EAX+48];
--                add EAX, 64;
--                ` ~ opSSE ~ ` XMM0, XMM4;
--                ` ~ opSSE ~ ` XMM1, XMM4;
--                ` ~ opSSE ~ ` XMM2, XMM4;
--                ` ~ opSSE ~ ` XMM3, XMM4;
--                movups [ESI+ 0-64], XMM0;
--                movups [ESI+16-64], XMM1;
--                movups [ESI+32-64], XMM2;
--                movups [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        else
--        // 3DNow! version is 69% faster
--        if (amd3dnow() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            ulong w = *cast(uint *) &value;
--            ulong v = w | (w << 32L);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM4, qword ptr [v];
--
--                align 8;
--            start3dnow:
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                ` ~ op3DNow ~ ` MM0, MM4;
--                ` ~ op3DNow ~ ` MM1, MM4;
--                ` ~ op3DNow ~ ` MM2, MM4;
--                ` ~ op3DNow ~ ` MM3, MM4;
--                movq [ESI],    MM0;
--                movq [ESI+8],  MM1;
--                movq [ESI+16], MM2;
--                movq [ESI+24], MM3;
--                add ESI, 32;
--                add EAX, 32;
--                cmp ESI, EDI;
--                jb start3dnow;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ ` ~ opD ~ ` value;
--
--    return a;`;
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + value
-- */
--
--T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceExpOp!("+", "addps", "pfadd"));
--}
--
--unittest
--{
--    printf("_arraySliceExpAddSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - value
-- */
--
--T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceExpOp!("-", "subps", "pfsub"));
--}
--
--unittest
--{
--    printf("_arraySliceExpMinSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul"));
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] / value
-- */
--
--T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAssign_f(a, 1f/value, b);
--}
--
--unittest
--{
--    printf("_arraySliceExpDivSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] / 8;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] / 8))
--                {
--                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--/* ======================================================================== */
--
--private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow)
--{
--    const CodeGenSliceOpAssign = `
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE version is 468% faster
--        if (sse() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            // Unaligned case
--            asm
--            {
--                mov ECX, bptr; // right operand
--                mov ESI, aptr; // destination operand
--                mov EDI, n; // end comparison
--
--                align 8;
--            startsseloopb:
--                movups XMM0, [ESI];
--                movups XMM1, [ESI+16];
--                movups XMM2, [ESI+32];
--                movups XMM3, [ESI+48];
--                add ESI, 64;
--                movups XMM4, [ECX];
--                movups XMM5, [ECX+16];
--                movups XMM6, [ECX+32];
--                movups XMM7, [ECX+48];
--                add ECX, 64;
--                ` ~ opSSE ~ ` XMM0, XMM4;
--                ` ~ opSSE ~ ` XMM1, XMM5;
--                ` ~ opSSE ~ ` XMM2, XMM6;
--                ` ~ opSSE ~ ` XMM3, XMM7;
--                movups [ESI+ 0-64], XMM0;
--                movups [ESI+16-64], XMM1;
--                movups [ESI+32-64], XMM2;
--                movups [ESI+48-64], XMM3;
--                cmp ESI, EDI;
--                jb startsseloopb;
--
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--        else
--        // 3DNow! version is 57% faster
--        if (amd3dnow() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, dword ptr [aptr]; // destination operand
--                mov EDI, dword ptr [n];    // end comparison
--                mov ECX, dword ptr [bptr]; // right operand
--
--                align 4;
--            start3dnow:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                movq MM2, [ESI+16];
--                movq MM3, [ESI+24];
--                ` ~ op3DNow ~ ` MM0, [ECX];
--                ` ~ op3DNow ~ ` MM1, [ECX+8];
--                ` ~ op3DNow ~ ` MM2, [ECX+16];
--                ` ~ op3DNow ~ ` MM3, [ECX+24];
--                movq [ESI], MM0;
--                movq [ESI+8], MM1;
--                movq [ESI+16], MM2;
--                movq [ESI+24], MM3;
--                add ESI, 32;
--                add ECX, 32;
--                cmp ESI, EDI;
--                jb start3dnow;
--
--                emms;
--                mov dword ptr [aptr], ESI;
--                mov dword ptr [bptr], ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ ` ~ opD ~ ` *bptr++;
--
--    return a;`;
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += b[]
-- */
--
--T[] _arraySliceSliceAddass_f(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd"));
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] += b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[]
-- */
--
--T[] _arraySliceSliceMinass_f(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub"));
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= b[]
-- */
--
--T[] _arraySliceSliceMulass_f(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul"));
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            c[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = value - b[]
-- */
--
--T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arrayExpSliceMinSliceAssign_f()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE version is 690% faster
--        if (sse() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            // Unaligned case
--            asm
--            {
--                mov EAX, bptr;
--                mov ESI, aptr;
--                mov EDI, n;
--                movss XMM4, value;
--                shufps XMM4, XMM4, 0;
--
--                align 8;
--            startsseloop:
--                add ESI, 64;
--                movaps XMM5, XMM4;
--                movaps XMM6, XMM4;
--                movups XMM0, [EAX];
--                movups XMM1, [EAX+16];
--                movups XMM2, [EAX+32];
--                movups XMM3, [EAX+48];
--                add EAX, 64;
--                subps XMM5, XMM0;
--                subps XMM6, XMM1;
--                movups [ESI+ 0-64], XMM5;
--                movups [ESI+16-64], XMM6;
--                movaps XMM5, XMM4;
--                movaps XMM6, XMM4;
--                subps XMM5, XMM2;
--                subps XMM6, XMM3;
--                movups [ESI+32-64], XMM5;
--                movups [ESI+48-64], XMM6;
--                cmp ESI, EDI;
--                jb startsseloop;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        else
--        // 3DNow! version is 67% faster
--        if (amd3dnow() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            ulong w = *cast(uint *) &value;
--            ulong v = w | (w << 32L);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM4, qword ptr [v];
--
--                align 8;
--            start3dnow:
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                movq MM2, [EAX+16];
--                movq MM3, [EAX+24];
--                pfsubr MM0, MM4;
--                pfsubr MM1, MM4;
--                pfsubr MM2, MM4;
--                pfsubr MM3, MM4;
--                movq [ESI], MM0;
--                movq [ESI+8], MM1;
--                movq [ESI+16], MM2;
--                movq [ESI+24], MM3;
--                add ESI, 32;
--                add EAX, 32;
--                cmp ESI, EDI;
--                jb start3dnow;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = value - *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = 6 - a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(6 - a[i]))
--                {
--                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[] * value
-- */
--
--T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAddass_f(a, -value, b);
--}
--
--/***********************
-- * Computes:
-- *      a[] += b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
--in
--{
--        assert(a.length == b.length);
--        assert(disjoint(a, b));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ += *bptr++ * value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAddass_f unittest\n");
--
--    cpuid = 1;
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 1; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] += a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
--                if (c[i] != cast(T)(b[i] + a[i] * 6))
--                {
--                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayint.d druntime/src/rt/arrayint.d
---- druntime-old/src/rt/arrayint.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arrayint.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,2430 +0,0 @@
--/**
-- * Contains MMX versions of certain operations for dchar, int, and uint ('w',
-- * 'i' and 'k' suffixes).
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arrayint;
--
--private import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 4;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.amd3dnow amd3dnow;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--alias int T;
--
--extern (C):
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + value
-- */
--
--T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpAddSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 380% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 298% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                paddd MM0, MM2;
--                paddd MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        else
--        if (a.length >= 2)
--        {
--            auto n = aptr + (a.length & ~1);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov EDX, value;
--
--                align 4;
--            start386:
--                add ESI, 8;
--                mov EBX, [EAX];
--                mov ECX, [EAX+4];
--                add EAX, 8;
--                add EBX, EDX;
--                add ECX, EDX;
--                mov [ESI  -8], EBX;
--                mov [ESI+4-8], ECX;
--                cmp ESI, EDI;
--                jb start386;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ + value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpAddSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceAddSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1710% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 995% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM2, [ECX];
--                movq MM1, [EAX+8];
--                movq MM3, [ECX+8];
--                add EAX, 16;
--                add ECX, 16;
--                paddd MM0, MM2;
--                paddd MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--normal:
--    while (aptr < aend)
--        *aptr++ = *bptr++ + *cptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += value
-- */
--
--T[] _arrayExpSliceAddass_w(T[] a, T value)
--{
--    return _arrayExpSliceAddass_i(a, value);
--}
--
--T[] _arrayExpSliceAddass_k(T[] a, T value)
--{
--    return _arrayExpSliceAddass_i(a, value);
--}
--
--T[] _arrayExpSliceAddass_i(T[] a, T value)
--{
--    //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 83% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 81% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                paddd MM0, MM2;
--                paddd MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--        else
--        if (a.length >= 2)
--        {
--            auto n = aptr + (a.length & ~1);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EDX, value;
--
--                align 4;
--            start386:
--                mov EBX, [ESI];
--                mov ECX, [ESI+4];
--                add ESI, 8;
--                add EBX, EDX;
--                add ECX, EDX;
--                mov [ESI  -8], EBX;
--                mov [ESI+4-8], ECX;
--                cmp ESI, EDI;
--                jb start386;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceAddass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            a[] += 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(c[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += b[]
-- */
--
--T[] _arraySliceSliceAddass_w(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_i(a, b);
--}
--
--T[] _arraySliceSliceAddass_k(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_i(a, b);
--}
--
--T[] _arraySliceSliceAddass_i(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceAddass_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 695% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    paddd XMM0, XMM2;
--                    paddd XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 471% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM2, [ECX];
--                movq MM1, [ESI+8];
--                movq MM3, [ECX+8];
--                add ESI, 16;
--                add ECX, 16;
--                paddd MM0, MM2;
--                paddd MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--normal:
--    while (aptr < aend)
--        *aptr++ += *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] += a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(b[i] + a[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - value
-- */
--
--T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMinSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 400% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 315% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                psubd MM0, MM2;
--                psubd MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--        else
--        if (a.length >= 2)
--        {
--            auto n = aptr + (a.length & ~1);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov EDX, value;
--
--                align 4;
--            start386:
--                add ESI, 8;
--                mov EBX, [EAX];
--                mov ECX, [EAX+4];
--                add EAX, 8;
--                sub EBX, EDX;
--                sub ECX, EDX;
--                mov [ESI  -8], EBX;
--                mov [ESI+4-8], ECX;
--                cmp ESI, EDI;
--                jb start386;
--
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ - value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMinSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = value - b[]
-- */
--
--T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_i(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_i(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arrayExpSliceMinSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1812% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 4;
--                startaddsse2u:
--                    add ESI, 32;
--                    movdqu XMM2, [EAX];
--                    movdqu XMM3, [EAX+16];
--                    movdqa XMM0, XMM4;
--                    movdqa XMM1, XMM4;
--                    add EAX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM4, l;
--                    pshufd XMM4, XMM4, 0;
--
--                    align 4;
--                startaddsse2a:
--                    add ESI, 32;
--                    movdqa XMM2, [EAX];
--                    movdqa XMM3, [EAX+16];
--                    movdqa XMM0, XMM4;
--                    movdqa XMM1, XMM4;
--                    add EAX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 1077% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM4, l;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM2, [EAX];
--                movq MM3, [EAX+8];
--                movq MM0, MM4;
--                movq MM1, MM4;
--                add EAX, 16;
--                psubd MM0, MM2;
--                psubd MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = value - *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = 6 - a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(6 - a[i]))
--                {
--                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1721% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 1002% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM2, [ECX];
--                movq MM1, [EAX+8];
--                movq MM3, [ECX+8];
--                add EAX, 16;
--                add ECX, 16;
--                psubd MM0, MM2;
--                psubd MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ - *cptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= value
-- */
--
--T[] _arrayExpSliceMinass_w(T[] a, T value)
--{
--    return _arrayExpSliceMinass_i(a, value);
--}
--
--T[] _arrayExpSliceMinass_k(T[] a, T value)
--{
--    return _arrayExpSliceMinass_i(a, value);
--}
--
--T[] _arrayExpSliceMinass_i(T[] a, T value)
--{
--    //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 81% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 81% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                psubd MM0, MM2;
--                psubd MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--        else
--        if (a.length >= 2)
--        {
--            auto n = aptr + (a.length & ~1);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EDX, value;
--
--                align 4;
--            start386:
--                mov EBX, [ESI];
--                mov ECX, [ESI+4];
--                add ESI, 8;
--                sub EBX, EDX;
--                sub ECX, EDX;
--                mov [ESI  -8], EBX;
--                mov [ESI+4-8], ECX;
--                cmp ESI, EDI;
--                jb start386;
--
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            a[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(c[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[]
-- */
--
--T[] _arraySliceSliceMinass_w(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_i(a, b);
--}
--
--T[] _arraySliceSliceMinass_k(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_i(a, b);
--}
--
--T[] _arraySliceSliceMinass_i(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMinass_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 731% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    psubd XMM0, XMM2;
--                    psubd XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 441% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM2, [ECX];
--                movq MM1, [ESI+8];
--                movq MM3, [ECX+8];
--                add ESI, 16;
--                add ECX, 16;
--                psubd MM0, MM2;
--                psubd MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] -= a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(b[i] - a[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAssign_i(a, value, b);
--}
--
--T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMulSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--  version (none)        // multiplying a pair is not supported by MMX
--  {
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1380% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        {
--        // MMX version is 1380% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                pmuludq MM0, MM2;       // only multiplies low 32 bits
--                pmuludq MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--        }
--  }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ * value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * c[]
-- */
--
--T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMulSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMulSliceAssign_i(a, c, b);
--}
--
--T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceMulSliceAssign_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--  version (none)
--  {
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 1407% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--               }
--            }
--        }
--        else
--        // MMX version is 1029% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM2, [ECX];
--                movq MM1, [EAX+8];
--                movq MM3, [ECX+8];
--                add EAX, 16;
--                add ECX, 16;
--                pmuludq MM0, MM2;
--                pmuludq MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--  }
--
--    while (aptr < aend)
--        *aptr++ = *bptr++ * *cptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulSliceAssign_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * b[i]))
--                {
--                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= value
-- */
--
--T[] _arrayExpSliceMulass_w(T[] a, T value)
--{
--    return _arrayExpSliceMulass_i(a, value);
--}
--
--T[] _arrayExpSliceMulass_k(T[] a, T value)
--{
--    return _arrayExpSliceMulass_i(a, value);
--}
--
--T[] _arrayExpSliceMulass_i(T[] a, T value)
--{
--    //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--  version (none)
--  {
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 400% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = value;
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 402% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movq MM2, l;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                pmuludq MM0, MM2;
--                pmuludq MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--  }
--
--    while (aptr < aend)
--        *aptr++ *= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = a[];
--            a[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(b[i] * 6))
--                {
--                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= b[]
-- */
--
--T[] _arraySliceSliceMulass_w(T[] a, T[] b)
--{
--    return _arraySliceSliceMulass_i(a, b);
--}
--
--T[] _arraySliceSliceMulass_k(T[] a, T[] b)
--{
--    return _arraySliceSliceMulass_i(a, b);
--}
--
--T[] _arraySliceSliceMulass_i(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMulass_i()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--  version (none)
--  {
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 873% faster
--        if (sse2() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    pmuludq XMM0, XMM2;
--                    pmuludq XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--               }
--            }
--        }
--/+ BUG: comment out this section until we figure out what is going
--   wrong with the invalid pshufd instructions.
--
--        else
--        // MMX version is 573% faster
--        if (mmx() && a.length >= 4)
--        {
--            auto n = aptr + (a.length & ~3);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM2, [ECX];
--                movq MM1, [ESI+8];
--                movq MM3, [ECX+8];
--                pxor MM4, MM4;
--                pxor MM5, MM5;
--                punpckldq MM4, MM0;
--                punpckldq MM5, MM2;
--                add ESI, 16;
--                add ECX, 16;
--                pmuludq MM4, MM5;
--                pshufd MM4, MM4, 8;     // ?
--                movq [ESI  -16], MM4;
--                pxor MM4, MM4;
--                pxor MM5, MM5;
--                punpckldq MM4, MM1;
--                punpckldq MM5, MM3;
--                pmuludq MM4, MM5;
--                pshufd MM4, MM4, 8;     // ?
--                movq [ESI+8-16], MM4;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--+/
--    }
--  }
--
--    while (aptr < aend)
--        *aptr++ *= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulass_i unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = a[];
--            a[] *= c[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(b[i] * c[i]))
--                {
--                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayreal.d druntime/src/rt/arrayreal.d
---- druntime-old/src/rt/arrayreal.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arrayreal.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,241 +0,0 @@
--/**
-- * Contains SSE2 and MMX versions of certain operations for real.
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arrayreal;
--
--import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 1;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.amd3dnow amd3dnow;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--alias real T;
--
--extern (C):
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_r(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    for (int i = 0; i < a.length; i++)
--        a[i] = b[i] + c[i];
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_r unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %Lg != %Lg + %Lg\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_r(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    for (int i = 0; i < a.length; i++)
--        a[i] = b[i] - c[i];
--    return a;
--}
--
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_r unittest\n");
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %Lg != %Lg - %Lg\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[] * value
-- */
--
--T[] _arraySliceExpMulSliceMinass_r(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAddass_r(a, -value, b);
--}
--
--/***********************
-- * Computes:
-- *      a[] += b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAddass_r(T[] a, T value, T[] b)
--in
--{
--        assert(a.length == b.length);
--        assert(disjoint(a, b));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    // Handle remainder
--    while (aptr < aend)
--        *aptr++ += *bptr++ * value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAddass_r unittest\n");
--
--    cpuid = 1;
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 1; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] += a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                //printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]);
--                if (c[i] != cast(T)(b[i] + a[i] * 6))
--                {
--                    printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayshort.d druntime/src/rt/arrayshort.d
---- druntime-old/src/rt/arrayshort.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/arrayshort.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,2303 +0,0 @@
--/**
-- * Contains SSE2 and MMX versions of certain operations for wchar, short,
-- * and ushort ('u', 's' and 't' suffixes).
-- *
-- * Copyright: Copyright Digital Mars 2008 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright, based on code originally written by Burton Radons
-- *
-- *          Copyright Digital Mars 2008 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.arrayshort;
--
--private import core.cpuid;
--
--version (unittest)
--{
--    private import core.stdc.stdio : printf;
--    /* This is so unit tests will test every CPU variant
--     */
--    int cpuid;
--    const int CPUID_MAX = 4;
--    bool mmx()      { return cpuid == 1 && core.cpuid.mmx(); }
--    bool sse()      { return cpuid == 2 && core.cpuid.sse(); }
--    bool sse2()     { return cpuid == 3 && core.cpuid.sse2(); }
--    bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
--}
--else
--{
--    alias core.cpuid.mmx mmx;
--    alias core.cpuid.sse sse;
--    alias core.cpuid.sse2 sse2;
--    alias core.cpuid.sse2 sse2;
--}
--
--//version = log;
--
--bool disjoint(T)(T[] a, T[] b)
--{
--    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
--}
--
--alias short T;
--
--extern (C):
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + value
-- */
--
--T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
--{
--    return _arraySliceExpAddSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpAddSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 3343% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 3343% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                paddw MM0, MM2;
--                paddw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ + value);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpAddSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] + c[]
-- */
--
--T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceAddSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceAddSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 3777% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    movdqu XMM2, [ECX];
--                    movdqu XMM3, [ECX+16];
--                    add ECX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    movdqa XMM2, [ECX];
--                    movdqa XMM3, [ECX+16];
--                    add ECX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 2068% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                movq MM2, [ECX];
--                movq MM3, [ECX+8];
--                add ECX, 16;
--                paddw MM0, MM2;
--                paddw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ + *cptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] + b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] + b[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += value
-- */
--
--T[] _arrayExpSliceAddass_u(T[] a, T value)
--{
--    return _arrayExpSliceAddass_s(a, value);
--}
--
--T[] _arrayExpSliceAddass_t(T[] a, T value)
--{
--    return _arrayExpSliceAddass_s(a, value);
--}
--
--T[] _arrayExpSliceAddass_s(T[] a, T value)
--{
--    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 832% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= (l << 16);
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 826% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                paddw MM0, MM2;
--                paddw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceAddass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            a[] += 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(c[i] + 6))
--                {
--                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] += b[]
-- */
--
--T[] _arraySliceSliceAddass_u(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_s(a, b);
--}
--
--T[] _arraySliceSliceAddass_t(T[] a, T[] b)
--{
--    return _arraySliceSliceAddass_s(a, b);
--}
--
--T[] _arraySliceSliceAddass_s(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceAddass_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 2085% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    movdqu XMM2, [ECX];
--                    movdqu XMM3, [ECX+16];
--                    add ECX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    movdqa XMM2, [ECX];
--                    movdqa XMM3, [ECX+16];
--                    add ECX, 32;
--                    paddw XMM0, XMM2;
--                    paddw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 1022% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            start:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                movq MM2, [ECX];
--                movq MM3, [ECX+8];
--                add ECX, 16;
--                paddw MM0, MM2;
--                paddw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb start;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ += *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceAddass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] += a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(b[i] + a[i]))
--                {
--                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - value
-- */
--
--T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMinSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMinSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 3695% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 3049% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                psubw MM0, MM2;
--                psubw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ - value);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMinSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = value - b[]
-- */
--
--T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_s(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
--{
--    return _arrayExpSliceMinSliceAssign_s(a, b, value);
--}
--
--T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arrayExpSliceMinSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 4995% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= (l << 16);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--
--                    align 4;
--                startaddsse2u:
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--                    movd XMM3, l;
--                    pshufd XMM3, XMM3, 0;
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubw XMM2, XMM0;
--                    psubw XMM3, XMM1;
--                    movdqu [ESI   -32], XMM2;
--                    movdqu [ESI+16-32], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--
--                    align 4;
--                startaddsse2a:
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--                    movd XMM3, l;
--                    pshufd XMM3, XMM3, 0;
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    psubw XMM2, XMM0;
--                    psubw XMM3, XMM1;
--                    movdqa [ESI   -32], XMM2;
--                    movdqa [ESI+16-32], XMM3;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 4562% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM4, l;
--                pshufw MM4, MM4, 0;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM2, [EAX];
--                movq MM3, [EAX+8];
--                movq MM0, MM4;
--                movq MM1, MM4;
--                add EAX, 16;
--                psubw MM0, MM2;
--                psubw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(value - *bptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = 6 - a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(6 - a[i]))
--                {
--                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] - c[]
-- */
--
--T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMinSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 4129% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    movdqu XMM2, [ECX];
--                    movdqu XMM3, [ECX+16];
--                    add ECX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    movdqa XMM2, [ECX];
--                    movdqa XMM3, [ECX+16];
--                    add ECX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 2018% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                movq MM2, [ECX];
--                movq MM3, [ECX+8];
--                add ECX, 16;
--                psubw MM0, MM2;
--                psubw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ - *cptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] - b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] - b[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= value
-- */
--
--T[] _arrayExpSliceMinass_u(T[] a, T value)
--{
--    return _arrayExpSliceMinass_s(a, value);
--}
--
--T[] _arrayExpSliceMinass_t(T[] a, T value)
--{
--    return _arrayExpSliceMinass_s(a, value);
--}
--
--T[] _arrayExpSliceMinass_s(T[] a, T value)
--{
--    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 835% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= (l << 16);
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startaddsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startaddsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 835% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                psubw MM0, MM2;
--                psubw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMinass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            a[] = c[];
--            a[] -= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(c[i] - 6))
--                {
--                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] -= b[]
-- */
--
--T[] _arraySliceSliceMinass_u(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_s(a, b);
--}
--
--T[] _arraySliceSliceMinass_t(T[] a, T[] b)
--{
--    return _arraySliceSliceMinass_s(a, b);
--}
--
--T[] _arraySliceSliceMinass_s(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMinass_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 2121% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm // unaligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    movdqu XMM2, [ECX];
--                    movdqu XMM3, [ECX+16];
--                    add ECX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm // aligned case
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    movdqa XMM2, [ECX];
--                    movdqa XMM3, [ECX+16];
--                    add ECX, 32;
--                    psubw XMM0, XMM2;
--                    psubw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--        }
--        else
--        // MMX version is 1116% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            start:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                movq MM2, [ECX];
--                movq MM3, [ECX+8];
--                add ECX, 16;
--                psubw MM0, MM2;
--                psubw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb start;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ -= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMinass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = c[];
--            c[] -= a[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(b[i] - a[i]))
--                {
--                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * value
-- */
--
--T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
--{
--    return _arraySliceExpMulSliceAssign_s(a, value, b);
--}
--
--T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
--in
--{
--    assert(a.length == b.length);
--    assert(disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceExpMulSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 3733% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= l << 16;
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM1, [EAX+16];
--                    add EAX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM1, [EAX+16];
--                    add EAX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                }
--            }
--        }
--        else
--        // MMX version is 3733% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM1, [EAX+8];
--                add EAX, 16;
--                pmullw MM0, MM2;
--                pmullw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ * value);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceExpMulSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * 6))
--                {
--                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] = b[] * c[]
-- */
--
--T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMulSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
--{
--    return _arraySliceSliceMulSliceAssign_s(a, c, b);
--}
--
--T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
--in
--{
--        assert(a.length == b.length && b.length == c.length);
--        assert(disjoint(a, b));
--        assert(disjoint(a, c));
--        assert(disjoint(b, c));
--}
--body
--{
--    //printf("_arraySliceSliceMulSliceAssign_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--    auto cptr = c.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 2515% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2u:
--                    add ESI, 32;
--                    movdqu XMM0, [EAX];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [EAX+16];
--                    movdqu XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov EAX, bptr;
--                    mov ECX, cptr;
--
--                    align 4;
--                startsse2a:
--                    add ESI, 32;
--                    movdqa XMM0, [EAX];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [EAX+16];
--                    movdqa XMM3, [ECX+16];
--                    add EAX, 32;
--                    add ECX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, EAX;
--                    mov cptr, ECX;
--               }
--            }
--        }
--        else
--        // MMX version is 2515% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov EAX, bptr;
--                mov ECX, cptr;
--
--                align 4;
--            startmmx:
--                add ESI, 16;
--                movq MM0, [EAX];
--                movq MM2, [ECX];
--                movq MM1, [EAX+8];
--                movq MM3, [ECX+8];
--                add EAX, 16;
--                add ECX, 16;
--                pmullw MM0, MM2;
--                pmullw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, EAX;
--                mov cptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ = cast(T)(*bptr++ * *cptr++);
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulSliceAssign_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            c[] = a[] * b[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (c[i] != cast(T)(a[i] * b[i]))
--                {
--                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= value
-- */
--
--T[] _arrayExpSliceMulass_u(T[] a, T value)
--{
--    return _arrayExpSliceMulass_s(a, value);
--}
--
--T[] _arrayExpSliceMulass_t(T[] a, T value)
--{
--    return _arrayExpSliceMulass_s(a, value);
--}
--
--T[] _arrayExpSliceMulass_s(T[] a, T value)
--{
--    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 2044% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            uint l = cast(ushort) value;
--            l |= l << 16;
--
--            if (((cast(uint) aptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM1, [ESI+16];
--                    add ESI, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM2;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    movd XMM2, l;
--                    pshufd XMM2, XMM2, 0;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM1, [ESI+16];
--                    add ESI, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM2;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                }
--            }
--        }
--        else
--        // MMX version is 2056% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            uint l = cast(ushort) value;
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                movd MM2, l;
--                pshufw MM2, MM2, 0;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM1, [ESI+8];
--                add ESI, 16;
--                pmullw MM0, MM2;
--                pmullw MM1, MM2;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ *= value;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arrayExpSliceMulass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = a[];
--            a[] *= 6;
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(b[i] * 6))
--                {
--                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
--
--
--/* ======================================================================== */
--
--/***********************
-- * Computes:
-- *      a[] *= b[]
-- */
--
--T[] _arraySliceSliceMulass_u(T[] a, T[] b)
--{
--    return _arraySliceSliceMulass_s(a, b);
--}
--
--T[] _arraySliceSliceMulass_t(T[] a, T[] b)
--{
--    return _arraySliceSliceMulass_s(a, b);
--}
--
--T[] _arraySliceSliceMulass_s(T[] a, T[] b)
--in
--{
--    assert (a.length == b.length);
--    assert (disjoint(a, b));
--}
--body
--{
--    //printf("_arraySliceSliceMulass_s()\n");
--    auto aptr = a.ptr;
--    auto aend = aptr + a.length;
--    auto bptr = b.ptr;
--
--    version (D_InlineAsm_X86)
--    {
--        // SSE2 aligned version is 2519% faster
--        if (sse2() && a.length >= 16)
--        {
--            auto n = aptr + (a.length & ~15);
--
--            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2u:
--                    movdqu XMM0, [ESI];
--                    movdqu XMM2, [ECX];
--                    movdqu XMM1, [ESI+16];
--                    movdqu XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM3;
--                    movdqu [ESI   -32], XMM0;
--                    movdqu [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2u;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--                }
--            }
--            else
--            {
--                asm
--                {
--                    mov ESI, aptr;
--                    mov EDI, n;
--                    mov ECX, bptr;
--
--                    align 4;
--                startsse2a:
--                    movdqa XMM0, [ESI];
--                    movdqa XMM2, [ECX];
--                    movdqa XMM1, [ESI+16];
--                    movdqa XMM3, [ECX+16];
--                    add ESI, 32;
--                    add ECX, 32;
--                    pmullw XMM0, XMM2;
--                    pmullw XMM1, XMM3;
--                    movdqa [ESI   -32], XMM0;
--                    movdqa [ESI+16-32], XMM1;
--                    cmp ESI, EDI;
--                    jb startsse2a;
--
--                    mov aptr, ESI;
--                    mov bptr, ECX;
--               }
--            }
--        }
--        else
--        // MMX version is 1712% faster
--        if (mmx() && a.length >= 8)
--        {
--            auto n = aptr + (a.length & ~7);
--
--            asm
--            {
--                mov ESI, aptr;
--                mov EDI, n;
--                mov ECX, bptr;
--
--                align 4;
--            startmmx:
--                movq MM0, [ESI];
--                movq MM2, [ECX];
--                movq MM1, [ESI+8];
--                movq MM3, [ECX+8];
--                add ESI, 16;
--                add ECX, 16;
--                pmullw MM0, MM2;
--                pmullw MM1, MM3;
--                movq [ESI  -16], MM0;
--                movq [ESI+8-16], MM1;
--                cmp ESI, EDI;
--                jb startmmx;
--
--                emms;
--                mov aptr, ESI;
--                mov bptr, ECX;
--            }
--        }
--    }
--
--    while (aptr < aend)
--        *aptr++ *= *bptr++;
--
--    return a;
--}
--
--unittest
--{
--    printf("_arraySliceSliceMulass_s unittest\n");
--
--    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
--    {
--        version (log) printf("    cpuid %d\n", cpuid);
--
--        for (int j = 0; j < 2; j++)
--        {
--            const int dim = 67;
--            T[] a = new T[dim + j];     // aligned on 16 byte boundary
--            a = a[j .. dim + j];        // misalign for second iteration
--            T[] b = new T[dim + j];
--            b = b[j .. dim + j];
--            T[] c = new T[dim + j];
--            c = c[j .. dim + j];
--
--            for (int i = 0; i < dim; i++)
--            {   a[i] = cast(T)i;
--                b[i] = cast(T)(i + 7);
--                c[i] = cast(T)(i * 2);
--            }
--
--            b[] = a[];
--            a[] *= c[];
--
--            for (int i = 0; i < dim; i++)
--            {
--                if (a[i] != cast(T)(b[i] * c[i]))
--                {
--                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
--                    assert(0);
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh.c druntime/src/rt/deh.c
---- druntime-old/src/rt/deh.c	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/deh.c	1970-01-01 03:00:00.000000000 +0300
-@@ -1,734 +0,0 @@
--/**
-- * Implementation of exception handling support routines for Windows.
-- *
-- * Copyright: Copyright Digital Mars 1999 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright
-- *
-- *          Copyright Digital Mars 1999 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--#include        <stdio.h>
--#include        <string.h>
--#include        <assert.h>
--#include        <stdlib.h>
--
--/* ======================== Win32 =============================== */
--
--#if _WIN32
--
--#include        <excpt.h>
--#include        <windows.h>
--
--//#include      "\sc\src\include\ehsup.h"
--
--/*** From Digital Mars C runtime library ***/
--EXCEPTION_DISPOSITION __cdecl _local_except_handler (EXCEPTION_RECORD *ExceptionRecord,
--    void* EstablisherFrame,
--        void *ContextRecord,
--        void *DispatcherContext
--        );
--void __cdecl _global_unwind(void *frame,EXCEPTION_RECORD *eRecord);
--#define EXCEPTION_UNWIND  6  // Flag to indicate if the system is unwinding
--
--extern DWORD _except_list;
--/*** ***/
--
--#include        "mars.h"
--
--extern ClassInfo D6object9Throwable7__ClassZ;
--#define _Class_9Throwable D6object9Throwable7__ClassZ;
--
--extern ClassInfo D6object5Error7__ClassZ;
--#define _Class_5Error D6object5Error7__ClassZ
--
--typedef int (__pascal *fp_t)();   // function pointer in ambient memory model
--
--void _d_setunhandled(Object*);
--
--// The layout of DEstablisherFrame is the same for C++
--
--struct DEstablisherFrame
--{
--    void *prev;                 // pointer to previous exception list
--    void *handler;              // pointer to routine for exception handler
--    DWORD table_index;          // current index into handler_info[]
--    DWORD ebp;                  // this is EBP of routine
--};
--
--struct DHandlerInfo
--{
--    int prev_index;             // previous table index
--    unsigned cioffset;          // offset to DCatchInfo data from start of table (!=0 if try-catch)
--    void *finally_code;         // pointer to finally code to execute
--                                // (!=0 if try-finally)
--};
--
--// Address of DHandlerTable is passed in EAX to _d_framehandler()
--
--struct DHandlerTable
--{
--    void *fptr;                 // pointer to start of function
--    unsigned espoffset;         // offset of ESP from EBP
--    unsigned retoffset;         // offset from start of function to return code
--    struct DHandlerInfo handler_info[1];
--};
--
--struct DCatchBlock
--{
--    ClassInfo *type;            // catch type
--    unsigned bpoffset;          // EBP offset of catch var
--    void *code;                 // catch handler code
--};
--
--// Create one of these for each try-catch
--struct DCatchInfo
--{
--    unsigned ncatches;                  // number of catch blocks
--    struct DCatchBlock catch_block[1];  // data for each catch block
--};
--
--// Macro to make our own exception code
--#define MAKE_EXCEPTION_CODE(severity, facility, exception)      \
--        (((severity) << 30) | (1 << 29) | (0 << 28) | ((facility) << 16) | (exception))
--
--#define STATUS_DIGITAL_MARS_D_EXCEPTION         MAKE_EXCEPTION_CODE(3,'D',1)
--
--Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record);
--void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, struct DEstablisherFrame *frame, int stop_index);
--
--
--/***********************************
-- * The frame handler, this is called for each frame that has been registered
-- * in the OS except_list.
-- * Input:
-- *      EAX     the handler table for the frame
-- */
--
--EXCEPTION_DISPOSITION _d_framehandler(
--            EXCEPTION_RECORD *exception_record,
--            struct DEstablisherFrame *frame,
--            CONTEXT *context,
--            void *dispatcher_context)
--{
--    struct DHandlerTable *handler_table;
--
--    __asm { mov handler_table,EAX }
--
--    if (exception_record->ExceptionFlags & EXCEPTION_UNWIND)
--    {
--         // Call all the finally blocks in this frame
--         _d_local_unwind(handler_table, frame, -1);
--    }
--    else
--    {
--        // Jump to catch block if matching one is found
--
--        int ndx,prev_ndx,i;
--        struct DHandlerInfo *phi;
--        struct DCatchInfo *pci;
--        struct DCatchBlock *pcb;
--        unsigned ncatches;              // number of catches in the current handler
--        Object *pti;
--        ClassInfo *ci;
--
--        ci = NULL;                      // only compute it if we need it
--
--        // walk through handler table, checking each handler
--        // with an index smaller than the current table_index
--        for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx)
--        {
--            phi = &handler_table->handler_info[ndx];
--            prev_ndx = phi->prev_index;
--            if (phi->cioffset)
--            {
--                // this is a catch handler (no finally)
--                pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset);
--                ncatches = pci->ncatches;
--                for (i = 0; i < ncatches; i++)
--                {
--                    pcb = &pci->catch_block[i];
--
--                    if (!ci)
--                    {
--                        // This code must match the translation code
--                        if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION)
--                        {
--                            //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]);
--                            ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]);
--                        }
--                        else
--                            ci = &_Class_9Throwable;
--                    }
--
--                    if (_d_isbaseof(ci, pcb->type))
--                    {
--                        // Matched the catch type, so we've found the handler.
--                        int regebp;
--
--                        pti = _d_translate_se_to_d_exception(exception_record);
--
--                        // Initialize catch variable
--                        regebp = (int)&frame->ebp;              // EBP for this frame
--                        *(void **)(regebp + (pcb->bpoffset)) = pti;
--
--                        _d_setunhandled(pti);
--
--                        // Have system call all finally blocks in intervening frames
--                        _global_unwind(frame, exception_record);
--
--                        // Call all the finally blocks skipped in this frame
--                        _d_local_unwind(handler_table, frame, ndx);
--
--                        _d_setunhandled(NULL);
--
--                        frame->table_index = prev_ndx;  // we are out of this handler
--
--                        // Jump to catch block. Does not return.
--                        {
--                            unsigned catch_esp;
--                            fp_t catch_addr;
--
--                            catch_addr = (fp_t)(pcb->code);
--                            catch_esp = regebp - handler_table->espoffset - sizeof(fp_t);
--                            _asm
--                            {
--                                mov     EAX,catch_esp
--                                mov     ECX,catch_addr
--                                mov     [EAX],ECX
--                                mov     EBP,regebp
--                                mov     ESP,EAX         // reset stack
--                                ret                     // jump to catch block
--                            }
--                        }
--                    }
--                }
--            }
--        }
--    }
--    return ExceptionContinueSearch;
--}
--
--/***********************************
-- * Exception filter for use in __try..__except block
-- * surrounding call to Dmain()
-- */
--
--int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs,
--                        int retval,
--                        Object **exception_object)
--{
--    *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord);
--    return retval;
--}
--
--/***********************************
-- * Throw a D object.
-- */
--
--void __stdcall _d_throw(Object *h)
--{
--    //printf("_d_throw(h = %p, &h = %p)\n", h, &h);
--    //printf("\tvptr = %p\n", *(void **)h);
--    RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION,
--                   EXCEPTION_NONCONTINUABLE,
--                   1, (DWORD *)&h);
--}
--
--/***********************************
-- * Create an exception object
-- */
--
--Object *_d_create_exception_object(ClassInfo *ci, char *msg)
--{
--    Throwable *exc;
--
--    exc = (Throwable *)_d_newclass(ci);
--    // BUG: what if _d_newclass() throws an out of memory exception?
--
--    if (msg)
--    {
--        exc->msglen = strlen(msg);
--        exc->msg = msg;
--    }
--    return (Object *)exc;
--}
--
--/***********************************
-- * Converts a Windows Structured Exception code to a D Exception Object.
-- */
--
--Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record)
--{
--    Object *pti;
--
--    switch (exception_record->ExceptionCode) {
--        case STATUS_DIGITAL_MARS_D_EXCEPTION:
--            // Generated D exception
--            pti = (Object *)(exception_record->ExceptionInformation[0]);
--            break;
--
--        case STATUS_INTEGER_DIVIDE_BY_ZERO:
--            pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero");
--            break;
--
--        case STATUS_FLOAT_DIVIDE_BY_ZERO:
--            pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero");
--            break;
--
--        case STATUS_ACCESS_VIOLATION:
--            pti = _d_create_exception_object(&_Class_5Error, "Access Violation");
--            break;
--
--        case STATUS_STACK_OVERFLOW:
--            pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow");
--            break;
--
--        case STATUS_DATATYPE_MISALIGNMENT:
--            pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment");
--            break;
--
--        case STATUS_ARRAY_BOUNDS_EXCEEDED:
--            pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded");
--            break;
--
--        case STATUS_FLOAT_INVALID_OPERATION:
--            pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation");
--            break;
--
--        case STATUS_FLOAT_DENORMAL_OPERAND:
--            pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand");
--            break;
--
--        case STATUS_FLOAT_INEXACT_RESULT:
--            pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result");
--            break;
--
--        case STATUS_FLOAT_OVERFLOW:
--            pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow");
--            break;
--
--        case STATUS_FLOAT_UNDERFLOW:
--            pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow");
--            break;
--
--        case STATUS_FLOAT_STACK_CHECK:
--            pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check");
--            break;
--
--        case STATUS_PRIVILEGED_INSTRUCTION:
--            if (*((unsigned char *)(exception_record->ExceptionAddress))==0xF4) { // HLT
--                pti = _d_create_exception_object(&_Class_5Error, "assert(0) or HLT instruction");
--            } else {
--                pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction");
--            }
--            break;
--
--        case STATUS_ILLEGAL_INSTRUCTION:
--            pti = _d_create_exception_object(&_Class_5Error, "Illegal Instruction");
--            break;
--
--        case STATUS_BREAKPOINT:
--            pti = _d_create_exception_object(&_Class_5Error, "Breakpoint");
--            break;
--
--        case STATUS_IN_PAGE_ERROR:
--            pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception");
--            break;
--/*
--        case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor
--        case STATUS_INVALID_DISPOSITION:
--        case STATUS_NONCONTINUABLE_EXCEPTION:
--        case STATUS_SINGLE_STEP:
--		case DBG_CONTROL_C: // only when a debugger is attached
--        // In DMC, but not in Microsoft docs
--        case STATUS_GUARD_PAGE_VIOLATION:
--        case STATUS_INVALID_HANDLE:
--*/
--        // convert all other exception codes into a Win32Exception
--        default:
--            pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception");
--            break;
--    }
--
--    return pti;
--}
--
--/**************************************
-- * Call finally blocks in the current stack frame until stop_index.
-- * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c
-- */
--
--void __cdecl _d_local_unwind(struct DHandlerTable *handler_table,
--        struct DEstablisherFrame *frame, int stop_index)
--{
--    struct DHandlerInfo *phi;
--    struct DCatchInfo *pci;
--    int i;
--
--    // Set up a special exception handler to catch double-fault exceptions.
--    __asm
--    {
--        push    dword ptr -1
--        push    dword ptr 0
--        push    offset _local_except_handler    // defined in src\win32\ehsup.c
--        push    dword ptr fs:_except_list
--        mov     FS:_except_list,ESP
--    }
--
--    for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index)
--    {
--        phi = &handler_table->handler_info[i];
--        if (phi->finally_code)
--        {
--            // Note that it is unnecessary to adjust the ESP, as the finally block
--            // accesses all items on the stack as relative to EBP.
--
--            DWORD *catch_ebp = &frame->ebp;
--            void *blockaddr = phi->finally_code;
--
--            _asm
--            {
--                push    EBX
--                mov     EBX,blockaddr
--                push    EBP
--                mov     EBP,catch_ebp
--                call    EBX
--                pop     EBP
--                pop     EBX
--            }
--        }
--    }
--
--    _asm
--    {
--        pop     FS:_except_list
--        add     ESP,12
--    }
--}
--
--/***********************************
-- * external version of the unwinder
-- */
--
--__declspec(naked) void __cdecl _d_local_unwind2()
--{
--    __asm
--    {
--        jmp     _d_local_unwind
--    }
--}
--
--/***********************************
-- * The frame handler, this is called for each frame that has been registered
-- * in the OS except_list.
-- * Input:
-- *      EAX     the handler table for the frame
-- */
--
--EXCEPTION_DISPOSITION _d_monitor_handler(
--            EXCEPTION_RECORD *exception_record,
--            struct DEstablisherFrame *frame,
--            CONTEXT *context,
--            void *dispatcher_context)
--{
--    if (exception_record->ExceptionFlags & EXCEPTION_UNWIND)
--    {
--        _d_monitorexit((Object *)frame->table_index);
--    }
--    else
--    {
--    }
--    return ExceptionContinueSearch;
--}
--
--/***********************************
-- */
--
--void _d_monitor_prolog(void *x, void *y, Object *h)
--{
--    __asm
--    {
--        push    EAX
--    }
--    //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h);
--    _d_monitorenter(h);
--    __asm
--    {
--        pop     EAX
--    }
--}
--
--/***********************************
-- */
--
--void _d_monitor_epilog(void *x, void *y, Object *h)
--{
--    //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h);
--    __asm
--    {
--        push    EAX
--        push    EDX
--    }
--    _d_monitorexit(h);
--    __asm
--    {
--        pop     EDX
--        pop     EAX
--    }
--}
--
--#endif
--
--/* ======================== linux =============================== */
--
--#if linux
--
--#include        "mars.h"
--
--extern ClassInfo D6object9Throwable7__ClassZ;
--#define _Class_9Throwable D6object9Throwable7__ClassZ;
--
--extern ClassInfo D6object5Error7__ClassZ;
--#define _Class_5Error D6object5Error7__ClassZ
--
--typedef int (*fp_t)();   // function pointer in ambient memory model
--
--struct DHandlerInfo
--{
--    unsigned offset;            // offset from function address to start of guarded section
--    int prev_index;             // previous table index
--    unsigned cioffset;          // offset to DCatchInfo data from start of table (!=0 if try-catch)
--    void *finally_code;         // pointer to finally code to execute
--                                // (!=0 if try-finally)
--};
--
--// Address of DHandlerTable, searched for by eh_finddata()
--
--struct DHandlerTable
--{
--    void *fptr;                 // pointer to start of function
--    unsigned espoffset;         // offset of ESP from EBP
--    unsigned retoffset;         // offset from start of function to return code
--    unsigned nhandlers;         // dimension of handler_info[]
--    struct DHandlerInfo handler_info[1];
--};
--
--struct DCatchBlock
--{
--    ClassInfo *type;            // catch type
--    unsigned bpoffset;          // EBP offset of catch var
--    void *code;                 // catch handler code
--};
--
--// Create one of these for each try-catch
--struct DCatchInfo
--{
--    unsigned ncatches;                  // number of catch blocks
--    struct DCatchBlock catch_block[1];  // data for each catch block
--};
--
--// One of these is generated for each function with try-catch or try-finally
--
--struct FuncTable
--{
--    void *fptr;                 // pointer to start of function
--    struct DHandlerTable *handlertable; // eh data for this function
--    unsigned size;              // size of function in bytes
--};
--
--extern struct FuncTable *table_start;
--extern struct FuncTable *table_end;
--
--void terminate()
--{
--//    _asm
--//    {
--//      hlt
--//    }
--}
--
--/*******************************************
-- * Given address that is inside a function,
-- * figure out which function it is in.
-- * Return DHandlerTable if there is one, NULL if not.
-- */
--
--struct DHandlerTable *__eh_finddata(void *address)
--{
--    struct FuncTable *ft;
--
--    for (ft = (struct FuncTable *)table_start;
--         ft < (struct FuncTable *)table_end;
--         ft++)
--    {
--        if (ft->fptr <= address &&
--            address < (void *)((char *)ft->fptr + ft->size))
--        {
--            return ft->handlertable;
--        }
--    }
--    return NULL;
--}
--
--
--/******************************
-- * Given EBP, find return address to caller, and caller's EBP.
-- * Input:
-- *   regbp       Value of EBP for current function
-- *   *pretaddr   Return address
-- * Output:
-- *   *pretaddr   return address to caller
-- * Returns:
-- *   caller's EBP
-- */
--
--unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr)
--{
--    unsigned bp = *(unsigned *)regbp;
--
--    if (bp)         // if not end of call chain
--    {
--        // Perform sanity checks on new EBP.
--        // If it is screwed up, terminate() hopefully before we do more damage.
--        if (bp <= regbp)
--            // stack should grow to smaller values
--            terminate();
--
--        *pretaddr = *(unsigned *)(regbp + sizeof(int));
--    }
--    return bp;
--}
--
--/***********************************
-- * Throw a D object.
-- */
--
--void __stdcall _d_throw(Object *h)
--{
--    unsigned regebp;
--
--    //printf("_d_throw(h = %p, &h = %p)\n", h, &h);
--    //printf("\tvptr = %p\n", *(void **)h);
--
--    regebp = _EBP;
--
--    while (1)           // for each function on the stack
--    {
--        struct DHandlerTable *handler_table;
--        struct FuncTable *pfunc;
--        struct DHandlerInfo *phi;
--        unsigned retaddr;
--        unsigned funcoffset;
--        unsigned spoff;
--        unsigned retoffset;
--        int index;
--        int dim;
--        int ndx;
--        int prev_ndx;
--
--        regebp = __eh_find_caller(regebp,&retaddr);
--        if (!regebp)
--            // if end of call chain
--            break;
--
--        handler_table = __eh_finddata((void *)retaddr);   // find static data associated with function
--        if (!handler_table)         // if no static data
--        {
--            continue;
--        }
--        funcoffset = (unsigned)handler_table->fptr;
--        spoff = handler_table->espoffset;
--        retoffset = handler_table->retoffset;
--
--#ifdef DEBUG
--        printf("retaddr = x%x\n",(unsigned)retaddr);
--        printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n",
--        regebp,funcoffset,spoff,retoffset);
--#endif
--
--        // Find start index for retaddr in static data
--        dim = handler_table->nhandlers;
--        index = -1;
--        for (int i = 0; i < dim; i++)
--        {
--            phi = &handler_table->handler_info[i];
--
--            if ((unsigned)retaddr >= funcoffset + phi->offset)
--                index = i;
--        }
--
--        // walk through handler table, checking each handler
--        // with an index smaller than the current table_index
--        for (ndx = index; ndx != -1; ndx = prev_ndx)
--        {
--            phi = &handler_table->handler_info[ndx];
--            prev_ndx = phi->prev_index;
--            if (phi->cioffset)
--            {
--                // this is a catch handler (no finally)
--                struct DCatchInfo *pci;
--                int ncatches;
--                int i;
--
--                pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset);
--                ncatches = pci->ncatches;
--                for (i = 0; i < ncatches; i++)
--                {
--                    struct DCatchBlock *pcb;
--                    ClassInfo *ci = **(ClassInfo ***)h;
--
--                    pcb = &pci->catch_block[i];
--
--                    if (_d_isbaseof(ci, pcb->type))
--                    {   // Matched the catch type, so we've found the handler.
--
--                        // Initialize catch variable
--                        *(void **)(regebp + (pcb->bpoffset)) = h;
--
--                        // Jump to catch block. Does not return.
--                        {
--                            unsigned catch_esp;
--                            fp_t catch_addr;
--
--                            catch_addr = (fp_t)(pcb->code);
--                            catch_esp = regebp - handler_table->espoffset - sizeof(fp_t);
--                            _asm
--                            {
--                                mov     EAX,catch_esp
--                                mov     ECX,catch_addr
--                                mov     [EAX],ECX
--                                mov     EBP,regebp
--                                mov     ESP,EAX         // reset stack
--                                ret                     // jump to catch block
--                            }
--                        }
--                    }
--                }
--            }
--            else if (phi->finally_code)
--            {   // Call finally block
--                // Note that it is unnecessary to adjust the ESP, as the finally block
--                // accesses all items on the stack as relative to EBP.
--
--                void *blockaddr = phi->finally_code;
--
--                _asm
--                {
--                    push        EBX
--                    mov         EBX,blockaddr
--                    push        EBP
--                    mov         EBP,regebp
--                    call        EBX
--                    pop         EBP
--                    pop         EBX
--                }
--            }
--        }
--    }
--}
--
--
--#endif
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh2.d druntime/src/rt/deh2.d
---- druntime-old/src/rt/deh2.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/deh2.d	1970-01-01 03:00:00.000000000 +0300
-@@ -1,322 +0,0 @@
--/**
-- * Implementation of exception handling support routines for Posix.
-- *
-- * Copyright: Copyright Digital Mars 2000 - 2009.
-- * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
-- * Authors:   Walter Bright
-- *
-- *          Copyright Digital Mars 2000 - 2009.
-- * Distributed under the Boost Software License, Version 1.0.
-- *    (See accompanying file LICENSE_1_0.txt or copy at
-- *          http://www.boost.org/LICENSE_1_0.txt)
-- */
--module rt.deh2;
--
--//debug=1;
--
--extern (C)
--{
--    extern __gshared
--    {
--        void* _deh_beg;
--        void* _deh_end;
--    }
--
--    int _d_isbaseof(ClassInfo oc, ClassInfo c);
--    
--    void _d_setunhandled(Object* o);
--}
--
--alias int (*fp_t)();   // function pointer in ambient memory model
--
--struct DHandlerInfo
--{
--    uint offset;                // offset from function address to start of guarded section
--    uint endoffset;             // offset of end of guarded section
--    int prev_index;             // previous table index
--    uint cioffset;              // offset to DCatchInfo data from start of table (!=0 if try-catch)
--    void *finally_code;         // pointer to finally code to execute
--                                // (!=0 if try-finally)
--}
--
--// Address of DHandlerTable, searched for by eh_finddata()
--
--struct DHandlerTable
--{
--    void *fptr;                 // pointer to start of function
--    uint espoffset;             // offset of ESP from EBP
--    uint retoffset;             // offset from start of function to return code
--    uint nhandlers;             // dimension of handler_info[]
--    DHandlerInfo handler_info[1];
--}
--
--struct DCatchBlock
--{
--    ClassInfo type;             // catch type
--    uint bpoffset;              // EBP offset of catch var
--    void *code;                 // catch handler code
--}
--
--// Create one of these for each try-catch
--struct DCatchInfo
--{
--    uint ncatches;                      // number of catch blocks
--    DCatchBlock catch_block[1];         // data for each catch block
--}
--
--// One of these is generated for each function with try-catch or try-finally
--
--struct FuncTable
--{
--    void *fptr;                 // pointer to start of function
--    DHandlerTable *handlertable; // eh data for this function
--    uint fsize;         // size of function in bytes
--}
--
--void terminate()
--{
--    asm
--    {
--        hlt ;
--    }
--}
--
--/*******************************************
-- * Given address that is inside a function,
-- * figure out which function it is in.
-- * Return DHandlerTable if there is one, NULL if not.
-- */
--
--DHandlerTable *__eh_finddata(void *address)
--{
--    FuncTable *ft;
--
--//    debug printf("__eh_finddata(address = x%x)\n", address);
--//    debug printf("_deh_beg = x%x, _deh_end = x%x\n", &_deh_beg, &_deh_end);
--    for (ft = cast(FuncTable *)&_deh_beg;
--         ft < cast(FuncTable *)&_deh_end;
--         ft++)
--    {
--//      debug printf("\tfptr = x%x, fsize = x%03x, handlertable = x%x\n",
--//              ft.fptr, ft.fsize, ft.handlertable);
--
--        if (ft.fptr <= address &&
--            address < cast(void *)(cast(char *)ft.fptr + ft.fsize))
--        {
--//          debug printf("\tfound handler table\n");
--            return ft.handlertable;
--        }
--    }
--//    debug printf("\tnot found\n");
--    return null;
--}
--
--
--/******************************
-- * Given EBP, find return address to caller, and caller's EBP.
-- * Input:
-- *   regbp       Value of EBP for current function
-- *   *pretaddr   Return address
-- * Output:
-- *   *pretaddr   return address to caller
-- * Returns:
-- *   caller's EBP
-- */
--
--uint __eh_find_caller(uint regbp, uint *pretaddr)
--{
--    uint bp = *cast(uint *)regbp;
--
--    if (bp)         // if not end of call chain
--    {
--        // Perform sanity checks on new EBP.
--        // If it is screwed up, terminate() hopefully before we do more damage.
--        if (bp <= regbp)
--            // stack should grow to smaller values
--            terminate();
--
--        *pretaddr = *cast(uint *)(regbp + int.sizeof);
--    }
--    return bp;
--}
--
--/***********************************
-- * Throw a D object.
-- */
--
--extern (Windows) void _d_throw(Object *h)
--{
--    uint regebp;
--
--    debug
--    {
--        printf("_d_throw(h = %p, &h = %p)\n", h, &h);
--        printf("\tvptr = %p\n", *cast(void **)h);
--    }
--
--    asm
--    {
--        mov regebp,EBP  ;
--    }
--    
--    _d_setunhandled(h);
--
--//static uint abc;
--//if (++abc == 2) *(char *)0=0;
--
--//int count = 0;
--    while (1)           // for each function on the stack
--    {
--        DHandlerTable *handler_table;
--        FuncTable *pfunc;
--        DHandlerInfo *phi;
--        uint retaddr;
--        uint funcoffset;
--        uint spoff;
--        uint retoffset;
--        int index;
--        int dim;
--        int ndx;
--        int prev_ndx;
--
--        regebp = __eh_find_caller(regebp,&retaddr);
--        if (!regebp)
--        {   // if end of call chain
--            debug printf("end of call chain\n");
--            break;
--        }
--
--        debug printf("found caller, EBP = x%x, retaddr = x%x\n", regebp, retaddr);
--//if (++count == 12) *(char*)0=0;
--        handler_table = __eh_finddata(cast(void *)retaddr);   // find static data associated with function
--        if (!handler_table)         // if no static data
--        {
--            debug printf("no handler table\n");
--            continue;
--        }
--        funcoffset = cast(uint)handler_table.fptr;
--        spoff = handler_table.espoffset;
--        retoffset = handler_table.retoffset;
--
--        debug
--        {
--            printf("retaddr = x%x\n",cast(uint)retaddr);
--            printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n",
--            regebp,funcoffset,spoff,retoffset);
--        }
--
--        // Find start index for retaddr in static data
--        dim = handler_table.nhandlers;
--
--        debug
--        {
--            printf("handler_info[]:\n");
--            for (int i = 0; i < dim; i++)
--            {
--                phi = &handler_table.handler_info[i];
--                printf("\t[%d]: offset = x%04x, endoffset = x%04x, prev_index = %d, cioffset = x%04x, finally_code = %x\n",
--                        i, phi.offset, phi.endoffset, phi.prev_index, phi.cioffset, phi.finally_code);
--            }
--        }
--
--        index = -1;
--        for (int i = 0; i < dim; i++)
--        {
--            phi = &handler_table.handler_info[i];
--
--            debug printf("i = %d, phi.offset = %04x\n", i, funcoffset + phi.offset);
--            if (cast(uint)retaddr > funcoffset + phi.offset &&
--                cast(uint)retaddr <= funcoffset + phi.endoffset)
--                index = i;
--        }
--        debug printf("index = %d\n", index);
--
--        // walk through handler table, checking each handler
--        // with an index smaller than the current table_index
--        for (ndx = index; ndx != -1; ndx = prev_ndx)
--        {
--            phi = &handler_table.handler_info[ndx];
--            prev_ndx = phi.prev_index;
--            if (phi.cioffset)
--            {
--                // this is a catch handler (no finally)
--                DCatchInfo *pci;
--                int ncatches;
--                int i;
--
--                pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset);
--                ncatches = pci.ncatches;
--                for (i = 0; i < ncatches; i++)
--                {
--                    DCatchBlock *pcb;
--                    ClassInfo ci = **cast(ClassInfo **)h;
--
--                    pcb = &pci.catch_block[i];
--
--                    if (_d_isbaseof(ci, pcb.type))
--                    {   // Matched the catch type, so we've found the handler.
--                    
--                        _d_setunhandled(null);
--
--                        // Initialize catch variable
--                        *cast(void **)(regebp + (pcb.bpoffset)) = h;
--
--                        // Jump to catch block. Does not return.
--                        {
--                            uint catch_esp;
--                            fp_t catch_addr;
--
--                            catch_addr = cast(fp_t)(pcb.code);
--                            catch_esp = regebp - handler_table.espoffset - fp_t.sizeof;
--                            asm
--                            {
--                                mov     EAX,catch_esp   ;
--                                mov     ECX,catch_addr  ;
--                                mov     [EAX],ECX       ;
--                                mov     EBP,regebp      ;
--                                mov     ESP,EAX         ; // reset stack
--                                ret                     ; // jump to catch block
--                            }
--                        }
--                    }
--                }
--            }
--            else if (phi.finally_code)
--            {   // Call finally block
--                // Note that it is unnecessary to adjust the ESP, as the finally block
--                // accesses all items on the stack as relative to EBP.
--
--                void *blockaddr = phi.finally_code;
--
--                version (OSX)
--                {
--                    asm
--                    {
--                        sub     ESP,4           ; // align stack to 16
--                        push    EBX             ;
--                        mov     EBX,blockaddr   ;
--                        push    EBP             ;
--                        mov     EBP,regebp      ;
--                        call    EBX             ;
--                        pop     EBP             ;
--                        pop     EBX             ;
--                        add     ESP,4           ;
--                    }
--                }
--                else
--                {
--                    asm
--                    {
--                        push        EBX             ;
--                        mov         EBX,blockaddr   ;
--                        push        EBP             ;
--                        mov         EBP,regebp      ;
--                        call        EBX             ;
--                        pop         EBP             ;
--                        pop         EBX             ;
--                    }
--                }
--            }
--        }
--    }
--}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/eh.d druntime/src/rt/eh.d
---- druntime-old/src/rt/eh.d	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/src/rt/eh.d	2010-10-03 18:29:58.099624002 +0400
-@@ -0,0 +1,428 @@
-+/**
-+ * This module contains functions and structures required for
-+ * exception handling.
-+ */
-+module eh;
-+
-+private import core.stdc.stdio;
-+private import core.stdc.stdlib;
-+private import rt.util.console;
-+private import ldc.cstdarg;
-+
-+// debug = EH_personality;
-+// debug = EH_personality_verbose;
-+
-+// current EH implementation works on x86
-+// if it has a working unwind runtime
-+version(X86) {
-+    version(linux) version=X86_UNWIND;
-+    version(darwin) version=X86_UNWIND;
-+    version(solaris) version=X86_UNWIND;
-+}
-+version(X86_64) {
-+    version(linux) version=X86_UNWIND;
-+    version(darwin) version=X86_UNWIND;
-+    version(solaris) version=X86_UNWIND;
-+}
-+
-+//version = HP_LIBUNWIND;
-+
-+// D runtime functions
-+extern(C) {
-+    int _d_isbaseof(ClassInfo oc, ClassInfo c);
-+}
-+
-+// libunwind headers
-+extern(C)
-+{
-+    enum _Unwind_Reason_Code : int
-+    {
-+        NO_REASON = 0,
-+        FOREIGN_EXCEPTION_CAUGHT = 1,
-+        FATAL_PHASE2_ERROR = 2,
-+        FATAL_PHASE1_ERROR = 3,
-+        NORMAL_STOP = 4,
-+        END_OF_STACK = 5,
-+        HANDLER_FOUND = 6,
-+        INSTALL_CONTEXT = 7,
-+        CONTINUE_UNWIND = 8
-+    }
-+
-+    enum _Unwind_Action : int
-+    {
-+        SEARCH_PHASE = 1,
-+        CLEANUP_PHASE = 2,
-+        HANDLER_FRAME = 4,
-+        FORCE_UNWIND = 8
-+    }
-+
-+    alias void* _Unwind_Context_Ptr;
-+
-+    alias void function(_Unwind_Reason_Code, _Unwind_Exception*) _Unwind_Exception_Cleanup_Fn;
-+
-+    struct _Unwind_Exception
-+    {
-+        ulong exception_class;
-+        _Unwind_Exception_Cleanup_Fn exception_cleanup;
-+        ptrdiff_t private_1;
-+        ptrdiff_t private_2;
-+    }
-+
-+// interface to HP's libunwind from http://www.nongnu.org/libunwind/
-+version(HP_LIBUNWIND)
-+{
-+    void __libunwind_Unwind_Resume(_Unwind_Exception *);
-+    _Unwind_Reason_Code __libunwind_Unwind_RaiseException(_Unwind_Exception *);
-+    ptrdiff_t __libunwind_Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr
-+            context);
-+    ptrdiff_t __libunwind_Unwind_GetIP(_Unwind_Context_Ptr context);
-+    ptrdiff_t __libunwind_Unwind_SetIP(_Unwind_Context_Ptr context,
-+            ptrdiff_t new_value);
-+    ptrdiff_t __libunwind_Unwind_SetGR(_Unwind_Context_Ptr context, int index,
-+            ptrdiff_t new_value);
-+    ptrdiff_t __libunwind_Unwind_GetRegionStart(_Unwind_Context_Ptr context);
-+
-+    alias __libunwind_Unwind_Resume _Unwind_Resume;
-+    alias __libunwind_Unwind_RaiseException _Unwind_RaiseException;
-+    alias __libunwind_Unwind_GetLanguageSpecificData
-+        _Unwind_GetLanguageSpecificData;
-+    alias __libunwind_Unwind_GetIP _Unwind_GetIP;
-+    alias __libunwind_Unwind_SetIP _Unwind_SetIP;
-+    alias __libunwind_Unwind_SetGR _Unwind_SetGR;
-+    alias __libunwind_Unwind_GetRegionStart _Unwind_GetRegionStart;
-+}
-+else version(X86_UNWIND) 
-+{
-+    void _Unwind_Resume(_Unwind_Exception*);
-+    _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*);
-+    ptrdiff_t _Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr context);
-+    ptrdiff_t _Unwind_GetIP(_Unwind_Context_Ptr context);
-+    ptrdiff_t _Unwind_SetIP(_Unwind_Context_Ptr context, ptrdiff_t new_value);
-+    ptrdiff_t _Unwind_SetGR(_Unwind_Context_Ptr context, int index,
-+            ptrdiff_t new_value);
-+    ptrdiff_t _Unwind_GetRegionStart(_Unwind_Context_Ptr context);
-+}
-+else
-+{
-+    // runtime calls these directly
-+    void _Unwind_Resume(_Unwind_Exception*)
-+    {
-+        console("_Unwind_Resume is not implemented on this platform.\n");
-+    }
-+    _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*)
-+    {
-+        console("_Unwind_RaiseException is not implemented on this platform.\n");
-+        return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
-+    }
-+}
-+
-+}
-+
-+// error and exit
-+extern(C) private void fatalerror(in char* format, ...)
-+{
-+  va_list args;
-+  va_start(args, format);
-+  printf("Fatal error in EH code: ");
-+  vprintf(format, args);
-+  printf("\n");
-+  abort();
-+}
-+
-+
-+// helpers for reading certain DWARF data
-+private ubyte* get_uleb128(ubyte* addr, ref size_t res)
-+{
-+  res = 0;
-+  size_t bitsize = 0;
-+
-+  // read as long as high bit is set
-+  while(*addr & 0x80) {
-+    res |= (*addr & 0x7f) << bitsize;
-+    bitsize += 7;
-+    addr += 1;
-+    if(bitsize >= size_t.sizeof*8)
-+       fatalerror("tried to read uleb128 that exceeded size of size_t");
-+  }
-+  // read last
-+  if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize)
-+    fatalerror("Fatal error in EH code: tried to read uleb128 that exceeded size of size_t");
-+  res |= (*addr) << bitsize;
-+
-+  return addr + 1;
-+}
-+
-+private ubyte* get_sleb128(ubyte* addr, ref ptrdiff_t res)
-+{
-+  res = 0;
-+  size_t bitsize = 0;
-+
-+  // read as long as high bit is set
-+  while(*addr & 0x80) {
-+    res |= (*addr & 0x7f) << bitsize;
-+    bitsize += 7;
-+    addr += 1;
-+    if(bitsize >= size_t.sizeof*8)
-+       fatalerror("tried to read sleb128 that exceeded size of size_t");
-+  }
-+  // read last
-+  if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize)
-+    fatalerror("tried to read sleb128 that exceeded size of size_t");
-+  res |= (*addr) << bitsize;
-+
-+  // take care of sign
-+  if(bitsize < size_t.sizeof*8 && ((*addr) & 0x40))
-+    res |= cast(ptrdiff_t)(-1) ^ ((1 << (bitsize+7)) - 1);
-+
-+  return addr + 1;
-+}
-+
-+
-+// exception struct used by the runtime.
-+// _d_throw allocates a new instance and passes the address of its
-+// _Unwind_Exception member to the unwind call. The personality
-+// routine is then able to get the whole struct by looking at the data
-+// surrounding the unwind info.
-+struct _d_exception
-+{
-+  Object exception_object;
-+  _Unwind_Exception unwind_info;
-+}
-+
-+// the 8-byte string identifying the type of exception
-+// the first 4 are for vendor, the second 4 for language
-+//TODO: This may be the wrong way around
-+char[8] _d_exception_class = "LLDCD1\0\0";
-+
-+
-+//
-+// x86 unwind specific implementation of personality function
-+// and helpers
-+//
-+version(X86_UNWIND)
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/alloca.d druntime/src/rt/alloca.d
+--- druntime-orig/src/rt/alloca.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/alloca.d	2010-10-08 22:31:50.989547000 +0400
+@@ -12,6 +12,9 @@
+  */
+ module rt.alloca;
+ 
++version (DMD)
 +{
 +
-+// the personality routine gets called by the unwind handler and is responsible for
-+// reading the EH tables and deciding what to do
-+extern(C) _Unwind_Reason_Code _d_eh_personality(int ver, _Unwind_Action actions, ulong exception_class, _Unwind_Exception* exception_info, _Unwind_Context_Ptr context)
-+{
-+  debug(EH_personality_verbose) printf("entering personality function. context: %p\n", context);
-+  // check ver: the C++ Itanium ABI only allows ver == 1
-+  if(ver != 1)
-+    return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
-+
-+  // check exceptionClass
-+  //TODO: Treat foreign exceptions with more respect
-+  if((cast(char*)&exception_class)[0..8] != _d_exception_class)
-+    return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
-+
-+  // find call site table, action table and classinfo table
-+  // Note: callsite and action tables do not contain static-length
-+  // data and will be parsed as needed
-+  // Note: classinfo_table points past the end of the table
-+  ubyte* callsite_table;
-+  ubyte* action_table;
-+  ClassInfo* classinfo_table;
-+  _d_getLanguageSpecificTables(context, callsite_table, action_table, classinfo_table);
-+  if (callsite_table is null)
-+    return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+
-+  /*
-+    find landing pad and action table index belonging to ip by walking
-+    the callsite_table
-+  */
-+  ubyte* callsite_walker = callsite_table;
-+
-+  // get the instruction pointer
-+  // will be used to find the right entry in the callsite_table
-+  // -1 because it will point past the last instruction
-+  ptrdiff_t ip = _Unwind_GetIP(context) - 1;
-+
-+  // address block_start is relative to
-+  ptrdiff_t region_start = _Unwind_GetRegionStart(context);
-+
-+  // table entries
-+  uint block_start_offset, block_size;
-+  ptrdiff_t landing_pad;
-+  size_t action_offset;
-+
-+  while(true) {
-+    // if we've gone through the list and found nothing...
-+    if(callsite_walker >= action_table)
-+      return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+
-+    block_start_offset = *cast(uint*)callsite_walker;
-+    block_size = *(cast(uint*)callsite_walker + 1);
-+    landing_pad = *(cast(uint*)callsite_walker + 2);
-+    if(landing_pad)
-+      landing_pad += region_start;
-+    callsite_walker = get_uleb128(callsite_walker + 3*uint.sizeof, action_offset);
-+
-+    debug(EH_personality_verbose) printf("ip=%llx %d %d %llx\n", ip, block_start_offset, block_size, landing_pad);
-+
-+    // since the list is sorted, as soon as we're past the ip
-+    // there's no handler to be found
-+    if(ip < region_start + block_start_offset)
-+      return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+
-+    // if we've found our block, exit
-+    if(ip < region_start + block_start_offset + block_size)
-+      break;
-+  }
-+
-+  debug(EH_personality) printf("Found correct landing pad and actionOffset %d\n", action_offset);
-+
-+  // now we need the exception's classinfo to find a handler
-+  // the exception_info is actually a member of a larger _d_exception struct
-+  // the runtime allocated. get that now
-+  _d_exception* exception_struct = cast(_d_exception*)(cast(ubyte*)exception_info - _d_exception.unwind_info.offsetof);
-+
-+  // if there's no action offset and no landing pad, continue unwinding
-+  if(!action_offset && !landing_pad)
-+    return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+
-+  // if there's no action offset but a landing pad, this is a cleanup handler
-+  else if(!action_offset && landing_pad)
-+    return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context);
-+
-+  /*
-+   walk action table chain, comparing classinfos using _d_isbaseof
-+  */
-+  ubyte* action_walker = action_table + action_offset - 1;
-+
-+  ptrdiff_t ti_offset, next_action_offset;
-+  while(true) {
-+    action_walker = get_sleb128(action_walker, ti_offset);
-+    // it is intentional that we not modify action_walker here
-+    // next_action_offset is from current action_walker position
-+    get_sleb128(action_walker, next_action_offset);
-+
-+    // negative are 'filters' which we don't use
-+    if(!(ti_offset >= 0))
-+      fatalerror("Filter actions are unsupported");
-+
-+    // zero means cleanup, which we require to be the last action
-+    if(ti_offset == 0) {
-+      if(!(next_action_offset == 0))
-+        fatalerror("Cleanup action must be last in chain");
-+      return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context);
-+    }
-+
-+    // get classinfo for action and check if the one in the
-+    // exception structure is a base
-+    ClassInfo catch_ci = *(classinfo_table - ti_offset);
-+    debug(EH_personality) printf("Comparing catch %s to exception %s\n", catch_ci.name.ptr, exception_struct.exception_object.classinfo.name.ptr);
-+    if(_d_isbaseof(exception_struct.exception_object.classinfo, catch_ci))
-+      return _d_eh_install_catch_context(actions, ti_offset, landing_pad, exception_struct, context);
-+
-+    // we've walked through all actions and found nothing...
-+    if(next_action_offset == 0)
-+      return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+    else
-+      action_walker += next_action_offset;
-+  }
-+
-+  fatalerror("reached unreachable");
-+  return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
-+}
-+
-+// These are the register numbers for SetGR that
-+// llvm's eh.exception and eh.selector intrinsics
-+// will pick up.
-+// Hints for these can be found by looking at the
-+// EH_RETURN_DATA_REGNO macro in GCC, careful testing
-+// is required though.
-+version (X86_64)
-+{
-+  private int eh_exception_regno = 0;
-+  private int eh_selector_regno = 1;
-+} else {
-+  private int eh_exception_regno = 0;
-+  private int eh_selector_regno = 2;
-+}
-+
-+private _Unwind_Reason_Code _d_eh_install_catch_context(_Unwind_Action actions, ptrdiff_t switchval, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context)
-+{
-+  debug(EH_personality) printf("Found catch clause!\n");
-+
-+  if(actions & _Unwind_Action.SEARCH_PHASE)
-+    return _Unwind_Reason_Code.HANDLER_FOUND;
-+
-+  else if(actions & _Unwind_Action.CLEANUP_PHASE)
-+  {
-+    debug(EH_personality) printf("Setting switch value to: %d!\n", switchval);
-+    _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)cast(void*)(exception_struct.exception_object));
-+    _Unwind_SetGR(context, eh_selector_regno, cast(ptrdiff_t)switchval);
-+    _Unwind_SetIP(context, landing_pad);
-+    return _Unwind_Reason_Code.INSTALL_CONTEXT;
-+  }
-+
-+  fatalerror("reached unreachable");
-+  return _Unwind_Reason_Code.FATAL_PHASE2_ERROR;
-+}
-+
-+private _Unwind_Reason_Code _d_eh_install_finally_context(_Unwind_Action actions, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context)
-+{
-+  // if we're merely in search phase, continue
-+  if(actions & _Unwind_Action.SEARCH_PHASE)
-+    return _Unwind_Reason_Code.CONTINUE_UNWIND;
-+
-+  debug(EH_personality) printf("Calling cleanup routine...\n");
-+
-+  _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)exception_struct);
-+  _Unwind_SetGR(context, eh_selector_regno, 0);
-+  _Unwind_SetIP(context, landing_pad);
-+  return _Unwind_Reason_Code.INSTALL_CONTEXT;
-+}
-+
-+private void _d_getLanguageSpecificTables(_Unwind_Context_Ptr context, ref ubyte* callsite, ref ubyte* action, ref ClassInfo* ci)
-+{
-+  ubyte* data = cast(ubyte*)_Unwind_GetLanguageSpecificData(context);
-+  if (data is null)
-+  {
-+    //printf("language specific data was null\n");
-+    callsite = null;
-+    action = null;
-+    ci = null;
-+    return;
-+  }
-+
-+  //TODO: Do proper DWARF reading here
-+  if(*data++ != 0xff)
-+    fatalerror("DWARF header has unexpected format 1");
-+
-+  if(*data++ != 0x00)
-+    fatalerror("DWARF header has unexpected format 2");
-+  size_t cioffset;
-+  data = get_uleb128(data, cioffset);
-+  ci = cast(ClassInfo*)(data + cioffset);
-+
-+  if(*data++ != 0x03)
-+    fatalerror("DWARF header has unexpected format 3");
-+  size_t callsitelength;
-+  data = get_uleb128(data, callsitelength);
-+  action = data + callsitelength;
-+
-+  callsite = data;
-+}
-+
-+} // end of x86 Linux specific implementation
-+
-+
-+extern(C) void _d_throw_exception(Object e)
-+{
-+    if (e !is null)
-+    {
-+        _d_exception* exc_struct = new _d_exception;
-+        exc_struct.unwind_info.exception_class = *cast(ulong*)_d_exception_class.ptr;
-+        exc_struct.exception_object = e;
-+        _Unwind_Reason_Code ret = _Unwind_RaiseException(&exc_struct.unwind_info);
-+        console("_Unwind_RaiseException failed with reason code: ")(ret)("\n");
-+    }
-+    abort();
-+}
-+
-+extern(C) void _d_eh_resume_unwind(_d_exception* exception_struct)
-+{
-+  _Unwind_Resume(&exception_struct.unwind_info);
-+}
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/lifetime.d druntime/src/rt/lifetime.d
---- druntime-old/src/rt/lifetime.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/lifetime.d	2010-10-08 14:55:56.581547002 +0400
-@@ -81,6 +81,28 @@
-            MAXSMALLSIZE = 256-SMALLPAD,
-            MAXMEDSIZE = (PAGESIZE / 2) - MEDPAD
-        }
-+
-+    version( LDC )
-+    {
-+    size_t length_adjust(size_t sizeelem, size_t newlength)
-+    {
-+        size_t newsize = void;
-+        static if (size_t.sizeof < ulong.sizeof)
-+        {
-+            ulong s = cast(ulong)sizeelem * cast(ulong)newlength;
-+            if (s > size_t.max)
-+                onOutOfMemoryError();
-+            newsize = cast(size_t)s;
-+        }
-+        else
-+        {
-+            newsize = sizeelem * newlength;
-+            if (newsize / newlength != sizeelem)
-+                onOutOfMemoryError();
-+        }
-+        return newsize;
-+    }
-+    }
+ /+
+ #if DOS386
+ extern size_t _x386_break;
+@@ -133,3 +136,5 @@
+         ret                     ;
+     }
  }
- 
- 
-@@ -92,6 +114,13 @@
++
++}
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/lifetime.d druntime/src/rt/lifetime.d
+--- druntime-orig/src/rt/lifetime.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/lifetime.d	2010-10-29 10:40:39.533035001 +0400
+@@ -92,6 +92,18 @@
      return gc_malloc(sz);
  }
  
++version (LDC)
++{
++
 +/**
 + * for allocating a single POD value
 + */
@@ -13434,10 +751,12 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
 +{
 +    return gc_malloc(ti.tsize(), !(ti.flags() & 1) ? BlkAttr.NO_SCAN : 0);
 +}
++
++} // version (LDC)
  
  /**
   *
-@@ -670,7 +699,7 @@
+@@ -670,7 +682,7 @@
   * ti is the type of the resulting array, or pointer to element.
   * (For when the array is initialized to 0)
   */
@@ -13446,7 +765,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      ulong result;
      auto size = ti.next.tsize();                // array element size
-@@ -702,7 +731,7 @@
+@@ -702,7 +714,7 @@
          __setArrayAllocLength(info, size, isshared);
          result = cast(ulong)length + (cast(ulong)cast(size_t)arrstart << 32);
      }
@@ -13455,7 +774,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  
  Loverflow:
      onOutOfMemoryError();
-@@ -711,7 +740,7 @@
+@@ -711,7 +723,7 @@
  /**
   * For when the array has a non-zero initializer.
   */
@@ -13464,7 +783,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      ulong result;
      auto size = ti.next.tsize();                // array element size
-@@ -764,7 +793,7 @@
+@@ -764,7 +776,7 @@
          __setArrayAllocLength(info, size, isshared);
          result = cast(ulong)length + (cast(ulong)cast(uint)arrstart << 32);
      }
@@ -13473,7 +792,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  
  Loverflow:
      onOutOfMemoryError();
-@@ -773,7 +802,7 @@
+@@ -773,7 +785,7 @@
  /**
   *
   */
@@ -13482,7 +801,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      ulong result;
  
-@@ -823,14 +852,14 @@
+@@ -823,14 +835,14 @@
          }
          va_end(q);
      }
@@ -13499,7 +818,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      ulong result;
  
-@@ -881,10 +910,9 @@
+@@ -881,7 +893,7 @@
          }
          va_end(q);
      }
@@ -13507,34 +826,8 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
 +    return *cast(void[]*)&result;
  }
  
--
- /**
-  *
-  */
-@@ -1046,7 +1074,7 @@
- /**
-  * Resize dynamic arrays with 0 initializers.
-  */
--extern (C) byte[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p)
-+extern (C) void[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p)
- in
- {
-     assert(ti);
-@@ -1206,7 +1234,7 @@
-  *      initsize        size of initializer
-  *      ...             initializer
-  */
--extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p)
-+extern (C) void[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p)
- in
- {
-     assert(!p.length || p.data);
-@@ -1376,12 +1404,11 @@
-     onOutOfMemoryError();
- }
  
--
- /**
+@@ -1381,7 +1393,7 @@
   * Append y[] to array pointed to by px
   * size is size of each array element.
   */
@@ -13543,7 +836,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      // only optimize array append where ti is not a shared type
      auto sizeelem = ti.next.tsize();            // array element size
-@@ -1468,10 +1495,9 @@
+@@ -1468,7 +1480,7 @@
    L1:
      px.length = newlength;
      memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem);
@@ -13551,11 +844,8 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
 +    return *cast(void[]*)px;
  }
  
--
- /**
-  *
-  */
-@@ -1552,21 +1578,36 @@
+ 
+@@ -1552,21 +1564,36 @@
      return newcap;
  }
  
@@ -13594,7 +884,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      // c could encode into from 1 to 4 characters
      char[4] buf = void;
-@@ -1612,7 +1653,7 @@
+@@ -1612,7 +1639,7 @@
  /**
   * Append dchar to wchar[]
   */
@@ -13603,19 +893,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
      // c could encode into from 1 to 2 w characters
      wchar[2] buf = void;
-@@ -1641,7 +1682,6 @@
-     return _d_arrayappendT(typeid(shared wchar[]), cast(Array *)&x, appendthis);
- }
- 
--
- /**
-  *
-  */
-@@ -1794,11 +1834,10 @@
-     void*  ptr;
- }
- 
--
+@@ -1798,7 +1825,7 @@
  /**
   *
   */
@@ -13624,7 +902,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  out (result)
  {
      auto sizeelem = ti.next.tsize();            // array element size
-@@ -1819,7 +1858,7 @@
+@@ -1819,7 +1846,7 @@
          r.length = a.length;
          memcpy(r.ptr, a.ptr, size);
      }
@@ -13633,10 +911,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort.d druntime/src/rt/qsort.d
---- druntime-old/src/rt/qsort.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/qsort.d	2010-10-07 13:59:06.815253002 +0400
-@@ -44,7 +44,7 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/qsort.d druntime/src/rt/qsort.d
+--- druntime-orig/src/rt/qsort.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/qsort.d	2010-10-07 13:59:06.815253002 +0400
+@@ -44,7 +44,7 @@
  structures.  The default value is optimized for a high cost for compares. */
  
  
@@ -13645,7 +923,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  {
    byte* base;
    byte*[40] stack;              // stack
-@@ -124,7 +124,7 @@
+@@ -124,7 +124,7 @@
        limit = sp[1];
      }
      else                                // else stack empty, all done
@@ -13654,10 +932,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
    }
    assert(0);
  }
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort2.d druntime/src/rt/qsort2.d
---- druntime-old/src/rt/qsort2.d	2010-08-05 05:39:06.000000000 +0400
-+++ druntime/src/rt/qsort2.d	2010-10-07 14:01:41.359253001 +0400
-@@ -31,14 +31,14 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/qsort2.d druntime/src/rt/qsort2.d
+--- druntime-orig/src/rt/qsort2.d	2010-08-05 05:39:06.000000000 +0400
++++ druntime/src/rt/qsort2.d	2010-10-07 14:01:41.359253001 +0400
+@@ -31,14 +31,14 @@
      return tiglobal.compare(p1, p2);
  }
  
@@ -13674,10 +952,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
  }
  
  
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/trace.d druntime/src/rt/trace.d
---- druntime-old/src/rt/trace.d	2010-08-07 09:46:06.000000000 +0400
-+++ druntime/src/rt/trace.d	2010-10-01 21:01:58.444892002 +0400
-@@ -855,7 +855,7 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/trace.d druntime/src/rt/trace.d
+--- druntime-orig/src/rt/trace.d	2010-08-07 09:46:06.000000000 +0400
++++ druntime/src/rt/trace.d	2010-10-01 21:01:58.444892002 +0400
+@@ -855,7 +855,7 @@
    version (OSX)
    { // 16 byte align stack
      asm
@@ -13686,7 +964,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
          pushad  ;
          sub     ESP,12  ;
      }
-@@ -870,7 +870,7 @@
+@@ -870,7 +870,7 @@
    else
    {
      asm
@@ -13695,219 +973,3 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.
          pushad  ;
      }
      trace_epi();
-diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/std/intrinsic.d druntime/src/std/intrinsic.d
---- druntime-old/src/std/intrinsic.d	1970-01-01 03:00:00.000000000 +0300
-+++ druntime/src/std/intrinsic.d	2010-10-03 20:07:21.183624002 +0400
-@@ -0,0 +1,212 @@
-+/*
-+ * D phobos intrinsics for LDC
-+ *
-+ * From GDC ... public domain!
-+ */
-+module std.intrinsic;
-+
-+// Check for the right compiler
-+version(LDC)
-+{
-+    // OK
-+}
-+else
-+{
-+    static assert(false, "This module is only valid for LDC");
-+}
-+
-+/**
-+ * Scans the bits in v starting with bit 0, looking
-+ * for the first set bit.
-+ * Returns:
-+ *      The bit number of the first bit set.
-+ *      The return value is undefined if v is zero.
-+ */
-+nothrow int bsf(uint v)
-+{
-+    uint m = 1;
-+    uint i;
-+    for (i = 0; i < 32; i++,m<<=1) {
-+        if (v&m)
-+        return i;
-+    }
-+    return i; // supposed to be undefined
-+}
-+
-+/**
-+ * Scans the bits in v from the most significant bit
-+ * to the least significant bit, looking
-+ * for the first set bit.
-+ * Returns:
-+ *      The bit number of the first bit set.
-+ *      The return value is undefined if v is zero.
-+ * Example:
-+ * ---
-+ * import std.intrinsic;
-+ *
-+ * int main()
-+ * {
-+ *     uint v;
-+ *     int x;
-+ *
-+ *     v = 0x21;
-+ *     x = bsf(v);
-+ *     printf("bsf(x%x) = %d\n", v, x);
-+ *     x = bsr(v);
-+ *     printf("bsr(x%x) = %d\n", v, x);
-+ *     return 0;
-+ * }
-+ * ---
-+ * Output:
-+ *  bsf(x21) = 0<br>
-+ *  bsr(x21) = 5
-+ */
-+nothrow int bsr(uint v)
-+{
-+    uint m = 0x80000000;
-+    uint i;
-+    for (i = 32; i ; i--,m>>>=1) {
-+    if (v&m)
-+        return i-1;
-+    }
-+    return i; // supposed to be undefined
-+}
-+
-+
-+/**
-+ * Tests the bit.
-+ */
-+nothrow int bt(uint *p, uint bitnum)
-+{
-+    return (p[bitnum / (uint.sizeof*8)] & (1<<(bitnum & ((uint.sizeof*8)-1)))) ? -1 : 0 ;
-+}
-+
-+
-+/**
-+ * Tests and complements the bit.
-+ */
-+nothrow int btc(uint *p, uint bitnum)
-+{
-+    uint * q = p + (bitnum / (uint.sizeof*8));
-+    uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1));
-+    int result = *q & mask;
-+    *q ^= mask;
-+    return result ? -1 : 0;
-+}
-+
-+
-+/**
-+ * Tests and resets (sets to 0) the bit.
-+ */
-+nothrow int btr(uint *p, uint bitnum)
-+{
-+    uint * q = p + (bitnum / (uint.sizeof*8));
-+    uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1));
-+    int result = *q & mask;
-+    *q &= ~mask;
-+    return result ? -1 : 0;
-+}
-+
-+
-+/**
-+ * Tests and sets the bit.
-+ * Params:
-+ * p = a non-NULL pointer to an array of uints.
-+ * index = a bit number, starting with bit 0 of p[0],
-+ * and progressing. It addresses bits like the expression:
-+---
-+p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1)))
-+---
-+ * Returns:
-+ *      A non-zero value if the bit was set, and a zero
-+ *      if it was clear.
-+ *
-+ * Example:
-+ * ---
-+import std.intrinsic;
-+
-+int main()
-+{
-+    uint array[2];
-+
-+    array[0] = 2;
-+    array[1] = 0x100;
-+
-+    printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
-+    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
-+
-+    printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
-+    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
-+
-+    printf("bts(array, 35) = %d\n", <b>bts</b>(array, 35));
-+    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
-+
-+    printf("btr(array, 35) = %d\n", <b>btr</b>(array, 35));
-+    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
-+
-+    printf("bt(array, 1) = %d\n", <b>bt</b>(array, 1));
-+    printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
-+
-+    return 0;
-+}
-+ * ---
-+ * Output:
-+<pre>
-+btc(array, 35) = 0
-+array = [0]:x2, [1]:x108
-+btc(array, 35) = -1
-+array = [0]:x2, [1]:x100
-+bts(array, 35) = 0
-+array = [0]:x2, [1]:x108
-+btr(array, 35) = -1
-+array = [0]:x2, [1]:x100
-+bt(array, 1) = -1
-+array = [0]:x2, [1]:x100
-+</pre>
-+ */
-+nothrow int bts(uint *p, uint bitnum)
-+{
-+    uint * q = p + (bitnum / (uint.sizeof*8));
-+    uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1));
-+    int result = *q & mask;
-+    *q |= mask;
-+    return result ? -1 : 0;
-+}
-+
-+/**
-+ * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes
-+ * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3
-+ * becomes byte 0.
-+ */
-+pragma(intrinsic, "llvm.bswap.i32")
-+    uint bswap(uint val);
-+
-+/**
-+ * Reads I/O port at port_address.
-+ */
-+ubyte  inp(uint p) { throw new Exception("inp intrinsic not yet implemented"); }
-+
-+/**
-+ * ditto
-+ */
-+ushort inpw(uint p) { throw new Exception("inpw intrinsic not yet implemented"); }
-+
-+/**
-+ * ditto
-+ */
-+uint   inpl(uint p) { throw new Exception("inpl intrinsic not yet implemented"); }
-+
-+/**
-+ * ditto
-+ */
-+ubyte  outp(uint p, ubyte v) { throw new Exception("outp intrinsic not yet implemented"); }
-+
-+/**
-+ * ditto
-+ */
-+ushort outpw(uint p, ushort v) { throw new Exception("outpw intrinsic not yet implemented"); }
-+
-+/**
-+ * ditto
-+ */
-+uint   outpl(uint p, uint v) { throw new Exception("outpl intrinsic not yet implemented"); }
diff --git a/phobos.patch b/phobos.patch
new file mode 100644
index 00000000..a25ed120
--- /dev/null
+++ b/phobos.patch
@@ -0,0 +1,243 @@
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/conv.d phobos/std/conv.d
+--- phobos-orig/std/conv.d	2010-09-17 00:27:48.000000000 +0400
++++ phobos/std/conv.d	2010-10-29 12:06:21.221035000 +0400
+@@ -1395,7 +1395,7 @@
+     else // not hex
+     {
+         if (toupper(p.front) == 'N')
+-        {
++        {   
+             // nan
+             enforce((p.popFront(), !p.empty && toupper(p.front) == 'A')
+                     && (p.popFront(), !p.empty && toupper(p.front) == 'N'),
+@@ -3191,6 +3191,11 @@
+ T toImpl(T, S)(S d) if (is(Unqual!S == double) && isSomeString!(T))
+ {
+     //alias Unqual!(ElementType!T) Char;
+    version(LDC) // FIXME: workaround for the case when this function returns "-nan"
++    {
++        if (isnan(d))
++            return "nan";
++    }
+     char[20] buffer;
+     int len = sprintf(buffer.ptr, "%g", d);
+     return to!T(buffer[0 .. len].dup);
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/functional.d phobos/std/functional.d
+--- phobos-orig/std/functional.d	2010-09-17 00:27:48.000000000 +0400
++++ phobos/std/functional.d	2010-10-29 12:01:35.285035001 +0400
+@@ -713,6 +713,13 @@
+         assert(dg_pure_nothrow() == 7);
+         //assert(dg_pure_nothrow_safe() == 8);
+     }
++    version (LDC) 
++    {
+        // FIXME: the linkage test in the else branch is skipped under LDC
++    } 
++    else 
++    {
++
+     /* test for linkage */
+     {
+         struct S
+@@ -724,4 +731,6 @@
+         auto dg_xtrnD = toDelegate(&S.xtrnD);
+         static assert(! is(typeof(dg_xtrnC) == typeof(dg_xtrnD)));
+     }
++
++    }
+ }
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/internal/math/biguintx86.d phobos/std/internal/math/biguintx86.d
+--- phobos-orig/std/internal/math/biguintx86.d	2010-09-17 00:27:48.000000000 +0400
++++ phobos/std/internal/math/biguintx86.d	2010-10-26 14:08:51.480925001 +0400
+@@ -733,7 +733,10 @@
+     // EDI = dest
+     // ESI = src
+ 
+-    enum string OP = (op=='+')? "add" : "sub";
++    version(LDC) { 
++    } else {
++        enum string OP = (op=='+')? "add" : "sub";
++    }
+     version(D_PIC) {
+         enum { zero = 0 }
+     } else {
+@@ -767,7 +770,10 @@
+         jnz L_enter_odd;
+ }
+                 // Main loop, with entry point for even length
+-mixin(asmMulAdd_innerloop(OP, "ESP+LASTPARAM"));
++version(LDC)
++    mixin(asmMulAdd_innerloop((op=='+')? "add" : "sub", "ESP+LASTPARAM"));
++else
++    mixin(asmMulAdd_innerloop(OP, "ESP+LASTPARAM"));
+ asm {
+         mov EAX, EBP; // get final carry
+         pop EBP;
+@@ -777,6 +783,9 @@
+         ret 5*4;
+ }
+ L_enter_odd:
++version(LDC)
++    mixin(asmMulAdd_enter_odd((op=='+')? "add" : "sub", "ESP+LASTPARAM"));
++else
+     mixin(asmMulAdd_enter_odd(OP, "ESP+LASTPARAM"));
+ }
+ 
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/math.d phobos/std/math.d
+--- phobos-orig/std/math.d	2010-09-17 00:27:48.000000000 +0400
++++ phobos/std/math.d	2010-10-29 12:08:18.925035001 +0400
+@@ -318,7 +318,10 @@
+  *      Results are undefined if |x| >= $(POWER 2,64).
+  */
+ 
+-@safe pure nothrow real cos(real x);       /* intrinsic */
++version(LDC)
++    @safe pure nothrow real cos(real x) { return llvm_cos(x); }
++else
++    @safe pure nothrow real cos(real x);       /* intrinsic */
+ 
+ /***********************************
+  * Returns sine of x. x is in radians.
+@@ -333,7 +336,10 @@
+  *      Results are undefined if |x| >= $(POWER 2,64).
+  */
+ 
+-@safe pure nothrow real sin(real x);       /* intrinsic */
++version(LDC)
++    @safe pure nothrow real sin(real x) { return llvm_sin(x); }
++else
++    @safe pure nothrow real sin(real x);       /* intrinsic */
+ 
+ 
+ /***********************************
+@@ -831,6 +837,20 @@
+  *      )
+  */
+ 
++version(LDC) 
++{
++
++@safe pure nothrow
++{
++    float sqrt(float x) { return llvm_sqrt(x); }
++    double sqrt(double x)  { return llvm_sqrt(x); }
++    real sqrt(real x) { return llvm_sqrt(x); }
++}
++
++}
++else
++{
++
+ @safe pure nothrow
+ {
+     float sqrt(float x);    /* intrinsic */
+@@ -838,6 +858,8 @@
+     real sqrt(real x);      /* intrinsic */ /// ditto
+ }
+ 
++}
++
+ @trusted pure nothrow {  // Should be @safe.  See bugs 4628, 4630.
+     // Create explicit overloads for integer sqrts.  No ddoc for these because
+     // hopefully a more elegant solution will eventually be found, so we don't
+@@ -1413,9 +1435,22 @@
+  * Compute n * 2$(SUP exp)
+  * References: frexp
+  */
++version(LDC)
++{
++
++pure nothrow real ldexp(real n, int exp)
++{
++    return core.stdc.math.ldexpl(n, exp);
++}
++
++}
++else
++{
+ 
+ @safe pure nothrow real ldexp(real n, int exp);    /* intrinsic */
+ 
++}
++
+ unittest {
+     assert(ldexp(1, -16384) == 0x1p-16384L);
+     assert(ldexp(1, -16382) == 0x1p-16382L);
+@@ -1608,7 +1643,31 @@
+  *      $(TR $(TD $(PLUSMN)$(INFIN)) $(TD +$(INFIN)) )
+  *      )
+  */
+-@safe pure nothrow real fabs(real x);      /* intrinsic */
++version(LDC) {
++    version( FreeBSD )
++        version (all) // < 8-CURRENT
++            private extern(C) real fabsl(real x) { return fabs(x); }
++        else
++            private extern(C) real fabsl(real x);
++    else
++        private extern(C) real fabsl(real x);
++    pure nothrow real fabs(real x) 
++    {
++        version(D_InlineAsm_X86)
++        {
++            asm {
++                fld x;
++                fabs;
++            }
++        }
++        else
++        {
++            return fabsl(x);
++        }
++    }
++} else {
++    @safe pure nothrow real fabs(real x);      /* intrinsic */
++}
+ 
+ 
+ /***********************************************************************
+diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/openrj.d phobos/std/openrj.d
+--- phobos-orig/std/openrj.d	2009-09-03 12:01:40.000000000 +0400
++++ phobos/std/openrj.d	2010-10-26 13:17:37.480925001 +0400
+@@ -620,11 +620,11 @@
+     /**
+      *
+      */
+-    int opApply(int delegate(inout Field field) dg)
++    int opApply(int delegate(ref Field field) dg)
+     {
+         int result  =   0;
+ 
+-        foreach (inout field; m_fields)
++        foreach (ref Field field; m_fields)
+         {
+             result = dg(field);
+ 
+@@ -1000,11 +1000,11 @@
+     /**
+      *
+      */
+-    int opApply(int delegate(inout Record record) dg)
++    int opApply(int delegate(ref Record record) dg)
+     {
+         int result  =   0;
+ 
+-        foreach(inout Record record; m_records)
++        foreach(ref Record record; m_records)
+         {
+             result = dg(record);
+ 
+@@ -1020,11 +1020,11 @@
+     /**
+      *
+      */
+-    int opApply(int delegate(inout Field field) dg)
++    int opApply(int delegate(ref Field field) dg)
+     {
+         int result  =   0;
+ 
+-        foreach(inout Field field; m_fields)
++        foreach(ref Field field; m_fields)
+         {
+             result = dg(field);
+ 
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index f8b15d8d..686448fc 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -53,6 +53,9 @@ if(D_VERSION EQUAL 1)
 	set(RUNTIME_INCLUDE ${RUNTIME_DC_DIR})
 	file(GLOB CORE_D ${RUNTIME_DIR}/lib/common/tango/core/*.d)
 	file(GLOB CORE_C ${RUNTIME_DIR}/lib/common/tango/stdc/*.c)
+	file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d)
+	file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d)
+	file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c)
 elseif(D_VERSION EQUAL 2)
 	set(RUNTIME_CC druntime-core)
 	set(RUNTIME_GC druntime-gc-basic)
@@ -63,6 +66,21 @@ elseif(D_VERSION EQUAL 2)
 	set(RUNTIME_INCLUDE ${RUNTIME_DIR}/src)
 	file(GLOB CORE_D ${RUNTIME_DIR}/src/core/*.d )
 	file(GLOB CORE_D_SYNC ${RUNTIME_DIR}/src/core/sync/*.d )
+	file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d)
+	file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d)
+	list(REMOVE_ITEM DCRT_D
+		${RUNTIME_DC_DIR}/arrayassign.d
+		${RUNTIME_DC_DIR}/arraybyte.d
+		${RUNTIME_DC_DIR}/arraycast.d
+		${RUNTIME_DC_DIR}/arraycat.d
+		${RUNTIME_DC_DIR}/arraydouble.d
+		${RUNTIME_DC_DIR}/arrayfloat.d
+		${RUNTIME_DC_DIR}/arrayreal.d
+		${RUNTIME_DC_DIR}/arrayshort.d
+		${RUNTIME_DC_DIR}/deh2.d
+	)
+	file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c)
+	list(REMOVE_ITEM DCRT_C ${RUNTIME_DC_DIR}/deh.c)
 	if(UNIX)
 		file(GLOB CORE_D_SYS ${RUNTIME_DIR}/src/core/sys/posix/*.d)
 	elseif(WIN32)
@@ -140,10 +158,6 @@ if(D_VERSION EQUAL 2)
 	endif(NOT PATCH_EXE)
 endif(D_VERSION EQUAL 2)
 
-file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d)
-file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d)
-file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c)
-
 macro(dc INPUT_D OUTLIST_O OUTLIST_BC INCDIR MOREFLAGS PATH)
 	if ("${PATH}" STREQUAL "")
 		file(RELATIVE_PATH output ${RUNTIME_DIR} ${INPUT_D})