From d143966b6a1039613f224fda5f428cec9885ca84 Mon Sep 17 00:00:00 2001 From: Alexey Prokhin Date: Fri, 29 Oct 2010 12:14:24 +0400 Subject: [PATCH] Updated druntime.patch and added phobos.patch --- .hgignore | 5 +- druntime.patch | 13348 +-------------------------------------- phobos.patch | 243 + runtime/CMakeLists.txt | 22 +- 4 files changed, 470 insertions(+), 13148 deletions(-) create mode 100644 phobos.patch diff --git a/.hgignore b/.hgignore index d79da1a2..2c48dc63 100644 --- a/.hgignore +++ b/.hgignore @@ -16,8 +16,10 @@ CMakeFiles CMakeCache.txt cmake_install.cmake .DS_Store -CMakeLists.txt.user +CMakeLists.txt.user* .directory +druntime-orig +phobos-orig syntax: regexp ^obj/ @@ -25,6 +27,7 @@ syntax: regexp ^tests/reference/ ^tango/ ^druntime/ +^phobos/ ^import/ ^bin/ldc2?$ ^bin/ldc2?\.conf$ diff --git a/druntime.patch b/druntime.patch index 51b50e58..8935fe38 100644 --- a/druntime.patch +++ b/druntime.patch @@ -1,525 +1,7 @@ -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/cstdarg.di druntime/import/ldc/cstdarg.di ---- druntime-old/import/ldc/cstdarg.di 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/import/ldc/cstdarg.di 2010-09-30 22:10:37.000000000 +0400 -@@ -0,0 +1,29 @@ -+/* -+ * vararg support for extern(C) functions -+ */ -+ -+module ldc.cstdarg; -+ -+// Check for the right compiler -+version(LDC) -+{ -+ // OK -+} -+else -+{ -+ static assert(false, "This module is only valid for LDC"); -+} -+ -+alias void* va_list; -+ -+pragma(va_start) -+ void va_start(T)(va_list ap, ref T); -+ -+pragma(va_arg) -+ T va_arg(T)(va_list ap); -+ -+pragma(va_end) -+ void va_end(va_list args); -+ -+pragma(va_copy) -+ void va_copy(va_list dst, va_list src); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/intrinsics.di druntime/import/ldc/intrinsics.di ---- druntime-old/import/ldc/intrinsics.di 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/import/ldc/intrinsics.di 2010-10-02 14:01:02.975890001 +0400 -@@ -0,0 +1,413 @@ -+/* -+ * This module holds declarations to LLVM intrinsics. -+ * -+ * See the LLVM language reference for more information: -+ * -+ * - http://llvm.org/docs/LangRef.html#intrinsics -+ * -+ */ -+ -+module ldc.intrinsics; -+ -+// Check for the right compiler -+version(LDC) -+{ -+ // OK -+} -+else -+{ -+ static assert(false, "This module is only valid for LDC"); -+} -+ -+// -+// CODE GENERATOR INTRINSICS -+// -+ -+ -+// The 'llvm.returnaddress' intrinsic attempts to compute a target-specific -+// value indicating the return address of the current function or one of its -+// callers. -+ -+pragma(intrinsic, "llvm.returnaddress") -+ void* llvm_returnaddress(uint level); -+ -+ -+// The 'llvm.frameaddress' intrinsic attempts to return the target-specific -+// frame pointer value for the specified stack frame. -+ -+pragma(intrinsic, "llvm.frameaddress") -+ void* llvm_frameaddress(uint level); -+ -+ -+// The 'llvm.stacksave' intrinsic is used to remember the current state of the -+// function stack, for use with llvm.stackrestore. This is useful for -+// implementing language features like scoped automatic variable sized arrays -+// in C99. -+ -+pragma(intrinsic, "llvm.stacksave") -+ void* llvm_stacksave(); -+ -+ -+// The 'llvm.stackrestore' intrinsic is used to restore the state of the -+// function stack to the state it was in when the corresponding llvm.stacksave -+// intrinsic executed. This is useful for implementing language features like -+// scoped automatic variable sized arrays in C99. -+ -+pragma(intrinsic, "llvm.stackrestore") -+ void llvm_stackrestore(void* ptr); -+ -+ -+// The 'llvm.prefetch' intrinsic is a hint to the code generator to insert a -+// prefetch instruction if supported; otherwise, it is a noop. Prefetches have -+// no effect on the behavior of the program but can change its performance -+// characteristics. -+ -+pragma(intrinsic, "llvm.prefetch") -+ void llvm_prefetch(void* ptr, uint rw, uint locality); -+ -+ -+// The 'llvm.pcmarker' intrinsic is a method to export a Program Counter (PC) -+// in a region of code to simulators and other tools. The method is target -+// specific, but it is expected that the marker will use exported symbols to -+// transmit the PC of the marker. The marker makes no guarantees that it will -+// remain with any specific instruction after optimizations. It is possible -+// that the presence of a marker will inhibit optimizations. The intended use -+// is to be inserted after optimizations to allow correlations of simulation -+// runs. -+ -+pragma(intrinsic, "llvm.pcmarker") -+ void llvm_pcmarker(uint id); -+ -+ -+// The 'llvm.readcyclecounter' intrinsic provides access to the cycle counter -+// register (or similar low latency, high accuracy clocks) on those targets that -+// support it. On X86, it should map to RDTSC. On Alpha, it should map to RPCC. -+// As the backing counters overflow quickly (on the order of 9 seconds on -+// alpha), this should only be used for small timings. -+ -+pragma(intrinsic, "llvm.readcyclecounter") -+ ulong readcyclecounter(); -+ -+ -+ -+ -+// -+// STANDARD C LIBRARY INTRINSICS -+// -+ -+ -+// The 'llvm.memcpy.*' intrinsics copy a block of memory from the source -+// location to the destination location. -+// Note that, unlike the standard libc function, the llvm.memcpy.* intrinsics do -+// not return a value, and takes an extra alignment argument. -+ -+pragma(intrinsic, "llvm.memcpy.i#") -+ void llvm_memcpy(T)(void* dst, void* src, T len, uint alignment); -+ -+deprecated { -+ alias llvm_memcpy!(uint) llvm_memcpy_i32; -+ alias llvm_memcpy!(ulong) llvm_memcpy_i64; -+} -+ -+ -+// The 'llvm.memmove.*' intrinsics move a block of memory from the source -+// location to the destination location. It is similar to the 'llvm.memcpy' -+// intrinsic but allows the two memory locations to overlap. -+// Note that, unlike the standard libc function, the llvm.memmove.* intrinsics -+// do not return a value, and takes an extra alignment argument. -+ -+pragma(intrinsic, "llvm.memmove.i#") -+ void llvm_memmove(T)(void* dst, void* src, T len, uint alignment); -+ -+deprecated { -+ alias llvm_memmove!(uint) llvm_memmove_i32; -+ alias llvm_memmove!(ulong) llvm_memmove_i64; -+} -+ -+ -+// The 'llvm.memset.*' intrinsics fill a block of memory with a particular byte -+// value. -+// Note that, unlike the standard libc function, the llvm.memset intrinsic does -+// not return a value, and takes an extra alignment argument. -+ -+pragma(intrinsic, "llvm.memset.i#") -+ void llvm_memset(T)(void* dst, ubyte val, T len, uint alignment); -+ -+deprecated { -+ alias llvm_memset!(uint) llvm_memset_i32; -+ alias llvm_memset!(ulong) llvm_memset_i64; -+} -+ -+ -+// The 'llvm.sqrt' intrinsics return the sqrt of the specified operand, -+// returning the same value as the libm 'sqrt' functions would. Unlike sqrt in -+// libm, however, llvm.sqrt has undefined behavior for negative numbers other -+// than -0.0 (which allows for better optimization, because there is no need to -+// worry about errno being set). llvm.sqrt(-0.0) is defined to return -0.0 like -+// IEEE sqrt. -+ -+pragma(intrinsic, "llvm.sqrt.f#") -+ T llvm_sqrt(T)(T val); -+ -+deprecated { -+ alias llvm_sqrt!(float) llvm_sqrt_f32; -+ alias llvm_sqrt!(double) llvm_sqrt_f64; -+ alias llvm_sqrt!(real) llvm_sqrt_f80; // may not actually be .f80 -+} -+ -+ -+// The 'llvm.sin.*' intrinsics return the sine of the operand. -+ -+pragma(intrinsic, "llvm.sin.f#") -+ T llvm_sin(T)(T val); -+ -+deprecated { -+ alias llvm_sin!(float) llvm_sin_f32; -+ alias llvm_sin!(double) llvm_sin_f64; -+ alias llvm_sin!(real) llvm_sin_f80; // may not actually be .f80 -+} -+ -+ -+// The 'llvm.cos.*' intrinsics return the cosine of the operand. -+ -+pragma(intrinsic, "llvm.cos.f#") -+ T llvm_cos(T)(T val); -+ -+deprecated { -+ alias llvm_cos!(float) llvm_cos_f32; -+ alias llvm_cos!(double) llvm_cos_f64; -+ alias llvm_cos!(real) llvm_cos_f80; // may not actually be .f80 -+} -+ -+ -+// The 'llvm.powi.*' intrinsics return the first operand raised to the specified -+// (positive or negative) power. The order of evaluation of multiplications is -+// not defined. When a vector of floating point type is used, the second -+// argument remains a scalar integer value. -+ -+pragma(intrinsic, "llvm.powi.f#") -+ T llvm_powi(T)(T val, int power); -+ -+deprecated { -+ alias llvm_powi!(float) llvm_powi_f32; -+ alias llvm_powi!(double) llvm_powi_f64; -+ alias llvm_powi!(real) llvm_powi_f80; // may not actually be .f80 -+} -+ -+ -+// The 'llvm.pow.*' intrinsics return the first operand raised to the specified -+// (positive or negative) power. -+ -+pragma(intrinsic, "llvm.pow.f#") -+ T llvm_pow(T)(T val, T power); -+ -+deprecated { -+ alias llvm_pow!(float) llvm_pow_f32; -+ alias llvm_pow!(double) llvm_pow_f64; -+ alias llvm_pow!(real) llvm_pow_f80; // may not actually be .f80 -+} -+ -+ -+// -+// BIT MANIPULATION INTRINSICS -+// -+ -+// The 'llvm.bswap' family of intrinsics is used to byte swap integer values -+// with an even number of bytes (positive multiple of 16 bits). These are -+// useful for performing operations on data that is not in the target's native -+// byte order. -+ -+pragma(intrinsic, "llvm.bswap.i#.i#") -+ T llvm_bswap(T)(T val); -+ -+deprecated { -+ alias llvm_bswap!(ushort) llvm_bswap_i16; -+ alias llvm_bswap!(uint) llvm_bswap_i32; -+ alias llvm_bswap!(ulong) llvm_bswap_i64; -+} -+ -+ -+// The 'llvm.ctpop' family of intrinsics counts the number of bits set in a -+// value. -+ -+pragma(intrinsic, "llvm.ctpop.i#") -+ T llvm_ctpop(T)(T src); -+ -+deprecated { -+ alias llvm_ctpop!(ubyte) llvm_ctpop_i8; -+ alias llvm_ctpop!(ushort) llvm_ctpop_i16; -+ alias llvm_ctpop!(uint) llvm_ctpop_i32; -+ alias llvm_ctpop!(ulong) llvm_ctpop_i64; -+} -+ -+ -+// The 'llvm.ctlz' family of intrinsic functions counts the number of leading -+// zeros in a variable. -+ -+pragma(intrinsic, "llvm.ctlz.i#") -+ T llvm_ctlz(T)(T src); -+ -+deprecated { -+ alias llvm_ctlz!(ubyte) llvm_ctlz_i8; -+ alias llvm_ctlz!(ushort) llvm_ctlz_i16; -+ alias llvm_ctlz!(uint) llvm_ctlz_i32; -+ alias llvm_ctlz!(ulong) llvm_ctlz_i64; -+} -+ -+ -+// The 'llvm.cttz' family of intrinsic functions counts the number of trailing -+// zeros. -+ -+pragma(intrinsic, "llvm.cttz.i#") -+ T llvm_cttz(T)(T src); -+ -+deprecated { -+ alias llvm_cttz!(ubyte) llvm_cttz_i8; -+ alias llvm_cttz!(ushort) llvm_cttz_i16; -+ alias llvm_cttz!(uint) llvm_cttz_i32; -+ alias llvm_cttz!(ulong) llvm_cttz_i64; -+} -+ -+ -+// The 'llvm.part.select' family of intrinsic functions selects a range of bits -+// from an integer value and returns them in the same bit width as the original -+// value. -+ -+pragma(intrinsic, "llvm.part.select.i#") -+ T llvm_part_select(T)(T val, uint loBit, uint hiBit); -+ -+deprecated { -+ alias llvm_part_select!(ubyte) llvm_part_select_i; -+ alias llvm_part_select!(ushort) llvm_part_select_i; -+ alias llvm_part_select!(uint) llvm_part_select_i; -+ alias llvm_part_select!(ulong) llvm_part_select_i; -+} -+ -+ -+// The 'llvm.part.set' family of intrinsic functions replaces a range of bits -+// in an integer value with another integer value. It returns the integer with -+// the replaced bits. -+ -+// TODO -+// declare i17 @llvm.part.set.i17.i9 (i17 %val, i9 %repl, i32 %lo, i32 %hi) -+// declare i29 @llvm.part.set.i29.i9 (i29 %val, i9 %repl, i32 %lo, i32 %hi) -+ -+ -+ -+ -+// -+// ATOMIC OPERATIONS AND SYNCHRONIZATION INTRINSICS -+// -+ -+// The llvm.memory.barrier intrinsic guarantees ordering between specific -+// pairs of memory access types. -+ -+pragma(intrinsic, "llvm.memory.barrier") -+ void llvm_memory_barrier(bool ll, bool ls, bool sl, bool ss, bool device); -+ -+// This loads a value in memory and compares it to a given value. If they are -+// equal, it stores a new value into the memory. -+ -+pragma(intrinsic, "llvm.atomic.cmp.swap.i#.p0i#") -+ T llvm_atomic_cmp_swap(T)(shared T* ptr, T cmp, T val); -+ -+// This intrinsic loads the value stored in memory at ptr and yields the value -+// from memory. It then stores the value in val in the memory at ptr. -+ -+pragma(intrinsic, "llvm.atomic.swap.i#.p0i#") -+ T llvm_atomic_swap(T)(T* ptr, T val); -+ -+// This intrinsic adds delta to the value stored in memory at ptr. It yields -+// the original value at ptr. -+ -+pragma(intrinsic, "llvm.atomic.load.add.i#.p0i#") -+ T llvm_atomic_load_add(T)(shared const T* ptr, T val); -+ -+// This intrinsic subtracts delta to the value stored in memory at ptr. It -+// yields the original value at ptr. -+ -+pragma(intrinsic, "llvm.atomic.load.sub.i#.p0i#") -+ T llvm_atomic_load_sub(T)(T* ptr, T val); -+ -+// These intrinsics bitwise the operation (and, nand, or, xor) delta to the -+// value stored in memory at ptr. It yields the original value at ptr. -+ -+pragma(intrinsic, "llvm.atomic.load.and.i#.p0i#") -+ T llvm_atomic_load_and(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.nand.i#.p0i#") -+ T llvm_atomic_load_nand(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.or.i#.p0i#") -+ T llvm_atomic_load_or(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.xor.i#.p0i#") -+ T llvm_atomic_load_xor(T)(T* ptr, T val); -+ -+// These intrinsics takes the signed or unsigned minimum or maximum of delta -+// and the value stored in memory at ptr. It yields the original value at ptr. -+ -+pragma(intrinsic, "llvm.atomic.load.max.i#.p0i#") -+ T llvm_atomic_load_max(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.min.i#.p0i#") -+ T llvm_atomic_load_min(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.umax.i#.p0i#") -+ T llvm_atomic_load_umax(T)(T* ptr, T val); -+ -+pragma(intrinsic, "llvm.atomic.load.umin.i#.p0i#") -+ T llvm_atomic_load_umin(T)(T* ptr, T val); -+ -+ -+// -+// ARITHMETIC-WITH-OVERFLOW INTRINSICS -+// -+ -+struct OverflowRet(T) { -+ static assert(is(T : int), T.stringof ~ " is not an integer type!"); -+ T result; -+ bool overflow; -+} -+ -+// Signed and unsigned addition -+pragma(intrinsic, "llvm.sadd.with.overflow.i#") -+ OverflowRet!(T) llvm_sadd_with_overflow(T)(T lhs, T rhs); -+ -+pragma(intrinsic, "llvm.uadd.with.overflow.i#") -+ OverflowRet!(T) llvm_uadd_with_overflow(T)(T lhs, T rhs); -+ -+ -+// Signed and unsigned subtraction -+pragma(intrinsic, "llvm.ssub.with.overflow.i#") -+ OverflowRet!(T) llvm_ssub_with_overflow(T)(T lhs, T rhs); -+ -+pragma(intrinsic, "llvm.usub.with.overflow.i#") -+ OverflowRet!(T) llvm_usub_with_overflow(T)(T lhs, T rhs); -+ -+ -+// Signed and unsigned multiplication -+pragma(intrinsic, "llvm.smul.with.overflow.i#") -+ OverflowRet!(T) llvm_smul_with_overflow(T)(T lhs, T rhs); -+ -+/* Note: LLVM documentations says: -+ * Warning: 'llvm.umul.with.overflow' is badly broken. -+ * It is actively being fixed, but it should not currently be used! -+ * -+ * See: http://llvm.org/docs/LangRef.html#int_umul_overflow -+ */ -+//pragma(intrinsic, "llvm.umul.with.overflow.i#") -+// OverflowRet!(T) llvm_umul_with_overflow(T)(T lhs, T rhs); -+ -+ -+// -+// GENERAL INTRINSICS -+// -+ -+ -+// This intrinsics is lowered to the target dependent trap instruction. If the -+// target does not have a trap instruction, this intrinsic will be lowered to -+// the call of the abort() function. -+ -+pragma(intrinsic, "llvm.trap") -+ void llvm_trap(); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/llvmasm.di druntime/import/ldc/llvmasm.di ---- druntime-old/import/ldc/llvmasm.di 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/import/ldc/llvmasm.di 2010-09-30 22:10:37.000000000 +0400 -@@ -0,0 +1,17 @@ -+module ldc.llvmasm; -+ -+struct __asmtuple_t(T...) -+{ -+ T v; -+} -+ -+pragma(llvm_inline_asm) -+{ -+ void __asm( )(char[] asmcode, char[] constraints, ...); -+ T __asm(T)(char[] asmcode, char[] constraints, ...); -+ -+ template __asmtuple(T...) -+ { -+ __asmtuple_t!(T) __asmtuple(char[] asmcode, char[] constraints, ...); -+ } -+} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/vararg.d druntime/import/ldc/vararg.d ---- druntime-old/import/ldc/vararg.d 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/import/ldc/vararg.d 2010-09-30 22:10:37.000000000 +0400 -@@ -0,0 +1,43 @@ -+/* -+ * This module holds the implementation of special vararg templates for D style var args. -+ * -+ * Provides the functions tango.core.Vararg expects to be present! -+ */ -+ -+module ldc.Vararg; -+ -+// Check for the right compiler -+version(LDC) -+{ -+ // OK -+} -+else -+{ -+ static assert(false, "This module is only valid for LDC"); -+} -+ -+alias void* va_list; -+ -+void va_start(T) ( out va_list ap, inout T parmn ) -+{ -+ // not needed ! -+} -+ -+T va_arg(T)(ref va_list vp) -+{ -+ T* arg = cast(T*) vp; -+ // ldc always aligns to size_t.sizeof in vararg lists -+ vp = cast(va_list) ( cast(void*) vp + ( ( T.sizeof + size_t.sizeof - 1 ) & ~( size_t.sizeof - 1 ) ) ); -+ return *arg; -+} -+ -+void va_end( va_list ap ) -+{ -+ // not needed ! -+} -+ -+void va_copy( out va_list dst, va_list src ) -+{ -+ // seems pretty useless ! -+ dst = src; -+} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/object.di druntime/import/object.di ---- druntime-old/import/object.di 2010-09-03 12:28:52.000000000 +0400 -+++ druntime/import/object.di 2010-10-05 12:47:24.873150000 +0400 -@@ -130,7 +130,7 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/import/object.di druntime/import/object.di +--- druntime-orig/import/object.di 2010-09-03 12:28:52.000000000 +0400 ++++ druntime/import/object.di 2010-10-27 00:22:27.444925001 +0400 +@@ -130,7 +130,7 @@ Interface[] interfaces; TypeInfo_Class base; void* destructor; @@ -528,7 +10,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. uint m_flags; // 1: // is IUnknown or is derived from IUnknown // 2: // has no possible pointers into GC memory -@@ -140,7 +140,7 @@ +@@ -140,7 +140,7 @@ // 32: // has typeinfo member void* deallocator; OffsetTypeInfo[] m_offTi; @@ -537,7 +19,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. const(MemberInfo[]) function(string) xgetMembers; static TypeInfo_Class find(in char[] classname); -@@ -179,7 +179,7 @@ +@@ -179,7 +179,7 @@ class TypeInfo_Const : TypeInfo { @@ -546,7 +28,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } class TypeInfo_Invariant : TypeInfo_Const -@@ -288,7 +288,6 @@ +@@ -288,7 +288,6 @@ interface TraceInfo { int opApply(scope int delegate(ref char[])); @@ -554,190 +36,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } string msg; -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/std/intrinsic.di druntime/import/std/intrinsic.di ---- druntime-old/import/std/intrinsic.di 2010-08-05 05:39:08.000000000 +0400 -+++ druntime/import/std/intrinsic.di 1970-01-01 03:00:00.000000000 +0300 -@@ -1,176 +0,0 @@ --/** -- * These functions are built-in intrinsics to the compiler. -- * -- * Intrinsic functions are functions built in to the compiler, usually to take -- * advantage of specific CPU features that are inefficient to handle via -- * external functions. The compiler's optimizer and code generator are fully -- * integrated in with intrinsic functions, bringing to bear their full power on -- * them. This can result in some surprising speedups. -- * -- * Copyright: Public Domain -- * License: Public Domain -- * Authors: Walter Bright -- */ --module std.intrinsic; -- -- --/** -- * Scans the bits in v starting with bit 0, looking -- * for the first set bit. -- * Returns: -- * The bit number of the first bit set. -- * The return value is undefined if v is zero. -- */ --pure nothrow int bsf( uint v ); -- -- --/** -- * Scans the bits in v from the most significant bit -- * to the least significant bit, looking -- * for the first set bit. -- * Returns: -- * The bit number of the first bit set. -- * The return value is undefined if v is zero. -- * Example: -- * --- -- * import std.intrinsic; -- * -- * int main() -- * { -- * uint v; -- * int x; -- * -- * v = 0x21; -- * x = bsf(v); -- * printf("bsf(x%x) = %d\n", v, x); -- * x = bsr(v); -- * printf("bsr(x%x) = %d\n", v, x); -- * return 0; -- * } -- * --- -- * Output: -- * bsf(x21) = 0
-- * bsr(x21) = 5 -- */ --pure nothrow int bsr( uint v ); -- -- --/** -- * Tests the bit. -- */ --pure nothrow int bt( in uint* p, uint bitnum ); -- -- --/** -- * Tests and complements the bit. -- */ --nothrow int btc( uint* p, uint bitnum ); -- -- --/** -- * Tests and resets (sets to 0) the bit. -- */ --nothrow int btr( uint* p, uint bitnum ); -- -- --/** -- * Tests and sets the bit. -- * Params: -- * p = a non-NULL pointer to an array of uints. -- * index = a bit number, starting with bit 0 of p[0], -- * and progressing. It addresses bits like the expression: ----- --p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) ----- -- * Returns: -- * A non-zero value if the bit was set, and a zero -- * if it was clear. -- * -- * Example: -- * --- --import std.intrinsic; -- --int main() --{ -- uint array[2]; -- -- array[0] = 2; -- array[1] = 0x100; -- -- printf("btc(array, 35) = %d\n", btc(array, 35)); -- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -- -- printf("btc(array, 35) = %d\n", btc(array, 35)); -- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -- -- printf("bts(array, 35) = %d\n", bts(array, 35)); -- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -- -- printf("btr(array, 35) = %d\n", btr(array, 35)); -- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -- -- printf("bt(array, 1) = %d\n", bt(array, 1)); -- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -- -- return 0; --} -- * --- -- * Output: --
--btc(array, 35) = 0
--array = [0]:x2, [1]:x108
--btc(array, 35) = -1
--array = [0]:x2, [1]:x100
--bts(array, 35) = 0
--array = [0]:x2, [1]:x108
--btr(array, 35) = -1
--array = [0]:x2, [1]:x100
--bt(array, 1) = -1
--array = [0]:x2, [1]:x100
--
-- */ --nothrow int bts( uint* p, uint bitnum ); -- -- --/** -- * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes -- * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 -- * becomes byte 0. -- */ --pure nothrow uint bswap( uint v ); -- -- --/** -- * Reads I/O port at port_address. -- */ --nothrow ubyte inp( uint port_address ); -- -- --/** -- * ditto -- */ --nothrow ushort inpw( uint port_address ); -- -- --/** -- * ditto -- */ --nothrow uint inpl( uint port_address ); -- -- --/** -- * Writes and returns value to I/O port at port_address. -- */ --nothrow ubyte outp( uint port_address, ubyte value ); -- -- --/** -- * ditto -- */ --nothrow ushort outpw( uint port_address, ushort value ); -- -- --/** -- * ditto -- */ --nothrow uint outpl( uint port_address, uint value ); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/core/atomic.d druntime/src/core/atomic.d ---- druntime-old/src/core/atomic.d 2010-09-03 12:28:52.000000000 +0400 -+++ druntime/src/core/atomic.d 2010-10-05 15:55:10.893150001 +0400 -@@ -89,6 +89,117 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/atomic.d druntime/src/core/atomic.d +--- druntime-orig/src/core/atomic.d 2010-09-03 12:28:52.000000000 +0400 ++++ druntime/src/core/atomic.d 2010-10-05 15:55:10.893150001 +0400 +@@ -89,6 +89,117 @@ return false; } } @@ -855,7 +157,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. else version( AsmX86_32 ) { T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) -@@ -396,6 +507,12 @@ +@@ -396,6 +507,12 @@ } } } @@ -868,22 +170,38 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. else version( AsmX86_64 ) { T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gc.d druntime/src/gc/gc.d ---- druntime-old/src/gc/gc.d 2010-08-05 05:39:08.000000000 +0400 -+++ druntime/src/gc/gc.d 2010-10-04 16:54:06.837685001 +0400 -@@ -100,7 +100,7 @@ - version (GCCLASS) - { void* p; - ClassInfo ci = GC.classinfo; -- -+ - p = malloc(ci.init.length); - (cast(byte*)p)[0 .. ci.init.length] = ci.init[]; - _gc = cast(GC)p; -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcbits.d druntime/src/gc/gcbits.d ---- druntime-old/src/gc/gcbits.d 2010-08-08 04:10:24.000000000 +0400 -+++ druntime/src/gc/gcbits.d 2010-10-01 20:49:51.268892001 +0400 -@@ -26,6 +26,10 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/stdc/math.d druntime/src/core/stdc/math.d +--- druntime-orig/src/core/stdc/math.d 2010-09-03 12:28:52.000000000 +0400 ++++ druntime/src/core/stdc/math.d 2010-10-26 16:47:04.036925000 +0400 +@@ -17,6 +17,7 @@ + + extern (C): + nothrow: ++pure: // LDC + + alias float float_t; + alias double double_t; +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/core/stdc/stdlib.d druntime/src/core/stdc/stdlib.d +--- druntime-orig/src/core/stdc/stdlib.d 2010-08-05 05:39:08.000000000 +0400 ++++ druntime/src/core/stdc/stdlib.d 2010-10-26 19:26:03.996925001 +0400 +@@ -92,3 +92,13 @@ + { + void* alloca(size_t size); // non-standard + } ++else version( LDC ) ++{ ++ pragma(alloca) ++ void* alloca(size_t size); ++} ++else version( GNU ) ++{ ++ private import gcc.builtins; ++ alias gcc.builtins.__builtin_alloca alloca; ++} +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/gc/gcbits.d druntime/src/gc/gcbits.d +--- druntime-orig/src/gc/gcbits.d 2010-08-08 04:10:24.000000000 +0400 ++++ druntime/src/gc/gcbits.d 2010-10-01 20:49:51.268892001 +0400 +@@ -26,6 +26,10 @@ { version = bitops; } @@ -894,10 +212,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. else version (GNU) { // use the unoptimized version -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcx.d druntime/src/gc/gcx.d ---- druntime-old/src/gc/gcx.d 2010-08-27 01:23:26.000000000 +0400 -+++ druntime/src/gc/gcx.d 2010-10-07 22:27:41.879253001 +0400 -@@ -1464,7 +1464,8 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/gc/gcx.d druntime/src/gc/gcx.d +--- druntime-orig/src/gc/gcx.d 2010-08-27 01:23:26.000000000 +0400 ++++ druntime/src/gc/gcx.d 2010-10-07 22:27:41.879253001 +0400 +@@ -1464,7 +1464,8 @@ void initialize() @@ -907,7 +225,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. (cast(byte*)&this)[0 .. Gcx.sizeof] = 0; stackBottom = cast(char*)&dummy; -@@ -2200,7 +2201,7 @@ +@@ -2200,7 +2201,7 @@ if ((cast(size_t)p & ~(PAGESIZE-1)) == pcache) continue; @@ -916,7 +234,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. if (pool) { size_t offset = cast(size_t)(p - pool.baseAddr); -@@ -2270,80 +2271,129 @@ +@@ -2270,80 +2271,129 @@ __builtin_unwind_init(); sp = & sp; } @@ -1114,7 +432,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. return result; } -@@ -2357,7 +2407,7 @@ +@@ -2357,7 +2407,7 @@ Pool* pool; debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n"); @@ -1123,10 +441,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. thread_suspendAll(); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/object_.d druntime/src/object_.d ---- druntime-old/src/object_.d 2010-09-03 12:28:52.000000000 +0400 -+++ druntime/src/object_.d 2010-10-05 14:50:34.733150002 +0400 -@@ -1073,7 +1073,7 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/object_.d druntime/src/object_.d +--- druntime-orig/src/object_.d 2010-10-26 18:47:41.840925001 +0400 ++++ druntime/src/object_.d 2010-10-26 19:27:09.224925000 +0400 +@@ -1073,7 +1073,7 @@ abstract class MemberInfo { @@ -1135,7 +453,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } class MemberInfo_field : MemberInfo -@@ -1663,7 +1663,6 @@ +@@ -1663,7 +1663,6 @@ { int len = 0; ModuleReference *mr; @@ -1143,19 +461,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. for (mr = _Dmodule_ref; mr; mr = mr.next) len++; _moduleinfo_array = new ModuleInfo*[len]; -@@ -1802,7 +1801,10 @@ - { - debug(PRINTF) printf("_moduleTlsCtor()\n"); - -- void* p = alloca(_moduleinfo_array.length * ubyte.sizeof); -+ version( DMD ) -+ void* p = alloca(_moduleinfo_array.length * ubyte.sizeof); -+ else -+ void* p = malloc(_moduleinfo_array.length * ubyte.sizeof); - auto flags = cast(ubyte[])p[0 .. _moduleinfo_array.length]; - flags[] = 0; - -@@ -2025,7 +2027,6 @@ +@@ -2025,7 +2024,6 @@ _d_monitor_create(h); m = getMonitor(h); } @@ -1163,7 +469,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. IMonitor i = m.impl; if (i is null) -@@ -2124,7 +2125,7 @@ +@@ -2124,7 +2122,7 @@ size_t _aaLen(void* p); void* _aaGet(void** pp, TypeInfo keyti, size_t valuesize, ...); void* _aaGetRvalue(void* p, TypeInfo keyti, size_t valuesize, ...); @@ -1172,25 +478,92 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. void _aaDel(void* p, TypeInfo keyti, ...); void[] _aaValues(void* p, size_t keysize, size_t valuesize); void[] _aaKeys(void* p, size_t keysize, size_t valuesize); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/adi.d druntime/src/rt/adi.d ---- druntime-old/src/rt/adi.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/adi.d 2010-10-07 14:32:52.911253001 +0400 -@@ -35,6 +35,14 @@ +@@ -2169,7 +2167,7 @@ + return *cast(Key[]*) &a; + } + +- int opApply(scope int delegate(ref Key, ref Value) dg) ++ int opApply(scope int delegate(ref Key, ref const Value) dg) + { + return _aaApply2(p, aligntsize(Key.sizeof), cast(_dg2_t)dg); + } +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/aaA.d druntime/src/rt/aaA.d +--- druntime-orig/src/rt/aaA.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/aaA.d 2010-10-29 10:48:36.165035001 +0400 +@@ -204,7 +204,7 @@ + * Add entry for key if it is not already there. + */ + +-void* _aaGet(AA* aa, TypeInfo keyti, size_t valuesize, ...) ++void* _aaGet(AA* aa, TypeInfo keyti, size_t valuesize, void *pkey) + in + { + assert(aa); +@@ -218,7 +218,6 @@ + } + body + { +- auto pkey = cast(void *)(&valuesize + 1); + size_t i; + aaA *e; + //printf("keyti = %p\n", keyti); +@@ -274,13 +273,12 @@ + * Returns null if it is not already there. + */ + +-void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, ...) ++void* _aaGetRvalue(AA aa, TypeInfo keyti, size_t valuesize, void *pkey) + { + //printf("_aaGetRvalue(valuesize = %u)\n", valuesize); + if (!aa.a) + return null; + +- auto pkey = cast(void *)(&valuesize + 1); + auto keysize = aligntsize(keyti.tsize()); + auto len = aa.a.b.length; + +@@ -312,7 +310,7 @@ + * !=null in aa, return pointer to value + */ + +-void* _aaIn(AA aa, TypeInfo keyti, ...) ++void* _aaIn(AA aa, TypeInfo keyti, void *pkey) + in + { + } +@@ -324,8 +322,6 @@ + { + if (aa.a) + { +- auto pkey = cast(void *)(&keyti + 1); +- + //printf("_aaIn(), .length = %d, .ptr = %x\n", aa.a.length, cast(uint)aa.a.ptr); + auto len = aa.a.b.length; + +@@ -357,9 +353,8 @@ + * If key is not in aa[], do nothing. + */ + +-void _aaDel(AA aa, TypeInfo keyti, ...) ++void _aaDel(AA aa, TypeInfo keyti, void *pkey) + { +- auto pkey = cast(void *)(&keyti + 1); + aaA *e; + + if (aa.a && aa.a.b.length) +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/adi.d druntime/src/rt/adi.d +--- druntime-orig/src/rt/adi.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/adi.d 2010-10-29 11:49:52.065035002 +0400 +@@ -35,6 +35,8 @@ extern (C) void gc_free( void* p ); } -+version (DMD) -+{ -+ version (X86) -+ { -+ version = DMD_X86; -+ } -+} -+ ++version (DMD) version (X86) ++ version = DMD_X86; struct Array { -@@ -48,7 +56,7 @@ +@@ -48,7 +50,7 @@ * reversed. */ @@ -1199,7 +572,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { if (a.length > 1) { -@@ -108,7 +116,7 @@ +@@ -108,7 +110,7 @@ hi = hi - 1 + (stridehi - stridelo); } } @@ -1208,7 +581,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } unittest -@@ -143,7 +151,7 @@ +@@ -143,7 +145,7 @@ * reversed. */ @@ -1217,7 +590,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { if (a.length > 1) { -@@ -201,7 +209,7 @@ +@@ -201,7 +203,7 @@ hi = hi - 1 + (stridehi - stridelo); } } @@ -1226,7 +599,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } unittest -@@ -225,10 +233,10 @@ +@@ -225,10 +227,10 @@ * Support for array.reverse property. */ @@ -1239,21 +612,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } body { -@@ -243,10 +251,10 @@ - tmp = buffer.ptr; - if (szelem > 16) - { -- //version (Windows) -+ version (Windows) - tmp = cast(byte*) alloca(szelem); -- //else -- //tmp = gc_malloc(szelem); -+ else -+ tmp = cast(byte*) gc_malloc(szelem); - } - - for (; lo < hi; lo += szelem, hi -= szelem) -@@ -267,7 +275,7 @@ +@@ -267,7 +269,7 @@ //gc_free(tmp); } } @@ -1262,7 +621,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } unittest -@@ -311,7 +319,7 @@ +@@ -311,7 +313,7 @@ * Sort array of chars. */ @@ -1271,7 +630,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { if (a.length > 1) { -@@ -326,14 +334,14 @@ +@@ -326,14 +328,14 @@ } delete da; } @@ -1288,7 +647,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { if (a.length > 1) { -@@ -348,7 +356,7 @@ +@@ -348,7 +350,7 @@ } delete da; } @@ -1297,7 +656,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } /*************************************** -@@ -358,7 +366,7 @@ +@@ -358,7 +360,7 @@ * 0 not equal */ @@ -1306,7 +665,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); if (a1.length != a2.length) -@@ -379,7 +387,7 @@ +@@ -379,7 +381,7 @@ return 1; // equal } @@ -1315,7 +674,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); if (a1.length != a2.length) -@@ -405,7 +413,7 @@ +@@ -405,7 +407,7 @@ * Support for array compare test. */ @@ -1324,7 +683,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { debug(adi) printf("adCmp()\n"); auto len = a1.length; -@@ -435,7 +443,7 @@ +@@ -435,7 +437,7 @@ return (a1.length > a2.length) ? 1 : -1; } @@ -1333,7 +692,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); return ti.compare(&a1, &a2); -@@ -461,9 +469,9 @@ +@@ -461,9 +463,9 @@ * Support for array compare test. */ @@ -1345,7 +704,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { asm { naked ; -@@ -569,8 +577,8 @@ +@@ -569,8 +571,8 @@ ret ; } @@ -1356,12077 +715,35 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { int len; int c; -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayInit.d druntime/src/rt/arrayInit.d ---- druntime-old/src/rt/arrayInit.d 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/src/rt/arrayInit.d 2010-10-03 20:41:52.223624001 +0400 -@@ -0,0 +1,155 @@ -+private import ldc.intrinsics; -+ -+extern(C): -+ -+int memcmp(void*,void*,size_t); -+size_t strlen(char*); -+ -+version(LLVM64) -+alias llvm_memcpy_i64 llvm_memcpy; -+else -+alias llvm_memcpy_i32 llvm_memcpy; -+ -+// per-element array init routines -+ -+void _d_array_init_i16(ushort* a, size_t n, ushort v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_i32(uint* a, size_t n, uint v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_i64(ulong* a, size_t n, ulong v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_float(float* a, size_t n, float v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_double(double* a, size_t n, double v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_real(real* a, size_t n, real v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_cfloat(cfloat* a, size_t n, cfloat v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_cdouble(cdouble* a, size_t n, cdouble v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_creal(creal* a, size_t n, creal v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_pointer(void** a, size_t n, void* v) -+{ -+ auto p = a; -+ auto end = a+n; -+ while (p !is end) -+ *p++ = v; -+} -+ -+void _d_array_init_mem(void* a, size_t na, void* v, size_t nv) -+{ -+ auto p = a; -+ auto end = a + na*nv; -+ while (p !is end) { -+ llvm_memcpy(p,v,nv,0); -+ p += nv; -+ } -+} -+ -+/* -+void _d_array_init(TypeInfo ti, void* a) -+{ -+ auto initializer = ti.next.init(); -+ auto isize = initializer.length; -+ auto q = initializer.ptr; -+ -+ if (isize == 1) -+ memset(p, *cast(ubyte*)q, size); -+ else if (isize == int.sizeof) -+ { -+ int init = *cast(int*)q; -+ size /= int.sizeof; -+ for (size_t u = 0; u < size; u++) -+ { -+ (cast(int*)p)[u] = init; -+ } -+ } -+ else -+ { -+ for (size_t u = 0; u < size; u += isize) -+ { -+ memcpy(p + u, q, isize); -+ } -+ } -+}*/ -+ -+// for array cast -+size_t _d_array_cast_len(size_t len, size_t elemsz, size_t newelemsz) -+{ -+ if (newelemsz == 1) { -+ return len*elemsz; -+ } -+ else if ((len*elemsz) % newelemsz) { -+ throw new Exception("Bad array cast"); -+ } -+ return (len*elemsz)/newelemsz; -+} -+ -+// slice copy when assertions are enabled -+void _d_array_slice_copy(void* dst, size_t dstlen, void* src, size_t srclen) -+{ -+ assert(dst); -+ assert(src); -+ if (dstlen != srclen) -+ throw new Exception("lengths don't match for array copy"); -+ else if (dst+dstlen <= src || src+srclen <= dst) -+ llvm_memcpy(dst, src, dstlen, 0); -+ else -+ throw new Exception("overlapping array copy"); -+} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayassign.d druntime/src/rt/arrayassign.d ---- druntime-old/src/rt/arrayassign.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arrayassign.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,186 +0,0 @@ --/** -- * Implementation of array assignment support routines. -- * -- * Copyright: Copyright Digital Mars 2000 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright -- * -- * Copyright Digital Mars 2000 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arrayassign; -- --private --{ -- import rt.util.string; -- import core.stdc.string; -- import core.stdc.stdlib; -- debug(PRINTF) import core.stdc.stdio; --} -- --/** -- * Does array assignment (not construction) from another -- * array of the same element type. -- * ti is the element type. -- * Handles overlapping copies. -- */ --extern (C) void[] _d_arrayassign(TypeInfo ti, void[] from, void[] to) --{ -- debug(PRINTF) printf("_d_arrayassign(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize()); -- -- if (to.length != from.length) -- { -- char[10] tmp = void; -- string msg = "lengths don't match for array copy,"c; -- msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length); -- throw new Exception(msg); -- } -- -- auto element_size = ti.tsize(); -- -- /* Need a temporary buffer tmp[] big enough to hold one element -- */ -- void[16] buf = void; -- void[] tmp; -- if (element_size > buf.sizeof) -- tmp = alloca(element_size)[0 .. element_size]; -- else -- tmp = buf; -- -- -- if (to.ptr <= from.ptr) -- { -- foreach (i; 0 .. to.length) -- { -- void* pto = to.ptr + i * element_size; -- void* pfrom = from.ptr + i * element_size; -- memcpy(tmp.ptr, pto, element_size); -- memcpy(pto, pfrom, element_size); -- ti.postblit(pto); -- ti.destroy(tmp.ptr); -- } -- } -- else -- { -- for (int i = to.length; i--; ) -- { -- void* pto = to.ptr + i * element_size; -- void* pfrom = from.ptr + i * element_size; -- memcpy(tmp.ptr, pto, element_size); -- memcpy(pto, pfrom, element_size); -- ti.postblit(pto); -- ti.destroy(tmp.ptr); -- } -- } -- return to; --} -- --/** -- * Does array initialization (not assignment) from another -- * array of the same element type. -- * ti is the element type. -- */ --extern (C) void[] _d_arrayctor(TypeInfo ti, void[] from, void[] to) --{ -- debug(PRINTF) printf("_d_arrayctor(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize()); -- -- if (to.length != from.length) -- { -- char[10] tmp = void; -- string msg = "lengths don't match for array initialization,"c; -- msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length); -- throw new Exception(msg); -- } -- -- auto element_size = ti.tsize(); -- -- int i; -- try -- { -- for (i = 0; i < to.length; i++) -- { -- // Copy construction is defined as bit copy followed by postblit. -- memcpy(to.ptr + i * element_size, from.ptr + i * element_size, element_size); -- ti.postblit(to.ptr + i * element_size); -- } -- } -- catch (Object o) -- { -- /* Destroy, in reverse order, what we've constructed so far -- */ -- while (i--) -- { -- ti.destroy(to.ptr + i * element_size); -- } -- -- throw o; -- } -- return to; --} -- -- --/** -- * Do assignment to an array. -- * p[0 .. count] = value; -- */ --extern (C) void* _d_arraysetassign(void* p, void* value, int count, TypeInfo ti) --{ -- void* pstart = p; -- -- auto element_size = ti.tsize(); -- -- //Need a temporary buffer tmp[] big enough to hold one element -- void[16] buf = void; -- void[] tmp; -- if (element_size > buf.sizeof) -- { -- tmp = alloca(element_size)[0 .. element_size]; -- } -- else -- tmp = buf; -- -- foreach (i; 0 .. count) -- { -- memcpy(tmp.ptr, p, element_size); -- memcpy(p, value, element_size); -- ti.postblit(p); -- ti.destroy(tmp.ptr); -- p += element_size; -- } -- return pstart; --} -- --/** -- * Do construction of an array. -- * ti[count] p = value; -- */ --extern (C) void* _d_arraysetctor(void* p, void* value, int count, TypeInfo ti) --{ -- void* pstart = p; -- auto element_size = ti.tsize(); -- -- try -- { -- foreach (i; 0 .. count) -- { -- // Copy construction is defined as bit copy followed by postblit. -- memcpy(p, value, element_size); -- ti.postblit(p); -- p += element_size; -- } -- } -- catch (Object o) -- { -- // Destroy, in reverse order, what we've constructed so far -- while (p > pstart) -- { -- p -= element_size; -- ti.destroy(p); -- } -- -- throw o; -- } -- return pstart; --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraybyte.d druntime/src/rt/arraybyte.d ---- druntime-old/src/rt/arraybyte.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arraybyte.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,1893 +0,0 @@ --/** -- * Contains SSE2 and MMX versions of certain operations for char, byte, and -- * ubyte ('a', 'g' and 'h' suffixes). -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arraybyte; -- --import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 4; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.amd3dnow amd3dnow; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --alias byte T; -- --extern (C): -- --/* ======================================================================== */ -- -- --/*********************** -- * Computes: -- * a[] = b[] + value -- */ -- --T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_g(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_g(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpAddSliceAssign_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1088% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startaddsse2u: -- add ESI, 64; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM2, [EAX+32]; -- movdqu XMM3, [EAX+48]; -- add EAX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM4; -- paddb XMM2, XMM4; -- paddb XMM3, XMM4; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startaddsse2a: -- add ESI, 64; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM2, [EAX+32]; -- movdqa XMM3, [EAX+48]; -- add EAX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM4; -- paddb XMM2, XMM4; -- paddb XMM3, XMM4; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 1000% faster -- if (mmx() && a.length >= 32) -- { -- auto n = aptr + (a.length & ~31); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 4; -- startaddmmx: -- add ESI, 32; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- add EAX, 32; -- paddb MM0, MM4; -- paddb MM1, MM4; -- paddb MM2, MM4; -- paddb MM3, MM4; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startaddmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- /* trying to be fair and treat normal 32-bit cpu the same way as we do -- * the SIMD units, with unrolled asm. There's not enough registers, -- * really. -- */ -- else -- if (a.length >= 4) -- { -- -- auto n = aptr + (a.length & ~3); -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov CL, value; -- -- align 4; -- startadd386: -- add ESI, 4; -- mov DX, [EAX]; -- mov BX, [EAX+2]; -- add EAX, 4; -- add BL, CL; -- add BH, CL; -- add DL, CL; -- add DH, CL; -- mov [ESI -4], DX; -- mov [ESI+2 -4], BX; -- cmp ESI, EDI; -- jb startadd386; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ + value); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpAddSliceAssign_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_g(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_g(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceAddSliceAssign_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 5739% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- version (log) printf("\tsse2 unaligned\n"); -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 8; -- startaddlsse2u: -- add ESI, 64; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM2, [EAX+32]; -- movdqu XMM3, [EAX+48]; -- add EAX, 64; -- movdqu XMM4, [ECX]; -- movdqu XMM5, [ECX+16]; -- movdqu XMM6, [ECX+32]; -- movdqu XMM7, [ECX+48]; -- add ECX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM5; -- paddb XMM2, XMM6; -- paddb XMM3, XMM7; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddlsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- version (log) printf("\tsse2 aligned\n"); -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 8; -- startaddlsse2a: -- add ESI, 64; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM2, [EAX+32]; -- movdqa XMM3, [EAX+48]; -- add EAX, 64; -- movdqa XMM4, [ECX]; -- movdqa XMM5, [ECX+16]; -- movdqa XMM6, [ECX+32]; -- movdqa XMM7, [ECX+48]; -- add ECX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM5; -- paddb XMM2, XMM6; -- paddb XMM3, XMM7; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddlsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 4428% faster -- if (mmx() && a.length >= 32) -- { -- version (log) printf("\tmmx\n"); -- auto n = aptr + (a.length & ~31); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startaddlmmx: -- add ESI, 32; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- add EAX, 32; -- movq MM4, [ECX]; -- movq MM5, [ECX+8]; -- movq MM6, [ECX+16]; -- movq MM7, [ECX+24]; -- add ECX, 32; -- paddb MM0, MM4; -- paddb MM1, MM5; -- paddb MM2, MM6; -- paddb MM3, MM7; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startaddlmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- version (log) if (aptr < aend) printf("\tbase\n"); -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ + *cptr++); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += value -- */ -- --T[] _arrayExpSliceAddass_a(T[] a, T value) --{ -- return _arrayExpSliceAddass_g(a, value); --} -- --T[] _arrayExpSliceAddass_h(T[] a, T value) --{ -- return _arrayExpSliceAddass_g(a, value); --} -- --T[] _arrayExpSliceAddass_g(T[] a, T value) --{ -- //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1578% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- l |= (l << 16); -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startaddasssse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM2, [ESI+32]; -- movdqu XMM3, [ESI+48]; -- add ESI, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM4; -- paddb XMM2, XMM4; -- paddb XMM3, XMM4; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddasssse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startaddasssse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM2, [ESI+32]; -- movdqa XMM3, [ESI+48]; -- add ESI, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM4; -- paddb XMM2, XMM4; -- paddb XMM3, XMM4; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddasssse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 1721% faster -- if (mmx() && a.length >= 32) -- { -- -- auto n = aptr + (a.length & ~31); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 8; -- startaddassmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- add ESI, 32; -- paddb MM0, MM4; -- paddb MM1, MM4; -- paddb MM2, MM4; -- paddb MM3, MM4; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startaddassmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceAddass_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += b[] -- */ -- --T[] _arraySliceSliceAddass_a(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_g(a, b); --} -- --T[] _arraySliceSliceAddass_h(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_g(a, b); --} -- --T[] _arraySliceSliceAddass_g(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceAddass_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 4727% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startaddasslsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM2, [ESI+32]; -- movdqu XMM3, [ESI+48]; -- add ESI, 64; -- movdqu XMM4, [ECX]; -- movdqu XMM5, [ECX+16]; -- movdqu XMM6, [ECX+32]; -- movdqu XMM7, [ECX+48]; -- add ECX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM5; -- paddb XMM2, XMM6; -- paddb XMM3, XMM7; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddasslsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startaddasslsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM2, [ESI+32]; -- movdqa XMM3, [ESI+48]; -- add ESI, 64; -- movdqa XMM4, [ECX]; -- movdqa XMM5, [ECX+16]; -- movdqa XMM6, [ECX+32]; -- movdqa XMM7, [ECX+48]; -- add ECX, 64; -- paddb XMM0, XMM4; -- paddb XMM1, XMM5; -- paddb XMM2, XMM6; -- paddb XMM3, XMM7; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startaddasslsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 3059% faster -- if (mmx() && a.length >= 32) -- { -- -- auto n = aptr + (a.length & ~31); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startaddasslmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- add ESI, 32; -- movq MM4, [ECX]; -- movq MM5, [ECX+8]; -- movq MM6, [ECX+16]; -- movq MM7, [ECX+24]; -- add ECX, 32; -- paddb MM0, MM4; -- paddb MM1, MM5; -- paddb MM2, MM6; -- paddb MM3, MM7; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startaddasslmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddass_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- -- --/*********************** -- * Computes: -- * a[] = b[] - value -- */ -- --T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_g(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_g(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMinSliceAssign_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1189% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubsse2u: -- add ESI, 64; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM2, [EAX+32]; -- movdqu XMM3, [EAX+48]; -- add EAX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM4; -- psubb XMM2, XMM4; -- psubb XMM3, XMM4; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubsse2a: -- add ESI, 64; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM2, [EAX+32]; -- movdqa XMM3, [EAX+48]; -- add EAX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM4; -- psubb XMM2, XMM4; -- psubb XMM3, XMM4; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 1079% faster -- if (mmx() && a.length >= 32) -- { -- auto n = aptr + (a.length & ~31); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 4; -- startsubmmx: -- add ESI, 32; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- add EAX, 32; -- psubb MM0, MM4; -- psubb MM1, MM4; -- psubb MM2, MM4; -- psubb MM3, MM4; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startsubmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really. -- else -- if (a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov CL, value; -- -- align 4; -- startsub386: -- add ESI, 4; -- mov DX, [EAX]; -- mov BX, [EAX+2]; -- add EAX, 4; -- sub BL, CL; -- sub BH, CL; -- sub DL, CL; -- sub DH, CL; -- mov [ESI -4], DX; -- mov [ESI+2 -4], BX; -- cmp ESI, EDI; -- jb startsub386; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ - value); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMinSliceAssign_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] = b[] - 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(b[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, c[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = value - b[] -- */ -- --T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_g(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_g(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arrayExpSliceMinSliceAssign_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 8748% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubrsse2u: -- add ESI, 64; -- movdqa XMM5, XMM4; -- movdqa XMM6, XMM4; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- psubb XMM5, XMM0; -- psubb XMM6, XMM1; -- movdqu [ESI -64], XMM5; -- movdqu [ESI+16-64], XMM6; -- movdqa XMM5, XMM4; -- movdqa XMM6, XMM4; -- movdqu XMM2, [EAX+32]; -- movdqu XMM3, [EAX+48]; -- add EAX, 64; -- psubb XMM5, XMM2; -- psubb XMM6, XMM3; -- movdqu [ESI+32-64], XMM5; -- movdqu [ESI+48-64], XMM6; -- cmp ESI, EDI; -- jb startsubrsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubrsse2a: -- add ESI, 64; -- movdqa XMM5, XMM4; -- movdqa XMM6, XMM4; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- psubb XMM5, XMM0; -- psubb XMM6, XMM1; -- movdqa [ESI -64], XMM5; -- movdqa [ESI+16-64], XMM6; -- movdqa XMM5, XMM4; -- movdqa XMM6, XMM4; -- movdqa XMM2, [EAX+32]; -- movdqa XMM3, [EAX+48]; -- add EAX, 64; -- psubb XMM5, XMM2; -- psubb XMM6, XMM3; -- movdqa [ESI+32-64], XMM5; -- movdqa [ESI+48-64], XMM6; -- cmp ESI, EDI; -- jb startsubrsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 7397% faster -- if (mmx() && a.length >= 32) -- { -- auto n = aptr + (a.length & ~31); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 4; -- startsubrmmx: -- add ESI, 32; -- movq MM5, MM4; -- movq MM6, MM4; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- psubb MM5, MM0; -- psubb MM6, MM1; -- movq [ESI -32], MM5; -- movq [ESI+8 -32], MM6; -- movq MM5, MM4; -- movq MM6, MM4; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- add EAX, 32; -- psubb MM5, MM2; -- psubb MM6, MM3; -- movq [ESI+16-32], MM5; -- movq [ESI+24-32], MM6; -- cmp ESI, EDI; -- jb startsubrmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(value - *bptr++); -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinSliceAssign_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] = 6 - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(6 - b[i])) -- { -- printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_g(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_g(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 5756% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 8; -- startsublsse2u: -- add ESI, 64; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM2, [EAX+32]; -- movdqu XMM3, [EAX+48]; -- add EAX, 64; -- movdqu XMM4, [ECX]; -- movdqu XMM5, [ECX+16]; -- movdqu XMM6, [ECX+32]; -- movdqu XMM7, [ECX+48]; -- add ECX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM5; -- psubb XMM2, XMM6; -- psubb XMM3, XMM7; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsublsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 8; -- startsublsse2a: -- add ESI, 64; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM2, [EAX+32]; -- movdqa XMM3, [EAX+48]; -- add EAX, 64; -- movdqa XMM4, [ECX]; -- movdqa XMM5, [ECX+16]; -- movdqa XMM6, [ECX+32]; -- movdqa XMM7, [ECX+48]; -- add ECX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM5; -- psubb XMM2, XMM6; -- psubb XMM3, XMM7; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsublsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 4428% faster -- if (mmx() && a.length >= 32) -- { -- auto n = aptr + (a.length & ~31); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 8; -- startsublmmx: -- add ESI, 32; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- add EAX, 32; -- movq MM4, [ECX]; -- movq MM5, [ECX+8]; -- movq MM6, [ECX+16]; -- movq MM7, [ECX+24]; -- add ECX, 32; -- psubb MM0, MM4; -- psubb MM1, MM5; -- psubb MM2, MM6; -- psubb MM3, MM7; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startsublmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ - *cptr++); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= value -- */ -- --T[] _arrayExpSliceMinass_a(T[] a, T value) --{ -- return _arrayExpSliceMinass_g(a, value); --} -- --T[] _arrayExpSliceMinass_h(T[] a, T value) --{ -- return _arrayExpSliceMinass_g(a, value); --} -- --T[] _arrayExpSliceMinass_g(T[] a, T value) --{ -- //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1577% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- l |= (l << 16); -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubasssse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM2, [ESI+32]; -- movdqu XMM3, [ESI+48]; -- add ESI, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM4; -- psubb XMM2, XMM4; -- psubb XMM3, XMM4; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubasssse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 8; -- startsubasssse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM2, [ESI+32]; -- movdqa XMM3, [ESI+48]; -- add ESI, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM4; -- psubb XMM2, XMM4; -- psubb XMM3, XMM4; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubasssse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 1577% faster -- if (mmx() && a.length >= 32) -- { -- -- auto n = aptr + (a.length & ~31); -- -- uint l = cast(ubyte) value; -- l |= (l << 8); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 8; -- startsubassmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- add ESI, 32; -- psubb MM0, MM4; -- psubb MM1, MM4; -- psubb MM2, MM4; -- psubb MM3, MM4; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startsubassmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinass_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] -- */ -- --T[] _arraySliceSliceMinass_a(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_g(a, b); --} -- --T[] _arraySliceSliceMinass_h(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_g(a, b); --} -- --T[] _arraySliceSliceMinass_g(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMinass_g()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 4800% faster -- if (sse2() && a.length >= 64) -- { -- auto n = aptr + (a.length & ~63); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startsubasslsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM2, [ESI+32]; -- movdqu XMM3, [ESI+48]; -- add ESI, 64; -- movdqu XMM4, [ECX]; -- movdqu XMM5, [ECX+16]; -- movdqu XMM6, [ECX+32]; -- movdqu XMM7, [ECX+48]; -- add ECX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM5; -- psubb XMM2, XMM6; -- psubb XMM3, XMM7; -- movdqu [ESI -64], XMM0; -- movdqu [ESI+16-64], XMM1; -- movdqu [ESI+32-64], XMM2; -- movdqu [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubasslsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startsubasslsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM2, [ESI+32]; -- movdqa XMM3, [ESI+48]; -- add ESI, 64; -- movdqa XMM4, [ECX]; -- movdqa XMM5, [ECX+16]; -- movdqa XMM6, [ECX+32]; -- movdqa XMM7, [ECX+48]; -- add ECX, 64; -- psubb XMM0, XMM4; -- psubb XMM1, XMM5; -- psubb XMM2, XMM6; -- psubb XMM3, XMM7; -- movdqa [ESI -64], XMM0; -- movdqa [ESI+16-64], XMM1; -- movdqa [ESI+32-64], XMM2; -- movdqa [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsubasslsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 3107% faster -- if (mmx() && a.length >= 32) -- { -- -- auto n = aptr + (a.length & ~31); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 8; -- startsubasslmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- add ESI, 32; -- movq MM4, [ECX]; -- movq MM5, [ECX+8]; -- movq MM6, [ECX+16]; -- movq MM7, [ECX+24]; -- add ECX, 32; -- psubb MM0, MM4; -- psubb MM1, MM5; -- psubb MM2, MM6; -- psubb MM3, MM7; -- movq [ESI -32], MM0; -- movq [ESI+8 -32], MM1; -- movq [ESI+16-32], MM2; -- movq [ESI+24-32], MM3; -- cmp ESI, EDI; -- jb startsubasslmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinass_g unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycast.d druntime/src/rt/arraycast.d ---- druntime-old/src/rt/arraycast.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arraycast.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,94 +0,0 @@ --/** -- * Implementation of array cast support routines. -- * -- * Copyright: Copyright Digital Mars 2004 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, Sean Kelly -- * -- * Copyright Digital Mars 2004 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arraycast; -- --/****************************************** -- * Runtime helper to convert dynamic array of one -- * type to dynamic array of another. -- * Adjusts the length of the array. -- * Throws exception if new length is not aligned. -- */ -- --extern (C) -- --void[] _d_arraycast(size_t tsize, size_t fsize, void[] a) --{ -- auto length = a.length; -- -- auto nbytes = length * fsize; -- if (nbytes % tsize != 0) -- { -- throw new Exception("array cast misalignment"); -- } -- length = nbytes / tsize; -- *cast(size_t *)&a = length; // jam new length -- return a; --} -- --unittest --{ -- byte[int.sizeof * 3] b; -- int[] i; -- short[] s; -- -- i = cast(int[])b; -- assert(i.length == 3); -- -- s = cast(short[])b; -- assert(s.length == 6); -- -- s = cast(short[])i; -- assert(s.length == 6); --} -- --/****************************************** -- * Runtime helper to convert dynamic array of bits -- * dynamic array of another. -- * Adjusts the length of the array. -- * Throws exception if new length is not aligned. -- */ -- --version (none) --{ --extern (C) -- --void[] _d_arraycast_frombit(uint tsize, void[] a) --{ -- uint length = a.length; -- -- if (length & 7) -- { -- throw new Exception("bit[] array cast misalignment"); -- } -- length /= 8 * tsize; -- *cast(size_t *)&a = length; // jam new length -- return a; --} -- --unittest --{ -- version (D_Bits) -- { -- bit[int.sizeof * 3 * 8] b; -- int[] i; -- short[] s; -- -- i = cast(int[])b; -- assert(i.length == 3); -- -- s = cast(short[])b; -- assert(s.length == 6); -- } --} -- --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycat.d druntime/src/rt/arraycat.d ---- druntime-old/src/rt/arraycat.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arraycat.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,42 +0,0 @@ --/** -- * Implementation of array copy support routines. -- * -- * Copyright: Copyright Digital Mars 2004 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, Sean Kelly -- * -- * Copyright Digital Mars 2004 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arraycat; -- --private --{ -- import core.stdc.string; -- debug import core.stdc.stdio; --} -- --extern (C): -- --byte[] _d_arraycopy(size_t size, byte[] from, byte[] to) --{ -- debug printf("f = %p,%d, t = %p,%d, size = %d\n", -- from.ptr, from.length, to.ptr, to.length, size); -- -- if (to.length != from.length) -- { -- throw new Exception("lengths don't match for array copy"); -- } -- else if (to.ptr + to.length * size <= from.ptr || -- from.ptr + from.length * size <= to.ptr) -- { -- memcpy(to.ptr, from.ptr, to.length * size); -- } -- else -- { -- throw new Exception("overlapping array copy"); -- } -- return to; --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraydouble.d druntime/src/rt/arraydouble.d ---- druntime-old/src/rt/arraydouble.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arraydouble.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,1720 +0,0 @@ --/** -- * Contains SSE2 and MMX versions of certain operations for double. -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arraydouble; -- --private import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 5; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.amd3dnow amd3dnow; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --/* Performance figures measured by Burton Radons -- */ -- --alias double T; -- --extern (C): -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 333% faster -- if (sse2() && b.length >= 16) -- { -- auto n = aptr + (b.length & ~15); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; // left operand -- mov ECX, cptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add ESI, 64; -- addpd XMM0, XMM4; -- addpd XMM1, XMM5; -- addpd XMM2, XMM6; -- addpd XMM3, XMM7; -- add ECX, 64; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ = *bptr++ + *cptr++; -- -- return a; --} -- -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 324% faster -- if (sse2() && b.length >= 8) -- { -- auto n = aptr + (b.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; // left operand -- mov ECX, cptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add ESI, 64; -- subpd XMM0, XMM4; -- subpd XMM1, XMM5; -- subpd XMM2, XMM6; -- subpd XMM3, XMM7; -- add ECX, 64; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ = *bptr++ - *cptr++; -- -- return a; --} -- -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + value -- */ -- --T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpAddSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 305% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- addpd XMM0, XMM4; -- addpd XMM1, XMM4; -- addpd XMM2, XMM4; -- addpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ + value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpAddSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += value -- */ -- --T[] _arrayExpSliceAddass_d(T[] a, T value) --{ -- //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 114% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- if (aptr < n) -- -- // Unaligned case -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloopa: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- addpd XMM0, XMM4; -- addpd XMM1, XMM4; -- addpd XMM2, XMM4; -- addpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopa; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceAddass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += b[] -- */ -- --T[] _arraySliceSliceAddass_d(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceAddass_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 183% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov ECX, bptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add ECX, 64; -- addpd XMM0, XMM4; -- addpd XMM1, XMM5; -- addpd XMM2, XMM6; -- addpd XMM3, XMM7; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - value -- */ -- --T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMinSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 305% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- subpd XMM0, XMM4; -- subpd XMM1, XMM4; -- subpd XMM2, XMM4; -- subpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ - value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMinSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = value - b[] -- */ -- --T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arrayExpSliceMinSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 66% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movapd XMM5, XMM4; -- movapd XMM6, XMM4; -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- subpd XMM5, XMM0; -- subpd XMM6, XMM1; -- movupd [ESI+ 0-64], XMM5; -- movupd [ESI+16-64], XMM6; -- movapd XMM5, XMM4; -- movapd XMM6, XMM4; -- subpd XMM5, XMM2; -- subpd XMM6, XMM3; -- movupd [ESI+32-64], XMM5; -- movupd [ESI+48-64], XMM6; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = value - *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = 6 - a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(6 - a[i])) -- { -- printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= value -- */ -- --T[] _arrayExpSliceMinass_d(T[] a, T value) --{ -- //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 115% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- if (aptr < n) -- -- // Unaligned case -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloopa: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- subpd XMM0, XMM4; -- subpd XMM1, XMM4; -- subpd XMM2, XMM4; -- subpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopa; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] -- */ -- --T[] _arraySliceSliceMinass_d(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMinass_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 183% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov ECX, bptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add ECX, 64; -- subpd XMM0, XMM4; -- subpd XMM1, XMM5; -- subpd XMM2, XMM6; -- subpd XMM3, XMM7; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * value -- */ -- --T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMulSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 304% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM4; -- mulpd XMM2, XMM4; -- mulpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ * value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * c[] -- */ -- --T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceMulSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 329% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; // left operand -- mov ECX, cptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add ESI, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add EAX, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM5; -- mulpd XMM2, XMM6; -- mulpd XMM3, XMM7; -- add ECX, 64; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ * *cptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMulSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * b[i])) -- { -- printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= value -- */ -- --T[] _arrayExpSliceMulass_d(T[] a, T value) --{ -- //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 109% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- if (aptr < n) -- -- // Unaligned case -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, value; -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloopa: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM4; -- mulpd XMM2, XMM4; -- mulpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopa; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMulass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= b[] -- */ -- --T[] _arraySliceSliceMulass_d(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMulass_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 205% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov ECX, bptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- movupd XMM4, [ECX]; -- movupd XMM5, [ECX+16]; -- movupd XMM6, [ECX+32]; -- movupd XMM7, [ECX+48]; -- add ECX, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM5; -- mulpd XMM2, XMM6; -- mulpd XMM3, XMM7; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMulass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] / value -- */ -- --T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpDivSliceAssign_d()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- /* Multiplying by the reciprocal is faster, but does -- * not produce as accurate an answer. -- */ -- T recip = cast(T)1 / value; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 299% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, recip; -- //movsd XMM4, value -- //rcpsd XMM4, XMM4 -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movupd XMM0, [EAX]; -- movupd XMM1, [EAX+16]; -- movupd XMM2, [EAX+32]; -- movupd XMM3, [EAX+48]; -- add EAX, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM4; -- mulpd XMM2, XMM4; -- mulpd XMM3, XMM4; -- //divpd XMM0, XMM4; -- //divpd XMM1, XMM4; -- //divpd XMM2, XMM4; -- //divpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- { -- *aptr++ = *bptr++ / value; -- //*aptr++ = *bptr++ * recip; -- } -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpDivSliceAssign_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] / 8; -- -- for (int i = 0; i < dim; i++) -- { -- //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]); -- if (c[i] != cast(T)(a[i] / 8)) -- { -- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] /= value -- */ -- --T[] _arrayExpSliceDivass_d(T[] a, T value) --{ -- //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- /* Multiplying by the reciprocal is faster, but does -- * not produce as accurate an answer. -- */ -- T recip = cast(T)1 / value; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 version is 65% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- // Unaligned case -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movsd XMM4, recip; -- //movsd XMM4, value -- //rcpsd XMM4, XMM4 -- shufpd XMM4, XMM4, 0; -- -- align 8; -- startsseloopa: -- movupd XMM0, [ESI]; -- movupd XMM1, [ESI+16]; -- movupd XMM2, [ESI+32]; -- movupd XMM3, [ESI+48]; -- add ESI, 64; -- mulpd XMM0, XMM4; -- mulpd XMM1, XMM4; -- mulpd XMM2, XMM4; -- mulpd XMM3, XMM4; -- //divpd XMM0, XMM4; -- //divpd XMM1, XMM4; -- //divpd XMM2, XMM4; -- //divpd XMM3, XMM4; -- movupd [ESI+ 0-64], XMM0; -- movupd [ESI+16-64], XMM1; -- movupd [ESI+32-64], XMM2; -- movupd [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopa; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= recip; -- -- return a; --} -- -- --unittest --{ -- printf("_arrayExpSliceDivass_d unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] /= 8; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] / 8)) -- { -- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] * value -- */ -- --T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAddass_d(a, -value, b); --} -- --/*********************** -- * Computes: -- * a[] += b[] * value -- */ -- --T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ += *bptr++ * value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAddass_d unittest\n"); -- -- cpuid = 1; -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 1; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] += a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); -- if (c[i] != cast(T)(b[i] + a[i] * 6)) -- { -- printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayfloat.d druntime/src/rt/arrayfloat.d ---- druntime-old/src/rt/arrayfloat.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arrayfloat.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,1435 +0,0 @@ --/** -- * Contains SSE2 and MMX versions of certain operations for float. -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arrayfloat; -- --private import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 5; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.amd3dnow amd3dnow; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --alias float T; -- --extern (C): -- --/* ======================================================================== */ --/* ======================================================================== */ -- --/* template for the case -- * a[] = b[] ? c[] -- * with some binary operator ? -- */ --private template CodeGenSliceSliceOp(string opD, string opSSE, string op3DNow) --{ -- const CodeGenSliceSliceOp = ` -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE version is 834% faster -- if (sse() && b.length >= 16) -- { -- auto n = aptr + (b.length & ~15); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; // left operand -- mov ECX, cptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movups XMM0, [EAX]; -- movups XMM1, [EAX+16]; -- movups XMM2, [EAX+32]; -- movups XMM3, [EAX+48]; -- add EAX, 64; -- movups XMM4, [ECX]; -- movups XMM5, [ECX+16]; -- movups XMM6, [ECX+32]; -- movups XMM7, [ECX+48]; -- add ESI, 64; -- ` ~ opSSE ~ ` XMM0, XMM4; -- ` ~ opSSE ~ ` XMM1, XMM5; -- ` ~ opSSE ~ ` XMM2, XMM6; -- ` ~ opSSE ~ ` XMM3, XMM7; -- add ECX, 64; -- movups [ESI+ 0-64], XMM0; -- movups [ESI+16-64], XMM1; -- movups [ESI+32-64], XMM2; -- movups [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- // 3DNow! version is only 13% faster -- if (amd3dnow() && b.length >= 8) -- { -- auto n = aptr + (b.length & ~7); -- -- asm -- { -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- mov EAX, bptr; // left operand -- mov ECX, cptr; // right operand -- -- align 4; -- start3dnow: -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- ` ~ op3DNow ~ ` MM0, [ECX]; -- ` ~ op3DNow ~ ` MM1, [ECX+8]; -- ` ~ op3DNow ~ ` MM2, [ECX+16]; -- ` ~ op3DNow ~ ` MM3, [ECX+24]; -- movq [ESI], MM0; -- movq [ESI+8], MM1; -- movq [ESI+16], MM2; -- movq [ESI+24], MM3; -- add ECX, 32; -- add ESI, 32; -- add EAX, 32; -- cmp ESI, EDI; -- jb start3dnow; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++; -- -- return a;`; --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd")); --} -- -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub")); --} -- -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * c[] -- */ -- --T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul")); --} -- --unittest --{ -- printf("_arraySliceSliceMulSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * b[i])) -- { -- printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/* template for the case -- * a[] ?= value -- * with some binary operator ? -- */ --private template CodeGenExpSliceOpAssign(string opD, string opSSE, string op3DNow) --{ -- const CodeGenExpSliceOpAssign = ` -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- if (sse() && a.length >= 16) -- { -- auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a -- auto aaend = cast(T*)((cast(uint)aend) & ~15); // end of paragraph-aligned slice of a -- -- int numAligned = cast(int)(aaend - aabeg); // how many floats are in the aligned slice? -- -- // are there at least 16 floats in the paragraph-aligned slice? -- // otherwise we can't do anything with SSE. -- if (numAligned >= 16) -- { -- aaend = aabeg + (numAligned & ~15); // make sure the slice is actually a multiple of 16 floats long -- -- // process values up to aligned slice one by one -- while (aptr < aabeg) -- *aptr++ ` ~ opD ~ ` value; -- -- // process aligned slice with fast SSE operations -- asm -- { -- mov ESI, aabeg; -- mov EDI, aaend; -- movss XMM4, value; -- shufps XMM4, XMM4, 0; -- -- align 8; -- startsseloopa: -- movaps XMM0, [ESI]; -- movaps XMM1, [ESI+16]; -- movaps XMM2, [ESI+32]; -- movaps XMM3, [ESI+48]; -- add ESI, 64; -- ` ~ opSSE ~ ` XMM0, XMM4; -- ` ~ opSSE ~ ` XMM1, XMM4; -- ` ~ opSSE ~ ` XMM2, XMM4; -- ` ~ opSSE ~ ` XMM3, XMM4; -- movaps [ESI+ 0-64], XMM0; -- movaps [ESI+16-64], XMM1; -- movaps [ESI+32-64], XMM2; -- movaps [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopa; -- } -- aptr = aaend; -- } -- } -- else -- // 3DNow! version is 63% faster -- if (amd3dnow() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- ulong w = *cast(uint *) &value; -- ulong v = w | (w << 32L); -- -- asm -- { -- mov ESI, dword ptr [aptr]; -- mov EDI, dword ptr [n]; -- movq MM4, qword ptr [v]; -- -- align 8; -- start: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- ` ~ op3DNow ~ ` MM0, MM4; -- ` ~ op3DNow ~ ` MM1, MM4; -- ` ~ op3DNow ~ ` MM2, MM4; -- ` ~ op3DNow ~ ` MM3, MM4; -- movq [ESI], MM0; -- movq [ESI+8], MM1; -- movq [ESI+16], MM2; -- movq [ESI+24], MM3; -- add ESI, 32; -- cmp ESI, EDI; -- jb start; -- -- emms; -- mov dword ptr [aptr], ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ ` ~ opD ~ ` value; -- -- return a;`; --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += value -- */ -- --T[] _arrayExpSliceAddass_f(T[] a, T value) --{ -- mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd")); --} -- --unittest --{ -- printf("_arrayExpSliceAddass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= value -- */ -- --T[] _arrayExpSliceMinass_f(T[] a, T value) --{ -- mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub")); --} -- --unittest --{ -- printf("_arrayExpSliceminass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= value -- */ -- --T[] _arrayExpSliceMulass_f(T[] a, T value) --{ -- mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul")); --} -- --unittest --{ -- printf("_arrayExpSliceMulass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] /= value -- */ -- --T[] _arrayExpSliceDivass_f(T[] a, T value) --{ -- return _arrayExpSliceMulass_f(a, 1f / value); --} -- --unittest --{ -- printf("_arrayExpSliceDivass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] /= 8; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] / 8)) -- { -- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ --/* ======================================================================== */ -- --/* template for the case -- * a[] = b[] ? value -- * with some binary operator ? -- */ --private template CodeGenSliceExpOp(string opD, string opSSE, string op3DNow) --{ -- const CodeGenSliceExpOp = ` -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE version is 665% faster -- if (sse() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movss XMM4, value; -- shufps XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movups XMM0, [EAX]; -- movups XMM1, [EAX+16]; -- movups XMM2, [EAX+32]; -- movups XMM3, [EAX+48]; -- add EAX, 64; -- ` ~ opSSE ~ ` XMM0, XMM4; -- ` ~ opSSE ~ ` XMM1, XMM4; -- ` ~ opSSE ~ ` XMM2, XMM4; -- ` ~ opSSE ~ ` XMM3, XMM4; -- movups [ESI+ 0-64], XMM0; -- movups [ESI+16-64], XMM1; -- movups [ESI+32-64], XMM2; -- movups [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- // 3DNow! version is 69% faster -- if (amd3dnow() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- ulong w = *cast(uint *) &value; -- ulong v = w | (w << 32L); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM4, qword ptr [v]; -- -- align 8; -- start3dnow: -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- ` ~ op3DNow ~ ` MM0, MM4; -- ` ~ op3DNow ~ ` MM1, MM4; -- ` ~ op3DNow ~ ` MM2, MM4; -- ` ~ op3DNow ~ ` MM3, MM4; -- movq [ESI], MM0; -- movq [ESI+8], MM1; -- movq [ESI+16], MM2; -- movq [ESI+24], MM3; -- add ESI, 32; -- add EAX, 32; -- cmp ESI, EDI; -- jb start3dnow; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ ` ~ opD ~ ` value; -- -- return a;`; --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + value -- */ -- --T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceExpOp!("+", "addps", "pfadd")); --} -- --unittest --{ -- printf("_arraySliceExpAddSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - value -- */ -- --T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceExpOp!("-", "subps", "pfsub")); --} -- --unittest --{ -- printf("_arraySliceExpMinSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * value -- */ -- --T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul")); --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] / value -- */ -- --T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAssign_f(a, 1f/value, b); --} -- --unittest --{ -- printf("_arraySliceExpDivSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] / 8; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] / 8)) -- { -- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ --/* ======================================================================== */ -- --private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow) --{ -- const CodeGenSliceOpAssign = ` -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE version is 468% faster -- if (sse() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- // Unaligned case -- asm -- { -- mov ECX, bptr; // right operand -- mov ESI, aptr; // destination operand -- mov EDI, n; // end comparison -- -- align 8; -- startsseloopb: -- movups XMM0, [ESI]; -- movups XMM1, [ESI+16]; -- movups XMM2, [ESI+32]; -- movups XMM3, [ESI+48]; -- add ESI, 64; -- movups XMM4, [ECX]; -- movups XMM5, [ECX+16]; -- movups XMM6, [ECX+32]; -- movups XMM7, [ECX+48]; -- add ECX, 64; -- ` ~ opSSE ~ ` XMM0, XMM4; -- ` ~ opSSE ~ ` XMM1, XMM5; -- ` ~ opSSE ~ ` XMM2, XMM6; -- ` ~ opSSE ~ ` XMM3, XMM7; -- movups [ESI+ 0-64], XMM0; -- movups [ESI+16-64], XMM1; -- movups [ESI+32-64], XMM2; -- movups [ESI+48-64], XMM3; -- cmp ESI, EDI; -- jb startsseloopb; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- // 3DNow! version is 57% faster -- if (amd3dnow() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, dword ptr [aptr]; // destination operand -- mov EDI, dword ptr [n]; // end comparison -- mov ECX, dword ptr [bptr]; // right operand -- -- align 4; -- start3dnow: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- movq MM2, [ESI+16]; -- movq MM3, [ESI+24]; -- ` ~ op3DNow ~ ` MM0, [ECX]; -- ` ~ op3DNow ~ ` MM1, [ECX+8]; -- ` ~ op3DNow ~ ` MM2, [ECX+16]; -- ` ~ op3DNow ~ ` MM3, [ECX+24]; -- movq [ESI], MM0; -- movq [ESI+8], MM1; -- movq [ESI+16], MM2; -- movq [ESI+24], MM3; -- add ESI, 32; -- add ECX, 32; -- cmp ESI, EDI; -- jb start3dnow; -- -- emms; -- mov dword ptr [aptr], ESI; -- mov dword ptr [bptr], ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ ` ~ opD ~ ` *bptr++; -- -- return a;`; --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += b[] -- */ -- --T[] _arraySliceSliceAddass_f(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd")); --} -- --unittest --{ -- printf("_arraySliceSliceAddass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] += b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] -- */ -- --T[] _arraySliceSliceMinass_f(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub")); --} -- --unittest --{ -- printf("_arrayExpSliceMinass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= b[] -- */ -- --T[] _arraySliceSliceMulass_f(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul")); --} -- --unittest --{ -- printf("_arrayExpSliceMulass_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- c[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = value - b[] -- */ -- --T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arrayExpSliceMinSliceAssign_f()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE version is 690% faster -- if (sse() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- // Unaligned case -- asm -- { -- mov EAX, bptr; -- mov ESI, aptr; -- mov EDI, n; -- movss XMM4, value; -- shufps XMM4, XMM4, 0; -- -- align 8; -- startsseloop: -- add ESI, 64; -- movaps XMM5, XMM4; -- movaps XMM6, XMM4; -- movups XMM0, [EAX]; -- movups XMM1, [EAX+16]; -- movups XMM2, [EAX+32]; -- movups XMM3, [EAX+48]; -- add EAX, 64; -- subps XMM5, XMM0; -- subps XMM6, XMM1; -- movups [ESI+ 0-64], XMM5; -- movups [ESI+16-64], XMM6; -- movaps XMM5, XMM4; -- movaps XMM6, XMM4; -- subps XMM5, XMM2; -- subps XMM6, XMM3; -- movups [ESI+32-64], XMM5; -- movups [ESI+48-64], XMM6; -- cmp ESI, EDI; -- jb startsseloop; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- // 3DNow! version is 67% faster -- if (amd3dnow() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- ulong w = *cast(uint *) &value; -- ulong v = w | (w << 32L); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM4, qword ptr [v]; -- -- align 8; -- start3dnow: -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- movq MM2, [EAX+16]; -- movq MM3, [EAX+24]; -- pfsubr MM0, MM4; -- pfsubr MM1, MM4; -- pfsubr MM2, MM4; -- pfsubr MM3, MM4; -- movq [ESI], MM0; -- movq [ESI+8], MM1; -- movq [ESI+16], MM2; -- movq [ESI+24], MM3; -- add ESI, 32; -- add EAX, 32; -- cmp ESI, EDI; -- jb start3dnow; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = value - *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinSliceAssign_f unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = 6 - a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(6 - a[i])) -- { -- printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] * value -- */ -- --T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAddass_f(a, -value, b); --} -- --/*********************** -- * Computes: -- * a[] += b[] * value -- */ -- --T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ += *bptr++ * value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAddass_f unittest\n"); -- -- cpuid = 1; -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 1; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] += a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); -- if (c[i] != cast(T)(b[i] + a[i] * 6)) -- { -- printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayint.d druntime/src/rt/arrayint.d ---- druntime-old/src/rt/arrayint.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arrayint.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,2430 +0,0 @@ --/** -- * Contains MMX versions of certain operations for dchar, int, and uint ('w', -- * 'i' and 'k' suffixes). -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arrayint; -- --private import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 4; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.amd3dnow amd3dnow; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --alias int T; -- --extern (C): -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + value -- */ -- --T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpAddSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 380% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 298% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM2, l; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- paddd MM0, MM2; -- paddd MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- if (a.length >= 2) -- { -- auto n = aptr + (a.length & ~1); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov EDX, value; -- -- align 4; -- start386: -- add ESI, 8; -- mov EBX, [EAX]; -- mov ECX, [EAX+4]; -- add EAX, 8; -- add EBX, EDX; -- add ECX, EDX; -- mov [ESI -8], EBX; -- mov [ESI+4-8], ECX; -- cmp ESI, EDI; -- jb start386; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ + value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpAddSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceAddSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1710% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 995% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM2, [ECX]; -- movq MM1, [EAX+8]; -- movq MM3, [ECX+8]; -- add EAX, 16; -- add ECX, 16; -- paddd MM0, MM2; -- paddd MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- --normal: -- while (aptr < aend) -- *aptr++ = *bptr++ + *cptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += value -- */ -- --T[] _arrayExpSliceAddass_w(T[] a, T value) --{ -- return _arrayExpSliceAddass_i(a, value); --} -- --T[] _arrayExpSliceAddass_k(T[] a, T value) --{ -- return _arrayExpSliceAddass_i(a, value); --} -- --T[] _arrayExpSliceAddass_i(T[] a, T value) --{ -- //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 83% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 81% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movq MM2, l; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- paddd MM0, MM2; -- paddd MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- else -- if (a.length >= 2) -- { -- auto n = aptr + (a.length & ~1); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EDX, value; -- -- align 4; -- start386: -- mov EBX, [ESI]; -- mov ECX, [ESI+4]; -- add ESI, 8; -- add EBX, EDX; -- add ECX, EDX; -- mov [ESI -8], EBX; -- mov [ESI+4-8], ECX; -- cmp ESI, EDI; -- jb start386; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceAddass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- a[] += 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(c[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += b[] -- */ -- --T[] _arraySliceSliceAddass_w(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_i(a, b); --} -- --T[] _arraySliceSliceAddass_k(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_i(a, b); --} -- --T[] _arraySliceSliceAddass_i(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceAddass_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 695% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- paddd XMM0, XMM2; -- paddd XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 471% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM2, [ECX]; -- movq MM1, [ESI+8]; -- movq MM3, [ECX+8]; -- add ESI, 16; -- add ECX, 16; -- paddd MM0, MM2; -- paddd MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- --normal: -- while (aptr < aend) -- *aptr++ += *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] += a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(b[i] + a[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - value -- */ -- --T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMinSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 400% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 315% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM2, l; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- psubd MM0, MM2; -- psubd MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- if (a.length >= 2) -- { -- auto n = aptr + (a.length & ~1); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov EDX, value; -- -- align 4; -- start386: -- add ESI, 8; -- mov EBX, [EAX]; -- mov ECX, [EAX+4]; -- add EAX, 8; -- sub EBX, EDX; -- sub ECX, EDX; -- mov [ESI -8], EBX; -- mov [ESI+4-8], ECX; -- cmp ESI, EDI; -- jb start386; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ - value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMinSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = value - b[] -- */ -- --T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_i(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_i(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arrayExpSliceMinSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1812% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 4; -- startaddsse2u: -- add ESI, 32; -- movdqu XMM2, [EAX]; -- movdqu XMM3, [EAX+16]; -- movdqa XMM0, XMM4; -- movdqa XMM1, XMM4; -- add EAX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM4, l; -- pshufd XMM4, XMM4, 0; -- -- align 4; -- startaddsse2a: -- add ESI, 32; -- movdqa XMM2, [EAX]; -- movdqa XMM3, [EAX+16]; -- movdqa XMM0, XMM4; -- movdqa XMM1, XMM4; -- add EAX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 1077% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM4, l; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM2, [EAX]; -- movq MM3, [EAX+8]; -- movq MM0, MM4; -- movq MM1, MM4; -- add EAX, 16; -- psubd MM0, MM2; -- psubd MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = value - *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = 6 - a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(6 - a[i])) -- { -- printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1721% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 1002% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM2, [ECX]; -- movq MM1, [EAX+8]; -- movq MM3, [ECX+8]; -- add EAX, 16; -- add ECX, 16; -- psubd MM0, MM2; -- psubd MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ - *cptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= value -- */ -- --T[] _arrayExpSliceMinass_w(T[] a, T value) --{ -- return _arrayExpSliceMinass_i(a, value); --} -- --T[] _arrayExpSliceMinass_k(T[] a, T value) --{ -- return _arrayExpSliceMinass_i(a, value); --} -- --T[] _arrayExpSliceMinass_i(T[] a, T value) --{ -- //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 81% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 81% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movq MM2, l; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- psubd MM0, MM2; -- psubd MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- else -- if (a.length >= 2) -- { -- auto n = aptr + (a.length & ~1); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EDX, value; -- -- align 4; -- start386: -- mov EBX, [ESI]; -- mov ECX, [ESI+4]; -- add ESI, 8; -- sub EBX, EDX; -- sub ECX, EDX; -- mov [ESI -8], EBX; -- mov [ESI+4-8], ECX; -- cmp ESI, EDI; -- jb start386; -- -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- a[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(c[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] -- */ -- --T[] _arraySliceSliceMinass_w(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_i(a, b); --} -- --T[] _arraySliceSliceMinass_k(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_i(a, b); --} -- --T[] _arraySliceSliceMinass_i(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMinass_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 731% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- psubd XMM0, XMM2; -- psubd XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 441% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM2, [ECX]; -- movq MM1, [ESI+8]; -- movq MM3, [ECX+8]; -- add ESI, 16; -- add ECX, 16; -- psubd MM0, MM2; -- psubd MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] -= a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(b[i] - a[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * value -- */ -- --T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAssign_i(a, value, b); --} -- --T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMulSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (none) // multiplying a pair is not supported by MMX -- { -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1380% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- { -- // MMX version is 1380% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movq MM2, l; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- pmuludq MM0, MM2; // only multiplies low 32 bits -- pmuludq MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ * value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]); -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * c[] -- */ -- --T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMulSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMulSliceAssign_i(a, c, b); --} -- --T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceMulSliceAssign_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (none) -- { -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 1407% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 1029% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM2, [ECX]; -- movq MM1, [EAX+8]; -- movq MM3, [ECX+8]; -- add EAX, 16; -- add ECX, 16; -- pmuludq MM0, MM2; -- pmuludq MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = *bptr++ * *cptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMulSliceAssign_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * b[i])) -- { -- printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= value -- */ -- --T[] _arrayExpSliceMulass_w(T[] a, T value) --{ -- return _arrayExpSliceMulass_i(a, value); --} -- --T[] _arrayExpSliceMulass_k(T[] a, T value) --{ -- return _arrayExpSliceMulass_i(a, value); --} -- --T[] _arrayExpSliceMulass_i(T[] a, T value) --{ -- //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (none) -- { -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 400% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = value; -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 402% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movq MM2, l; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- pmuludq MM0, MM2; -- pmuludq MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMulass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = a[]; -- a[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(b[i] * 6)) -- { -- printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= b[] -- */ -- --T[] _arraySliceSliceMulass_w(T[] a, T[] b) --{ -- return _arraySliceSliceMulass_i(a, b); --} -- --T[] _arraySliceSliceMulass_k(T[] a, T[] b) --{ -- return _arraySliceSliceMulass_i(a, b); --} -- --T[] _arraySliceSliceMulass_i(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMulass_i()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (none) -- { -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 873% faster -- if (sse2() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- pmuludq XMM0, XMM2; -- pmuludq XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } --/+ BUG: comment out this section until we figure out what is going -- wrong with the invalid pshufd instructions. -- -- else -- // MMX version is 573% faster -- if (mmx() && a.length >= 4) -- { -- auto n = aptr + (a.length & ~3); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM2, [ECX]; -- movq MM1, [ESI+8]; -- movq MM3, [ECX+8]; -- pxor MM4, MM4; -- pxor MM5, MM5; -- punpckldq MM4, MM0; -- punpckldq MM5, MM2; -- add ESI, 16; -- add ECX, 16; -- pmuludq MM4, MM5; -- pshufd MM4, MM4, 8; // ? -- movq [ESI -16], MM4; -- pxor MM4, MM4; -- pxor MM5, MM5; -- punpckldq MM4, MM1; -- punpckldq MM5, MM3; -- pmuludq MM4, MM5; -- pshufd MM4, MM4, 8; // ? -- movq [ESI+8-16], MM4; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } --+/ -- } -- } -- -- while (aptr < aend) -- *aptr++ *= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMulass_i unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = a[]; -- a[] *= c[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(b[i] * c[i])) -- { -- printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayreal.d druntime/src/rt/arrayreal.d ---- druntime-old/src/rt/arrayreal.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arrayreal.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,241 +0,0 @@ --/** -- * Contains SSE2 and MMX versions of certain operations for real. -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arrayreal; -- --import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 1; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.amd3dnow amd3dnow; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --alias real T; -- --extern (C): -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_r(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- for (int i = 0; i < a.length; i++) -- a[i] = b[i] + c[i]; -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_r unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %Lg != %Lg + %Lg\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_r(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- for (int i = 0; i < a.length; i++) -- a[i] = b[i] - c[i]; -- return a; --} -- -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_r unittest\n"); -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %Lg != %Lg - %Lg\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] * value -- */ -- --T[] _arraySliceExpMulSliceMinass_r(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAddass_r(a, -value, b); --} -- --/*********************** -- * Computes: -- * a[] += b[] * value -- */ -- --T[] _arraySliceExpMulSliceAddass_r(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- // Handle remainder -- while (aptr < aend) -- *aptr++ += *bptr++ * value; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAddass_r unittest\n"); -- -- cpuid = 1; -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 1; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] += a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- //printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]); -- if (c[i] != cast(T)(b[i] + a[i] * 6)) -- { -- printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayshort.d druntime/src/rt/arrayshort.d ---- druntime-old/src/rt/arrayshort.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/arrayshort.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,2303 +0,0 @@ --/** -- * Contains SSE2 and MMX versions of certain operations for wchar, short, -- * and ushort ('u', 's' and 't' suffixes). -- * -- * Copyright: Copyright Digital Mars 2008 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright, based on code originally written by Burton Radons -- * -- * Copyright Digital Mars 2008 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.arrayshort; -- --private import core.cpuid; -- --version (unittest) --{ -- private import core.stdc.stdio : printf; -- /* This is so unit tests will test every CPU variant -- */ -- int cpuid; -- const int CPUID_MAX = 4; -- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } -- bool sse() { return cpuid == 2 && core.cpuid.sse(); } -- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } -- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } --} --else --{ -- alias core.cpuid.mmx mmx; -- alias core.cpuid.sse sse; -- alias core.cpuid.sse2 sse2; -- alias core.cpuid.sse2 sse2; --} -- --//version = log; -- --bool disjoint(T)(T[] a, T[] b) --{ -- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); --} -- --alias short T; -- --extern (C): -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + value -- */ -- --T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b) --{ -- return _arraySliceExpAddSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpAddSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 3343% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 3343% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- paddw MM0, MM2; -- paddw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ + value); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpAddSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] + c[] -- */ -- --T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceAddSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceAddSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 3777% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- movdqu XMM2, [ECX]; -- movdqu XMM3, [ECX+16]; -- add ECX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- movdqa XMM2, [ECX]; -- movdqa XMM3, [ECX+16]; -- add ECX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 2068% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- movq MM2, [ECX]; -- movq MM3, [ECX+8]; -- add ECX, 16; -- paddw MM0, MM2; -- paddw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ + *cptr++); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] + b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] + b[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += value -- */ -- --T[] _arrayExpSliceAddass_u(T[] a, T value) --{ -- return _arrayExpSliceAddass_s(a, value); --} -- --T[] _arrayExpSliceAddass_t(T[] a, T value) --{ -- return _arrayExpSliceAddass_s(a, value); --} -- --T[] _arrayExpSliceAddass_s(T[] a, T value) --{ -- //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 832% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= (l << 16); -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 826% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- paddw MM0, MM2; -- paddw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceAddass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- a[] += 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(c[i] + 6)) -- { -- printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] += b[] -- */ -- --T[] _arraySliceSliceAddass_u(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_s(a, b); --} -- --T[] _arraySliceSliceAddass_t(T[] a, T[] b) --{ -- return _arraySliceSliceAddass_s(a, b); --} -- --T[] _arraySliceSliceAddass_s(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceAddass_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 2085% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- movdqu XMM2, [ECX]; -- movdqu XMM3, [ECX+16]; -- add ECX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- movdqa XMM2, [ECX]; -- movdqa XMM3, [ECX+16]; -- add ECX, 32; -- paddw XMM0, XMM2; -- paddw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 1022% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- start: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- movq MM2, [ECX]; -- movq MM3, [ECX+8]; -- add ECX, 16; -- paddw MM0, MM2; -- paddw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb start; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ += *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceAddass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] += a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(b[i] + a[i])) -- { -- printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - value -- */ -- --T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b) --{ -- return _arraySliceExpMinSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMinSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 3695% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 3049% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- psubw MM0, MM2; -- psubw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ - value); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMinSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = value - b[] -- */ -- --T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_s(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value) --{ -- return _arrayExpSliceMinSliceAssign_s(a, b, value); --} -- --T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arrayExpSliceMinSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 4995% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= (l << 16); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- -- align 4; -- startaddsse2u: -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- movd XMM3, l; -- pshufd XMM3, XMM3, 0; -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- psubw XMM2, XMM0; -- psubw XMM3, XMM1; -- movdqu [ESI -32], XMM2; -- movdqu [ESI+16-32], XMM3; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- -- align 4; -- startaddsse2a: -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- movd XMM3, l; -- pshufd XMM3, XMM3, 0; -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- psubw XMM2, XMM0; -- psubw XMM3, XMM1; -- movdqa [ESI -32], XMM2; -- movdqa [ESI+16-32], XMM3; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 4562% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM4, l; -- pshufw MM4, MM4, 0; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM2, [EAX]; -- movq MM3, [EAX+8]; -- movq MM0, MM4; -- movq MM1, MM4; -- add EAX, 16; -- psubw MM0, MM2; -- psubw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(value - *bptr++); -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = 6 - a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(6 - a[i])) -- { -- printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] - c[] -- */ -- --T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMinSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 4129% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- movdqu XMM2, [ECX]; -- movdqu XMM3, [ECX+16]; -- add ECX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- movdqa XMM2, [ECX]; -- movdqa XMM3, [ECX+16]; -- add ECX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 2018% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- movq MM2, [ECX]; -- movq MM3, [ECX+8]; -- add ECX, 16; -- psubw MM0, MM2; -- psubw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ - *cptr++); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] - b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] - b[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= value -- */ -- --T[] _arrayExpSliceMinass_u(T[] a, T value) --{ -- return _arrayExpSliceMinass_s(a, value); --} -- --T[] _arrayExpSliceMinass_t(T[] a, T value) --{ -- return _arrayExpSliceMinass_s(a, value); --} -- --T[] _arrayExpSliceMinass_s(T[] a, T value) --{ -- //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 835% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= (l << 16); -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startaddsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startaddsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 835% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- psubw MM0, MM2; -- psubw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMinass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- a[] = c[]; -- a[] -= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(c[i] - 6)) -- { -- printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] -= b[] -- */ -- --T[] _arraySliceSliceMinass_u(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_s(a, b); --} -- --T[] _arraySliceSliceMinass_t(T[] a, T[] b) --{ -- return _arraySliceSliceMinass_s(a, b); --} -- --T[] _arraySliceSliceMinass_s(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMinass_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 2121% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm // unaligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- movdqu XMM2, [ECX]; -- movdqu XMM3, [ECX+16]; -- add ECX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm // aligned case -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- movdqa XMM2, [ECX]; -- movdqa XMM3, [ECX+16]; -- add ECX, 32; -- psubw XMM0, XMM2; -- psubw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 1116% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- start: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- movq MM2, [ECX]; -- movq MM3, [ECX+8]; -- add ECX, 16; -- psubw MM0, MM2; -- psubw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb start; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ -= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMinass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = c[]; -- c[] -= a[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(b[i] - a[i])) -- { -- printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * value -- */ -- --T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b) --{ -- return _arraySliceExpMulSliceAssign_s(a, value, b); --} -- --T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b) --in --{ -- assert(a.length == b.length); -- assert(disjoint(a, b)); --} --body --{ -- //printf("_arraySliceExpMulSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 3733% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= l << 16; -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM1, [EAX+16]; -- add EAX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM1, [EAX+16]; -- add EAX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- else -- // MMX version is 3733% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM1, [EAX+8]; -- add EAX, 16; -- pmullw MM0, MM2; -- pmullw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ * value); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceExpMulSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * 6)) -- { -- printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] = b[] * c[] -- */ -- --T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMulSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b) --{ -- return _arraySliceSliceMulSliceAssign_s(a, c, b); --} -- --T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b) --in --{ -- assert(a.length == b.length && b.length == c.length); -- assert(disjoint(a, b)); -- assert(disjoint(a, c)); -- assert(disjoint(b, c)); --} --body --{ -- //printf("_arraySliceSliceMulSliceAssign_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- auto cptr = c.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 2515% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2u: -- add ESI, 32; -- movdqu XMM0, [EAX]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [EAX+16]; -- movdqu XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startsse2a: -- add ESI, 32; -- movdqa XMM0, [EAX]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [EAX+16]; -- movdqa XMM3, [ECX+16]; -- add EAX, 32; -- add ECX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- else -- // MMX version is 2515% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov EAX, bptr; -- mov ECX, cptr; -- -- align 4; -- startmmx: -- add ESI, 16; -- movq MM0, [EAX]; -- movq MM2, [ECX]; -- movq MM1, [EAX+8]; -- movq MM3, [ECX+8]; -- add EAX, 16; -- add ECX, 16; -- pmullw MM0, MM2; -- pmullw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, EAX; -- mov cptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ = cast(T)(*bptr++ * *cptr++); -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMulSliceAssign_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- c[] = a[] * b[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (c[i] != cast(T)(a[i] * b[i])) -- { -- printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= value -- */ -- --T[] _arrayExpSliceMulass_u(T[] a, T value) --{ -- return _arrayExpSliceMulass_s(a, value); --} -- --T[] _arrayExpSliceMulass_t(T[] a, T value) --{ -- return _arrayExpSliceMulass_s(a, value); --} -- --T[] _arrayExpSliceMulass_s(T[] a, T value) --{ -- //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 2044% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- uint l = cast(ushort) value; -- l |= l << 16; -- -- if (((cast(uint) aptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM1, [ESI+16]; -- add ESI, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM2; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd XMM2, l; -- pshufd XMM2, XMM2, 0; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM1, [ESI+16]; -- add ESI, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM2; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- } -- } -- } -- else -- // MMX version is 2056% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- uint l = cast(ushort) value; -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- movd MM2, l; -- pshufw MM2, MM2, 0; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM1, [ESI+8]; -- add ESI, 16; -- pmullw MM0, MM2; -- pmullw MM1, MM2; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= value; -- -- return a; --} -- --unittest --{ -- printf("_arrayExpSliceMulass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = a[]; -- a[] *= 6; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(b[i] * 6)) -- { -- printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); -- assert(0); -- } -- } -- } -- } --} -- -- --/* ======================================================================== */ -- --/*********************** -- * Computes: -- * a[] *= b[] -- */ -- --T[] _arraySliceSliceMulass_u(T[] a, T[] b) --{ -- return _arraySliceSliceMulass_s(a, b); --} -- --T[] _arraySliceSliceMulass_t(T[] a, T[] b) --{ -- return _arraySliceSliceMulass_s(a, b); --} -- --T[] _arraySliceSliceMulass_s(T[] a, T[] b) --in --{ -- assert (a.length == b.length); -- assert (disjoint(a, b)); --} --body --{ -- //printf("_arraySliceSliceMulass_s()\n"); -- auto aptr = a.ptr; -- auto aend = aptr + a.length; -- auto bptr = b.ptr; -- -- version (D_InlineAsm_X86) -- { -- // SSE2 aligned version is 2519% faster -- if (sse2() && a.length >= 16) -- { -- auto n = aptr + (a.length & ~15); -- -- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2u: -- movdqu XMM0, [ESI]; -- movdqu XMM2, [ECX]; -- movdqu XMM1, [ESI+16]; -- movdqu XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM3; -- movdqu [ESI -32], XMM0; -- movdqu [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2u; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- else -- { -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startsse2a: -- movdqa XMM0, [ESI]; -- movdqa XMM2, [ECX]; -- movdqa XMM1, [ESI+16]; -- movdqa XMM3, [ECX+16]; -- add ESI, 32; -- add ECX, 32; -- pmullw XMM0, XMM2; -- pmullw XMM1, XMM3; -- movdqa [ESI -32], XMM0; -- movdqa [ESI+16-32], XMM1; -- cmp ESI, EDI; -- jb startsse2a; -- -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- else -- // MMX version is 1712% faster -- if (mmx() && a.length >= 8) -- { -- auto n = aptr + (a.length & ~7); -- -- asm -- { -- mov ESI, aptr; -- mov EDI, n; -- mov ECX, bptr; -- -- align 4; -- startmmx: -- movq MM0, [ESI]; -- movq MM2, [ECX]; -- movq MM1, [ESI+8]; -- movq MM3, [ECX+8]; -- add ESI, 16; -- add ECX, 16; -- pmullw MM0, MM2; -- pmullw MM1, MM3; -- movq [ESI -16], MM0; -- movq [ESI+8-16], MM1; -- cmp ESI, EDI; -- jb startmmx; -- -- emms; -- mov aptr, ESI; -- mov bptr, ECX; -- } -- } -- } -- -- while (aptr < aend) -- *aptr++ *= *bptr++; -- -- return a; --} -- --unittest --{ -- printf("_arraySliceSliceMulass_s unittest\n"); -- -- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) -- { -- version (log) printf(" cpuid %d\n", cpuid); -- -- for (int j = 0; j < 2; j++) -- { -- const int dim = 67; -- T[] a = new T[dim + j]; // aligned on 16 byte boundary -- a = a[j .. dim + j]; // misalign for second iteration -- T[] b = new T[dim + j]; -- b = b[j .. dim + j]; -- T[] c = new T[dim + j]; -- c = c[j .. dim + j]; -- -- for (int i = 0; i < dim; i++) -- { a[i] = cast(T)i; -- b[i] = cast(T)(i + 7); -- c[i] = cast(T)(i * 2); -- } -- -- b[] = a[]; -- a[] *= c[]; -- -- for (int i = 0; i < dim; i++) -- { -- if (a[i] != cast(T)(b[i] * c[i])) -- { -- printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); -- assert(0); -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh.c druntime/src/rt/deh.c ---- druntime-old/src/rt/deh.c 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/deh.c 1970-01-01 03:00:00.000000000 +0300 -@@ -1,734 +0,0 @@ --/** -- * Implementation of exception handling support routines for Windows. -- * -- * Copyright: Copyright Digital Mars 1999 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright -- * -- * Copyright Digital Mars 1999 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --#include --#include --#include --#include -- --/* ======================== Win32 =============================== */ -- --#if _WIN32 -- --#include --#include -- --//#include "\sc\src\include\ehsup.h" -- --/*** From Digital Mars C runtime library ***/ --EXCEPTION_DISPOSITION __cdecl _local_except_handler (EXCEPTION_RECORD *ExceptionRecord, -- void* EstablisherFrame, -- void *ContextRecord, -- void *DispatcherContext -- ); --void __cdecl _global_unwind(void *frame,EXCEPTION_RECORD *eRecord); --#define EXCEPTION_UNWIND 6 // Flag to indicate if the system is unwinding -- --extern DWORD _except_list; --/*** ***/ -- --#include "mars.h" -- --extern ClassInfo D6object9Throwable7__ClassZ; --#define _Class_9Throwable D6object9Throwable7__ClassZ; -- --extern ClassInfo D6object5Error7__ClassZ; --#define _Class_5Error D6object5Error7__ClassZ -- --typedef int (__pascal *fp_t)(); // function pointer in ambient memory model -- --void _d_setunhandled(Object*); -- --// The layout of DEstablisherFrame is the same for C++ -- --struct DEstablisherFrame --{ -- void *prev; // pointer to previous exception list -- void *handler; // pointer to routine for exception handler -- DWORD table_index; // current index into handler_info[] -- DWORD ebp; // this is EBP of routine --}; -- --struct DHandlerInfo --{ -- int prev_index; // previous table index -- unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) -- void *finally_code; // pointer to finally code to execute -- // (!=0 if try-finally) --}; -- --// Address of DHandlerTable is passed in EAX to _d_framehandler() -- --struct DHandlerTable --{ -- void *fptr; // pointer to start of function -- unsigned espoffset; // offset of ESP from EBP -- unsigned retoffset; // offset from start of function to return code -- struct DHandlerInfo handler_info[1]; --}; -- --struct DCatchBlock --{ -- ClassInfo *type; // catch type -- unsigned bpoffset; // EBP offset of catch var -- void *code; // catch handler code --}; -- --// Create one of these for each try-catch --struct DCatchInfo --{ -- unsigned ncatches; // number of catch blocks -- struct DCatchBlock catch_block[1]; // data for each catch block --}; -- --// Macro to make our own exception code --#define MAKE_EXCEPTION_CODE(severity, facility, exception) \ -- (((severity) << 30) | (1 << 29) | (0 << 28) | ((facility) << 16) | (exception)) -- --#define STATUS_DIGITAL_MARS_D_EXCEPTION MAKE_EXCEPTION_CODE(3,'D',1) -- --Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record); --void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, struct DEstablisherFrame *frame, int stop_index); -- -- --/*********************************** -- * The frame handler, this is called for each frame that has been registered -- * in the OS except_list. -- * Input: -- * EAX the handler table for the frame -- */ -- --EXCEPTION_DISPOSITION _d_framehandler( -- EXCEPTION_RECORD *exception_record, -- struct DEstablisherFrame *frame, -- CONTEXT *context, -- void *dispatcher_context) --{ -- struct DHandlerTable *handler_table; -- -- __asm { mov handler_table,EAX } -- -- if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) -- { -- // Call all the finally blocks in this frame -- _d_local_unwind(handler_table, frame, -1); -- } -- else -- { -- // Jump to catch block if matching one is found -- -- int ndx,prev_ndx,i; -- struct DHandlerInfo *phi; -- struct DCatchInfo *pci; -- struct DCatchBlock *pcb; -- unsigned ncatches; // number of catches in the current handler -- Object *pti; -- ClassInfo *ci; -- -- ci = NULL; // only compute it if we need it -- -- // walk through handler table, checking each handler -- // with an index smaller than the current table_index -- for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx) -- { -- phi = &handler_table->handler_info[ndx]; -- prev_ndx = phi->prev_index; -- if (phi->cioffset) -- { -- // this is a catch handler (no finally) -- pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); -- ncatches = pci->ncatches; -- for (i = 0; i < ncatches; i++) -- { -- pcb = &pci->catch_block[i]; -- -- if (!ci) -- { -- // This code must match the translation code -- if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION) -- { -- //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]); -- ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]); -- } -- else -- ci = &_Class_9Throwable; -- } -- -- if (_d_isbaseof(ci, pcb->type)) -- { -- // Matched the catch type, so we've found the handler. -- int regebp; -- -- pti = _d_translate_se_to_d_exception(exception_record); -- -- // Initialize catch variable -- regebp = (int)&frame->ebp; // EBP for this frame -- *(void **)(regebp + (pcb->bpoffset)) = pti; -- -- _d_setunhandled(pti); -- -- // Have system call all finally blocks in intervening frames -- _global_unwind(frame, exception_record); -- -- // Call all the finally blocks skipped in this frame -- _d_local_unwind(handler_table, frame, ndx); -- -- _d_setunhandled(NULL); -- -- frame->table_index = prev_ndx; // we are out of this handler -- -- // Jump to catch block. Does not return. -- { -- unsigned catch_esp; -- fp_t catch_addr; -- -- catch_addr = (fp_t)(pcb->code); -- catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); -- _asm -- { -- mov EAX,catch_esp -- mov ECX,catch_addr -- mov [EAX],ECX -- mov EBP,regebp -- mov ESP,EAX // reset stack -- ret // jump to catch block -- } -- } -- } -- } -- } -- } -- } -- return ExceptionContinueSearch; --} -- --/*********************************** -- * Exception filter for use in __try..__except block -- * surrounding call to Dmain() -- */ -- --int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs, -- int retval, -- Object **exception_object) --{ -- *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord); -- return retval; --} -- --/*********************************** -- * Throw a D object. -- */ -- --void __stdcall _d_throw(Object *h) --{ -- //printf("_d_throw(h = %p, &h = %p)\n", h, &h); -- //printf("\tvptr = %p\n", *(void **)h); -- RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION, -- EXCEPTION_NONCONTINUABLE, -- 1, (DWORD *)&h); --} -- --/*********************************** -- * Create an exception object -- */ -- --Object *_d_create_exception_object(ClassInfo *ci, char *msg) --{ -- Throwable *exc; -- -- exc = (Throwable *)_d_newclass(ci); -- // BUG: what if _d_newclass() throws an out of memory exception? -- -- if (msg) -- { -- exc->msglen = strlen(msg); -- exc->msg = msg; -- } -- return (Object *)exc; --} -- --/*********************************** -- * Converts a Windows Structured Exception code to a D Exception Object. -- */ -- --Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record) --{ -- Object *pti; -- -- switch (exception_record->ExceptionCode) { -- case STATUS_DIGITAL_MARS_D_EXCEPTION: -- // Generated D exception -- pti = (Object *)(exception_record->ExceptionInformation[0]); -- break; -- -- case STATUS_INTEGER_DIVIDE_BY_ZERO: -- pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero"); -- break; -- -- case STATUS_FLOAT_DIVIDE_BY_ZERO: -- pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero"); -- break; -- -- case STATUS_ACCESS_VIOLATION: -- pti = _d_create_exception_object(&_Class_5Error, "Access Violation"); -- break; -- -- case STATUS_STACK_OVERFLOW: -- pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow"); -- break; -- -- case STATUS_DATATYPE_MISALIGNMENT: -- pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment"); -- break; -- -- case STATUS_ARRAY_BOUNDS_EXCEEDED: -- pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded"); -- break; -- -- case STATUS_FLOAT_INVALID_OPERATION: -- pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation"); -- break; -- -- case STATUS_FLOAT_DENORMAL_OPERAND: -- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand"); -- break; -- -- case STATUS_FLOAT_INEXACT_RESULT: -- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result"); -- break; -- -- case STATUS_FLOAT_OVERFLOW: -- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow"); -- break; -- -- case STATUS_FLOAT_UNDERFLOW: -- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow"); -- break; -- -- case STATUS_FLOAT_STACK_CHECK: -- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check"); -- break; -- -- case STATUS_PRIVILEGED_INSTRUCTION: -- if (*((unsigned char *)(exception_record->ExceptionAddress))==0xF4) { // HLT -- pti = _d_create_exception_object(&_Class_5Error, "assert(0) or HLT instruction"); -- } else { -- pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction"); -- } -- break; -- -- case STATUS_ILLEGAL_INSTRUCTION: -- pti = _d_create_exception_object(&_Class_5Error, "Illegal Instruction"); -- break; -- -- case STATUS_BREAKPOINT: -- pti = _d_create_exception_object(&_Class_5Error, "Breakpoint"); -- break; -- -- case STATUS_IN_PAGE_ERROR: -- pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception"); -- break; --/* -- case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor -- case STATUS_INVALID_DISPOSITION: -- case STATUS_NONCONTINUABLE_EXCEPTION: -- case STATUS_SINGLE_STEP: -- case DBG_CONTROL_C: // only when a debugger is attached -- // In DMC, but not in Microsoft docs -- case STATUS_GUARD_PAGE_VIOLATION: -- case STATUS_INVALID_HANDLE: --*/ -- // convert all other exception codes into a Win32Exception -- default: -- pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception"); -- break; -- } -- -- return pti; --} -- --/************************************** -- * Call finally blocks in the current stack frame until stop_index. -- * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c -- */ -- --void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, -- struct DEstablisherFrame *frame, int stop_index) --{ -- struct DHandlerInfo *phi; -- struct DCatchInfo *pci; -- int i; -- -- // Set up a special exception handler to catch double-fault exceptions. -- __asm -- { -- push dword ptr -1 -- push dword ptr 0 -- push offset _local_except_handler // defined in src\win32\ehsup.c -- push dword ptr fs:_except_list -- mov FS:_except_list,ESP -- } -- -- for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index) -- { -- phi = &handler_table->handler_info[i]; -- if (phi->finally_code) -- { -- // Note that it is unnecessary to adjust the ESP, as the finally block -- // accesses all items on the stack as relative to EBP. -- -- DWORD *catch_ebp = &frame->ebp; -- void *blockaddr = phi->finally_code; -- -- _asm -- { -- push EBX -- mov EBX,blockaddr -- push EBP -- mov EBP,catch_ebp -- call EBX -- pop EBP -- pop EBX -- } -- } -- } -- -- _asm -- { -- pop FS:_except_list -- add ESP,12 -- } --} -- --/*********************************** -- * external version of the unwinder -- */ -- --__declspec(naked) void __cdecl _d_local_unwind2() --{ -- __asm -- { -- jmp _d_local_unwind -- } --} -- --/*********************************** -- * The frame handler, this is called for each frame that has been registered -- * in the OS except_list. -- * Input: -- * EAX the handler table for the frame -- */ -- --EXCEPTION_DISPOSITION _d_monitor_handler( -- EXCEPTION_RECORD *exception_record, -- struct DEstablisherFrame *frame, -- CONTEXT *context, -- void *dispatcher_context) --{ -- if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) -- { -- _d_monitorexit((Object *)frame->table_index); -- } -- else -- { -- } -- return ExceptionContinueSearch; --} -- --/*********************************** -- */ -- --void _d_monitor_prolog(void *x, void *y, Object *h) --{ -- __asm -- { -- push EAX -- } -- //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h); -- _d_monitorenter(h); -- __asm -- { -- pop EAX -- } --} -- --/*********************************** -- */ -- --void _d_monitor_epilog(void *x, void *y, Object *h) --{ -- //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h); -- __asm -- { -- push EAX -- push EDX -- } -- _d_monitorexit(h); -- __asm -- { -- pop EDX -- pop EAX -- } --} -- --#endif -- --/* ======================== linux =============================== */ -- --#if linux -- --#include "mars.h" -- --extern ClassInfo D6object9Throwable7__ClassZ; --#define _Class_9Throwable D6object9Throwable7__ClassZ; -- --extern ClassInfo D6object5Error7__ClassZ; --#define _Class_5Error D6object5Error7__ClassZ -- --typedef int (*fp_t)(); // function pointer in ambient memory model -- --struct DHandlerInfo --{ -- unsigned offset; // offset from function address to start of guarded section -- int prev_index; // previous table index -- unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) -- void *finally_code; // pointer to finally code to execute -- // (!=0 if try-finally) --}; -- --// Address of DHandlerTable, searched for by eh_finddata() -- --struct DHandlerTable --{ -- void *fptr; // pointer to start of function -- unsigned espoffset; // offset of ESP from EBP -- unsigned retoffset; // offset from start of function to return code -- unsigned nhandlers; // dimension of handler_info[] -- struct DHandlerInfo handler_info[1]; --}; -- --struct DCatchBlock --{ -- ClassInfo *type; // catch type -- unsigned bpoffset; // EBP offset of catch var -- void *code; // catch handler code --}; -- --// Create one of these for each try-catch --struct DCatchInfo --{ -- unsigned ncatches; // number of catch blocks -- struct DCatchBlock catch_block[1]; // data for each catch block --}; -- --// One of these is generated for each function with try-catch or try-finally -- --struct FuncTable --{ -- void *fptr; // pointer to start of function -- struct DHandlerTable *handlertable; // eh data for this function -- unsigned size; // size of function in bytes --}; -- --extern struct FuncTable *table_start; --extern struct FuncTable *table_end; -- --void terminate() --{ --// _asm --// { --// hlt --// } --} -- --/******************************************* -- * Given address that is inside a function, -- * figure out which function it is in. -- * Return DHandlerTable if there is one, NULL if not. -- */ -- --struct DHandlerTable *__eh_finddata(void *address) --{ -- struct FuncTable *ft; -- -- for (ft = (struct FuncTable *)table_start; -- ft < (struct FuncTable *)table_end; -- ft++) -- { -- if (ft->fptr <= address && -- address < (void *)((char *)ft->fptr + ft->size)) -- { -- return ft->handlertable; -- } -- } -- return NULL; --} -- -- --/****************************** -- * Given EBP, find return address to caller, and caller's EBP. -- * Input: -- * regbp Value of EBP for current function -- * *pretaddr Return address -- * Output: -- * *pretaddr return address to caller -- * Returns: -- * caller's EBP -- */ -- --unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr) --{ -- unsigned bp = *(unsigned *)regbp; -- -- if (bp) // if not end of call chain -- { -- // Perform sanity checks on new EBP. -- // If it is screwed up, terminate() hopefully before we do more damage. -- if (bp <= regbp) -- // stack should grow to smaller values -- terminate(); -- -- *pretaddr = *(unsigned *)(regbp + sizeof(int)); -- } -- return bp; --} -- --/*********************************** -- * Throw a D object. -- */ -- --void __stdcall _d_throw(Object *h) --{ -- unsigned regebp; -- -- //printf("_d_throw(h = %p, &h = %p)\n", h, &h); -- //printf("\tvptr = %p\n", *(void **)h); -- -- regebp = _EBP; -- -- while (1) // for each function on the stack -- { -- struct DHandlerTable *handler_table; -- struct FuncTable *pfunc; -- struct DHandlerInfo *phi; -- unsigned retaddr; -- unsigned funcoffset; -- unsigned spoff; -- unsigned retoffset; -- int index; -- int dim; -- int ndx; -- int prev_ndx; -- -- regebp = __eh_find_caller(regebp,&retaddr); -- if (!regebp) -- // if end of call chain -- break; -- -- handler_table = __eh_finddata((void *)retaddr); // find static data associated with function -- if (!handler_table) // if no static data -- { -- continue; -- } -- funcoffset = (unsigned)handler_table->fptr; -- spoff = handler_table->espoffset; -- retoffset = handler_table->retoffset; -- --#ifdef DEBUG -- printf("retaddr = x%x\n",(unsigned)retaddr); -- printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", -- regebp,funcoffset,spoff,retoffset); --#endif -- -- // Find start index for retaddr in static data -- dim = handler_table->nhandlers; -- index = -1; -- for (int i = 0; i < dim; i++) -- { -- phi = &handler_table->handler_info[i]; -- -- if ((unsigned)retaddr >= funcoffset + phi->offset) -- index = i; -- } -- -- // walk through handler table, checking each handler -- // with an index smaller than the current table_index -- for (ndx = index; ndx != -1; ndx = prev_ndx) -- { -- phi = &handler_table->handler_info[ndx]; -- prev_ndx = phi->prev_index; -- if (phi->cioffset) -- { -- // this is a catch handler (no finally) -- struct DCatchInfo *pci; -- int ncatches; -- int i; -- -- pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); -- ncatches = pci->ncatches; -- for (i = 0; i < ncatches; i++) -- { -- struct DCatchBlock *pcb; -- ClassInfo *ci = **(ClassInfo ***)h; -- -- pcb = &pci->catch_block[i]; -- -- if (_d_isbaseof(ci, pcb->type)) -- { // Matched the catch type, so we've found the handler. -- -- // Initialize catch variable -- *(void **)(regebp + (pcb->bpoffset)) = h; -- -- // Jump to catch block. Does not return. -- { -- unsigned catch_esp; -- fp_t catch_addr; -- -- catch_addr = (fp_t)(pcb->code); -- catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); -- _asm -- { -- mov EAX,catch_esp -- mov ECX,catch_addr -- mov [EAX],ECX -- mov EBP,regebp -- mov ESP,EAX // reset stack -- ret // jump to catch block -- } -- } -- } -- } -- } -- else if (phi->finally_code) -- { // Call finally block -- // Note that it is unnecessary to adjust the ESP, as the finally block -- // accesses all items on the stack as relative to EBP. -- -- void *blockaddr = phi->finally_code; -- -- _asm -- { -- push EBX -- mov EBX,blockaddr -- push EBP -- mov EBP,regebp -- call EBX -- pop EBP -- pop EBX -- } -- } -- } -- } --} -- -- --#endif -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh2.d druntime/src/rt/deh2.d ---- druntime-old/src/rt/deh2.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/deh2.d 1970-01-01 03:00:00.000000000 +0300 -@@ -1,322 +0,0 @@ --/** -- * Implementation of exception handling support routines for Posix. -- * -- * Copyright: Copyright Digital Mars 2000 - 2009. -- * License: Boost License 1.0. -- * Authors: Walter Bright -- * -- * Copyright Digital Mars 2000 - 2009. -- * Distributed under the Boost Software License, Version 1.0. -- * (See accompanying file LICENSE_1_0.txt or copy at -- * http://www.boost.org/LICENSE_1_0.txt) -- */ --module rt.deh2; -- --//debug=1; -- --extern (C) --{ -- extern __gshared -- { -- void* _deh_beg; -- void* _deh_end; -- } -- -- int _d_isbaseof(ClassInfo oc, ClassInfo c); -- -- void _d_setunhandled(Object* o); --} -- --alias int (*fp_t)(); // function pointer in ambient memory model -- --struct DHandlerInfo --{ -- uint offset; // offset from function address to start of guarded section -- uint endoffset; // offset of end of guarded section -- int prev_index; // previous table index -- uint cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) -- void *finally_code; // pointer to finally code to execute -- // (!=0 if try-finally) --} -- --// Address of DHandlerTable, searched for by eh_finddata() -- --struct DHandlerTable --{ -- void *fptr; // pointer to start of function -- uint espoffset; // offset of ESP from EBP -- uint retoffset; // offset from start of function to return code -- uint nhandlers; // dimension of handler_info[] -- DHandlerInfo handler_info[1]; --} -- --struct DCatchBlock --{ -- ClassInfo type; // catch type -- uint bpoffset; // EBP offset of catch var -- void *code; // catch handler code --} -- --// Create one of these for each try-catch --struct DCatchInfo --{ -- uint ncatches; // number of catch blocks -- DCatchBlock catch_block[1]; // data for each catch block --} -- --// One of these is generated for each function with try-catch or try-finally -- --struct FuncTable --{ -- void *fptr; // pointer to start of function -- DHandlerTable *handlertable; // eh data for this function -- uint fsize; // size of function in bytes --} -- --void terminate() --{ -- asm -- { -- hlt ; -- } --} -- --/******************************************* -- * Given address that is inside a function, -- * figure out which function it is in. -- * Return DHandlerTable if there is one, NULL if not. -- */ -- --DHandlerTable *__eh_finddata(void *address) --{ -- FuncTable *ft; -- --// debug printf("__eh_finddata(address = x%x)\n", address); --// debug printf("_deh_beg = x%x, _deh_end = x%x\n", &_deh_beg, &_deh_end); -- for (ft = cast(FuncTable *)&_deh_beg; -- ft < cast(FuncTable *)&_deh_end; -- ft++) -- { --// debug printf("\tfptr = x%x, fsize = x%03x, handlertable = x%x\n", --// ft.fptr, ft.fsize, ft.handlertable); -- -- if (ft.fptr <= address && -- address < cast(void *)(cast(char *)ft.fptr + ft.fsize)) -- { --// debug printf("\tfound handler table\n"); -- return ft.handlertable; -- } -- } --// debug printf("\tnot found\n"); -- return null; --} -- -- --/****************************** -- * Given EBP, find return address to caller, and caller's EBP. -- * Input: -- * regbp Value of EBP for current function -- * *pretaddr Return address -- * Output: -- * *pretaddr return address to caller -- * Returns: -- * caller's EBP -- */ -- --uint __eh_find_caller(uint regbp, uint *pretaddr) --{ -- uint bp = *cast(uint *)regbp; -- -- if (bp) // if not end of call chain -- { -- // Perform sanity checks on new EBP. -- // If it is screwed up, terminate() hopefully before we do more damage. -- if (bp <= regbp) -- // stack should grow to smaller values -- terminate(); -- -- *pretaddr = *cast(uint *)(regbp + int.sizeof); -- } -- return bp; --} -- --/*********************************** -- * Throw a D object. -- */ -- --extern (Windows) void _d_throw(Object *h) --{ -- uint regebp; -- -- debug -- { -- printf("_d_throw(h = %p, &h = %p)\n", h, &h); -- printf("\tvptr = %p\n", *cast(void **)h); -- } -- -- asm -- { -- mov regebp,EBP ; -- } -- -- _d_setunhandled(h); -- --//static uint abc; --//if (++abc == 2) *(char *)0=0; -- --//int count = 0; -- while (1) // for each function on the stack -- { -- DHandlerTable *handler_table; -- FuncTable *pfunc; -- DHandlerInfo *phi; -- uint retaddr; -- uint funcoffset; -- uint spoff; -- uint retoffset; -- int index; -- int dim; -- int ndx; -- int prev_ndx; -- -- regebp = __eh_find_caller(regebp,&retaddr); -- if (!regebp) -- { // if end of call chain -- debug printf("end of call chain\n"); -- break; -- } -- -- debug printf("found caller, EBP = x%x, retaddr = x%x\n", regebp, retaddr); --//if (++count == 12) *(char*)0=0; -- handler_table = __eh_finddata(cast(void *)retaddr); // find static data associated with function -- if (!handler_table) // if no static data -- { -- debug printf("no handler table\n"); -- continue; -- } -- funcoffset = cast(uint)handler_table.fptr; -- spoff = handler_table.espoffset; -- retoffset = handler_table.retoffset; -- -- debug -- { -- printf("retaddr = x%x\n",cast(uint)retaddr); -- printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", -- regebp,funcoffset,spoff,retoffset); -- } -- -- // Find start index for retaddr in static data -- dim = handler_table.nhandlers; -- -- debug -- { -- printf("handler_info[]:\n"); -- for (int i = 0; i < dim; i++) -- { -- phi = &handler_table.handler_info[i]; -- printf("\t[%d]: offset = x%04x, endoffset = x%04x, prev_index = %d, cioffset = x%04x, finally_code = %x\n", -- i, phi.offset, phi.endoffset, phi.prev_index, phi.cioffset, phi.finally_code); -- } -- } -- -- index = -1; -- for (int i = 0; i < dim; i++) -- { -- phi = &handler_table.handler_info[i]; -- -- debug printf("i = %d, phi.offset = %04x\n", i, funcoffset + phi.offset); -- if (cast(uint)retaddr > funcoffset + phi.offset && -- cast(uint)retaddr <= funcoffset + phi.endoffset) -- index = i; -- } -- debug printf("index = %d\n", index); -- -- // walk through handler table, checking each handler -- // with an index smaller than the current table_index -- for (ndx = index; ndx != -1; ndx = prev_ndx) -- { -- phi = &handler_table.handler_info[ndx]; -- prev_ndx = phi.prev_index; -- if (phi.cioffset) -- { -- // this is a catch handler (no finally) -- DCatchInfo *pci; -- int ncatches; -- int i; -- -- pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset); -- ncatches = pci.ncatches; -- for (i = 0; i < ncatches; i++) -- { -- DCatchBlock *pcb; -- ClassInfo ci = **cast(ClassInfo **)h; -- -- pcb = &pci.catch_block[i]; -- -- if (_d_isbaseof(ci, pcb.type)) -- { // Matched the catch type, so we've found the handler. -- -- _d_setunhandled(null); -- -- // Initialize catch variable -- *cast(void **)(regebp + (pcb.bpoffset)) = h; -- -- // Jump to catch block. Does not return. -- { -- uint catch_esp; -- fp_t catch_addr; -- -- catch_addr = cast(fp_t)(pcb.code); -- catch_esp = regebp - handler_table.espoffset - fp_t.sizeof; -- asm -- { -- mov EAX,catch_esp ; -- mov ECX,catch_addr ; -- mov [EAX],ECX ; -- mov EBP,regebp ; -- mov ESP,EAX ; // reset stack -- ret ; // jump to catch block -- } -- } -- } -- } -- } -- else if (phi.finally_code) -- { // Call finally block -- // Note that it is unnecessary to adjust the ESP, as the finally block -- // accesses all items on the stack as relative to EBP. -- -- void *blockaddr = phi.finally_code; -- -- version (OSX) -- { -- asm -- { -- sub ESP,4 ; // align stack to 16 -- push EBX ; -- mov EBX,blockaddr ; -- push EBP ; -- mov EBP,regebp ; -- call EBX ; -- pop EBP ; -- pop EBX ; -- add ESP,4 ; -- } -- } -- else -- { -- asm -- { -- push EBX ; -- mov EBX,blockaddr ; -- push EBP ; -- mov EBP,regebp ; -- call EBX ; -- pop EBP ; -- pop EBX ; -- } -- } -- } -- } -- } --} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/eh.d druntime/src/rt/eh.d ---- druntime-old/src/rt/eh.d 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/src/rt/eh.d 2010-10-03 18:29:58.099624002 +0400 -@@ -0,0 +1,428 @@ -+/** -+ * This module contains functions and structures required for -+ * exception handling. -+ */ -+module eh; -+ -+private import core.stdc.stdio; -+private import core.stdc.stdlib; -+private import rt.util.console; -+private import ldc.cstdarg; -+ -+// debug = EH_personality; -+// debug = EH_personality_verbose; -+ -+// current EH implementation works on x86 -+// if it has a working unwind runtime -+version(X86) { -+ version(linux) version=X86_UNWIND; -+ version(darwin) version=X86_UNWIND; -+ version(solaris) version=X86_UNWIND; -+} -+version(X86_64) { -+ version(linux) version=X86_UNWIND; -+ version(darwin) version=X86_UNWIND; -+ version(solaris) version=X86_UNWIND; -+} -+ -+//version = HP_LIBUNWIND; -+ -+// D runtime functions -+extern(C) { -+ int _d_isbaseof(ClassInfo oc, ClassInfo c); -+} -+ -+// libunwind headers -+extern(C) -+{ -+ enum _Unwind_Reason_Code : int -+ { -+ NO_REASON = 0, -+ FOREIGN_EXCEPTION_CAUGHT = 1, -+ FATAL_PHASE2_ERROR = 2, -+ FATAL_PHASE1_ERROR = 3, -+ NORMAL_STOP = 4, -+ END_OF_STACK = 5, -+ HANDLER_FOUND = 6, -+ INSTALL_CONTEXT = 7, -+ CONTINUE_UNWIND = 8 -+ } -+ -+ enum _Unwind_Action : int -+ { -+ SEARCH_PHASE = 1, -+ CLEANUP_PHASE = 2, -+ HANDLER_FRAME = 4, -+ FORCE_UNWIND = 8 -+ } -+ -+ alias void* _Unwind_Context_Ptr; -+ -+ alias void function(_Unwind_Reason_Code, _Unwind_Exception*) _Unwind_Exception_Cleanup_Fn; -+ -+ struct _Unwind_Exception -+ { -+ ulong exception_class; -+ _Unwind_Exception_Cleanup_Fn exception_cleanup; -+ ptrdiff_t private_1; -+ ptrdiff_t private_2; -+ } -+ -+// interface to HP's libunwind from http://www.nongnu.org/libunwind/ -+version(HP_LIBUNWIND) -+{ -+ void __libunwind_Unwind_Resume(_Unwind_Exception *); -+ _Unwind_Reason_Code __libunwind_Unwind_RaiseException(_Unwind_Exception *); -+ ptrdiff_t __libunwind_Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr -+ context); -+ ptrdiff_t __libunwind_Unwind_GetIP(_Unwind_Context_Ptr context); -+ ptrdiff_t __libunwind_Unwind_SetIP(_Unwind_Context_Ptr context, -+ ptrdiff_t new_value); -+ ptrdiff_t __libunwind_Unwind_SetGR(_Unwind_Context_Ptr context, int index, -+ ptrdiff_t new_value); -+ ptrdiff_t __libunwind_Unwind_GetRegionStart(_Unwind_Context_Ptr context); -+ -+ alias __libunwind_Unwind_Resume _Unwind_Resume; -+ alias __libunwind_Unwind_RaiseException _Unwind_RaiseException; -+ alias __libunwind_Unwind_GetLanguageSpecificData -+ _Unwind_GetLanguageSpecificData; -+ alias __libunwind_Unwind_GetIP _Unwind_GetIP; -+ alias __libunwind_Unwind_SetIP _Unwind_SetIP; -+ alias __libunwind_Unwind_SetGR _Unwind_SetGR; -+ alias __libunwind_Unwind_GetRegionStart _Unwind_GetRegionStart; -+} -+else version(X86_UNWIND) -+{ -+ void _Unwind_Resume(_Unwind_Exception*); -+ _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*); -+ ptrdiff_t _Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr context); -+ ptrdiff_t _Unwind_GetIP(_Unwind_Context_Ptr context); -+ ptrdiff_t _Unwind_SetIP(_Unwind_Context_Ptr context, ptrdiff_t new_value); -+ ptrdiff_t _Unwind_SetGR(_Unwind_Context_Ptr context, int index, -+ ptrdiff_t new_value); -+ ptrdiff_t _Unwind_GetRegionStart(_Unwind_Context_Ptr context); -+} -+else -+{ -+ // runtime calls these directly -+ void _Unwind_Resume(_Unwind_Exception*) -+ { -+ console("_Unwind_Resume is not implemented on this platform.\n"); -+ } -+ _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*) -+ { -+ console("_Unwind_RaiseException is not implemented on this platform.\n"); -+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; -+ } -+} -+ -+} -+ -+// error and exit -+extern(C) private void fatalerror(in char* format, ...) -+{ -+ va_list args; -+ va_start(args, format); -+ printf("Fatal error in EH code: "); -+ vprintf(format, args); -+ printf("\n"); -+ abort(); -+} -+ -+ -+// helpers for reading certain DWARF data -+private ubyte* get_uleb128(ubyte* addr, ref size_t res) -+{ -+ res = 0; -+ size_t bitsize = 0; -+ -+ // read as long as high bit is set -+ while(*addr & 0x80) { -+ res |= (*addr & 0x7f) << bitsize; -+ bitsize += 7; -+ addr += 1; -+ if(bitsize >= size_t.sizeof*8) -+ fatalerror("tried to read uleb128 that exceeded size of size_t"); -+ } -+ // read last -+ if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize) -+ fatalerror("Fatal error in EH code: tried to read uleb128 that exceeded size of size_t"); -+ res |= (*addr) << bitsize; -+ -+ return addr + 1; -+} -+ -+private ubyte* get_sleb128(ubyte* addr, ref ptrdiff_t res) -+{ -+ res = 0; -+ size_t bitsize = 0; -+ -+ // read as long as high bit is set -+ while(*addr & 0x80) { -+ res |= (*addr & 0x7f) << bitsize; -+ bitsize += 7; -+ addr += 1; -+ if(bitsize >= size_t.sizeof*8) -+ fatalerror("tried to read sleb128 that exceeded size of size_t"); -+ } -+ // read last -+ if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize) -+ fatalerror("tried to read sleb128 that exceeded size of size_t"); -+ res |= (*addr) << bitsize; -+ -+ // take care of sign -+ if(bitsize < size_t.sizeof*8 && ((*addr) & 0x40)) -+ res |= cast(ptrdiff_t)(-1) ^ ((1 << (bitsize+7)) - 1); -+ -+ return addr + 1; -+} -+ -+ -+// exception struct used by the runtime. -+// _d_throw allocates a new instance and passes the address of its -+// _Unwind_Exception member to the unwind call. The personality -+// routine is then able to get the whole struct by looking at the data -+// surrounding the unwind info. -+struct _d_exception -+{ -+ Object exception_object; -+ _Unwind_Exception unwind_info; -+} -+ -+// the 8-byte string identifying the type of exception -+// the first 4 are for vendor, the second 4 for language -+//TODO: This may be the wrong way around -+char[8] _d_exception_class = "LLDCD1\0\0"; -+ -+ -+// -+// x86 unwind specific implementation of personality function -+// and helpers -+// -+version(X86_UNWIND) +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/alloca.d druntime/src/rt/alloca.d +--- druntime-orig/src/rt/alloca.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/alloca.d 2010-10-08 22:31:50.989547000 +0400 +@@ -12,6 +12,9 @@ + */ + module rt.alloca; + ++version (DMD) +{ + -+// the personality routine gets called by the unwind handler and is responsible for -+// reading the EH tables and deciding what to do -+extern(C) _Unwind_Reason_Code _d_eh_personality(int ver, _Unwind_Action actions, ulong exception_class, _Unwind_Exception* exception_info, _Unwind_Context_Ptr context) -+{ -+ debug(EH_personality_verbose) printf("entering personality function. context: %p\n", context); -+ // check ver: the C++ Itanium ABI only allows ver == 1 -+ if(ver != 1) -+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; -+ -+ // check exceptionClass -+ //TODO: Treat foreign exceptions with more respect -+ if((cast(char*)&exception_class)[0..8] != _d_exception_class) -+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; -+ -+ // find call site table, action table and classinfo table -+ // Note: callsite and action tables do not contain static-length -+ // data and will be parsed as needed -+ // Note: classinfo_table points past the end of the table -+ ubyte* callsite_table; -+ ubyte* action_table; -+ ClassInfo* classinfo_table; -+ _d_getLanguageSpecificTables(context, callsite_table, action_table, classinfo_table); -+ if (callsite_table is null) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ -+ /* -+ find landing pad and action table index belonging to ip by walking -+ the callsite_table -+ */ -+ ubyte* callsite_walker = callsite_table; -+ -+ // get the instruction pointer -+ // will be used to find the right entry in the callsite_table -+ // -1 because it will point past the last instruction -+ ptrdiff_t ip = _Unwind_GetIP(context) - 1; -+ -+ // address block_start is relative to -+ ptrdiff_t region_start = _Unwind_GetRegionStart(context); -+ -+ // table entries -+ uint block_start_offset, block_size; -+ ptrdiff_t landing_pad; -+ size_t action_offset; -+ -+ while(true) { -+ // if we've gone through the list and found nothing... -+ if(callsite_walker >= action_table) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ -+ block_start_offset = *cast(uint*)callsite_walker; -+ block_size = *(cast(uint*)callsite_walker + 1); -+ landing_pad = *(cast(uint*)callsite_walker + 2); -+ if(landing_pad) -+ landing_pad += region_start; -+ callsite_walker = get_uleb128(callsite_walker + 3*uint.sizeof, action_offset); -+ -+ debug(EH_personality_verbose) printf("ip=%llx %d %d %llx\n", ip, block_start_offset, block_size, landing_pad); -+ -+ // since the list is sorted, as soon as we're past the ip -+ // there's no handler to be found -+ if(ip < region_start + block_start_offset) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ -+ // if we've found our block, exit -+ if(ip < region_start + block_start_offset + block_size) -+ break; -+ } -+ -+ debug(EH_personality) printf("Found correct landing pad and actionOffset %d\n", action_offset); -+ -+ // now we need the exception's classinfo to find a handler -+ // the exception_info is actually a member of a larger _d_exception struct -+ // the runtime allocated. get that now -+ _d_exception* exception_struct = cast(_d_exception*)(cast(ubyte*)exception_info - _d_exception.unwind_info.offsetof); -+ -+ // if there's no action offset and no landing pad, continue unwinding -+ if(!action_offset && !landing_pad) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ -+ // if there's no action offset but a landing pad, this is a cleanup handler -+ else if(!action_offset && landing_pad) -+ return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context); -+ -+ /* -+ walk action table chain, comparing classinfos using _d_isbaseof -+ */ -+ ubyte* action_walker = action_table + action_offset - 1; -+ -+ ptrdiff_t ti_offset, next_action_offset; -+ while(true) { -+ action_walker = get_sleb128(action_walker, ti_offset); -+ // it is intentional that we not modify action_walker here -+ // next_action_offset is from current action_walker position -+ get_sleb128(action_walker, next_action_offset); -+ -+ // negative are 'filters' which we don't use -+ if(!(ti_offset >= 0)) -+ fatalerror("Filter actions are unsupported"); -+ -+ // zero means cleanup, which we require to be the last action -+ if(ti_offset == 0) { -+ if(!(next_action_offset == 0)) -+ fatalerror("Cleanup action must be last in chain"); -+ return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context); -+ } -+ -+ // get classinfo for action and check if the one in the -+ // exception structure is a base -+ ClassInfo catch_ci = *(classinfo_table - ti_offset); -+ debug(EH_personality) printf("Comparing catch %s to exception %s\n", catch_ci.name.ptr, exception_struct.exception_object.classinfo.name.ptr); -+ if(_d_isbaseof(exception_struct.exception_object.classinfo, catch_ci)) -+ return _d_eh_install_catch_context(actions, ti_offset, landing_pad, exception_struct, context); -+ -+ // we've walked through all actions and found nothing... -+ if(next_action_offset == 0) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ else -+ action_walker += next_action_offset; -+ } -+ -+ fatalerror("reached unreachable"); -+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; -+} -+ -+// These are the register numbers for SetGR that -+// llvm's eh.exception and eh.selector intrinsics -+// will pick up. -+// Hints for these can be found by looking at the -+// EH_RETURN_DATA_REGNO macro in GCC, careful testing -+// is required though. -+version (X86_64) -+{ -+ private int eh_exception_regno = 0; -+ private int eh_selector_regno = 1; -+} else { -+ private int eh_exception_regno = 0; -+ private int eh_selector_regno = 2; -+} -+ -+private _Unwind_Reason_Code _d_eh_install_catch_context(_Unwind_Action actions, ptrdiff_t switchval, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context) -+{ -+ debug(EH_personality) printf("Found catch clause!\n"); -+ -+ if(actions & _Unwind_Action.SEARCH_PHASE) -+ return _Unwind_Reason_Code.HANDLER_FOUND; -+ -+ else if(actions & _Unwind_Action.CLEANUP_PHASE) -+ { -+ debug(EH_personality) printf("Setting switch value to: %d!\n", switchval); -+ _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)cast(void*)(exception_struct.exception_object)); -+ _Unwind_SetGR(context, eh_selector_regno, cast(ptrdiff_t)switchval); -+ _Unwind_SetIP(context, landing_pad); -+ return _Unwind_Reason_Code.INSTALL_CONTEXT; -+ } -+ -+ fatalerror("reached unreachable"); -+ return _Unwind_Reason_Code.FATAL_PHASE2_ERROR; -+} -+ -+private _Unwind_Reason_Code _d_eh_install_finally_context(_Unwind_Action actions, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context) -+{ -+ // if we're merely in search phase, continue -+ if(actions & _Unwind_Action.SEARCH_PHASE) -+ return _Unwind_Reason_Code.CONTINUE_UNWIND; -+ -+ debug(EH_personality) printf("Calling cleanup routine...\n"); -+ -+ _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)exception_struct); -+ _Unwind_SetGR(context, eh_selector_regno, 0); -+ _Unwind_SetIP(context, landing_pad); -+ return _Unwind_Reason_Code.INSTALL_CONTEXT; -+} -+ -+private void _d_getLanguageSpecificTables(_Unwind_Context_Ptr context, ref ubyte* callsite, ref ubyte* action, ref ClassInfo* ci) -+{ -+ ubyte* data = cast(ubyte*)_Unwind_GetLanguageSpecificData(context); -+ if (data is null) -+ { -+ //printf("language specific data was null\n"); -+ callsite = null; -+ action = null; -+ ci = null; -+ return; -+ } -+ -+ //TODO: Do proper DWARF reading here -+ if(*data++ != 0xff) -+ fatalerror("DWARF header has unexpected format 1"); -+ -+ if(*data++ != 0x00) -+ fatalerror("DWARF header has unexpected format 2"); -+ size_t cioffset; -+ data = get_uleb128(data, cioffset); -+ ci = cast(ClassInfo*)(data + cioffset); -+ -+ if(*data++ != 0x03) -+ fatalerror("DWARF header has unexpected format 3"); -+ size_t callsitelength; -+ data = get_uleb128(data, callsitelength); -+ action = data + callsitelength; -+ -+ callsite = data; -+} -+ -+} // end of x86 Linux specific implementation -+ -+ -+extern(C) void _d_throw_exception(Object e) -+{ -+ if (e !is null) -+ { -+ _d_exception* exc_struct = new _d_exception; -+ exc_struct.unwind_info.exception_class = *cast(ulong*)_d_exception_class.ptr; -+ exc_struct.exception_object = e; -+ _Unwind_Reason_Code ret = _Unwind_RaiseException(&exc_struct.unwind_info); -+ console("_Unwind_RaiseException failed with reason code: ")(ret)("\n"); -+ } -+ abort(); -+} -+ -+extern(C) void _d_eh_resume_unwind(_d_exception* exception_struct) -+{ -+ _Unwind_Resume(&exception_struct.unwind_info); -+} -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/lifetime.d druntime/src/rt/lifetime.d ---- druntime-old/src/rt/lifetime.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/lifetime.d 2010-10-08 14:55:56.581547002 +0400 -@@ -81,6 +81,28 @@ - MAXSMALLSIZE = 256-SMALLPAD, - MAXMEDSIZE = (PAGESIZE / 2) - MEDPAD - } -+ -+ version( LDC ) -+ { -+ size_t length_adjust(size_t sizeelem, size_t newlength) -+ { -+ size_t newsize = void; -+ static if (size_t.sizeof < ulong.sizeof) -+ { -+ ulong s = cast(ulong)sizeelem * cast(ulong)newlength; -+ if (s > size_t.max) -+ onOutOfMemoryError(); -+ newsize = cast(size_t)s; -+ } -+ else -+ { -+ newsize = sizeelem * newlength; -+ if (newsize / newlength != sizeelem) -+ onOutOfMemoryError(); -+ } -+ return newsize; -+ } -+ } + /+ + #if DOS386 + extern size_t _x386_break; +@@ -133,3 +136,5 @@ + ret ; + } } - - -@@ -92,6 +114,13 @@ ++ ++} +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/lifetime.d druntime/src/rt/lifetime.d +--- druntime-orig/src/rt/lifetime.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/lifetime.d 2010-10-29 10:40:39.533035001 +0400 +@@ -92,6 +92,18 @@ return gc_malloc(sz); } ++version (LDC) ++{ ++ +/** + * for allocating a single POD value + */ @@ -13434,10 +751,12 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. +{ + return gc_malloc(ti.tsize(), !(ti.flags() & 1) ? BlkAttr.NO_SCAN : 0); +} ++ ++} // version (LDC) /** * -@@ -670,7 +699,7 @@ +@@ -670,7 +682,7 @@ * ti is the type of the resulting array, or pointer to element. * (For when the array is initialized to 0) */ @@ -13446,7 +765,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { ulong result; auto size = ti.next.tsize(); // array element size -@@ -702,7 +731,7 @@ +@@ -702,7 +714,7 @@ __setArrayAllocLength(info, size, isshared); result = cast(ulong)length + (cast(ulong)cast(size_t)arrstart << 32); } @@ -13455,7 +774,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. Loverflow: onOutOfMemoryError(); -@@ -711,7 +740,7 @@ +@@ -711,7 +723,7 @@ /** * For when the array has a non-zero initializer. */ @@ -13464,7 +783,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { ulong result; auto size = ti.next.tsize(); // array element size -@@ -764,7 +793,7 @@ +@@ -764,7 +776,7 @@ __setArrayAllocLength(info, size, isshared); result = cast(ulong)length + (cast(ulong)cast(uint)arrstart << 32); } @@ -13473,7 +792,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. Loverflow: onOutOfMemoryError(); -@@ -773,7 +802,7 @@ +@@ -773,7 +785,7 @@ /** * */ @@ -13482,7 +801,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { ulong result; -@@ -823,14 +852,14 @@ +@@ -823,14 +835,14 @@ } va_end(q); } @@ -13499,7 +818,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { ulong result; -@@ -881,10 +910,9 @@ +@@ -881,7 +893,7 @@ } va_end(q); } @@ -13507,34 +826,8 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. + return *cast(void[]*)&result; } -- - /** - * - */ -@@ -1046,7 +1074,7 @@ - /** - * Resize dynamic arrays with 0 initializers. - */ --extern (C) byte[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p) -+extern (C) void[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p) - in - { - assert(ti); -@@ -1206,7 +1234,7 @@ - * initsize size of initializer - * ... initializer - */ --extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) -+extern (C) void[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) - in - { - assert(!p.length || p.data); -@@ -1376,12 +1404,11 @@ - onOutOfMemoryError(); - } -- - /** +@@ -1381,7 +1393,7 @@ * Append y[] to array pointed to by px * size is size of each array element. */ @@ -13543,7 +836,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { // only optimize array append where ti is not a shared type auto sizeelem = ti.next.tsize(); // array element size -@@ -1468,10 +1495,9 @@ +@@ -1468,7 +1480,7 @@ L1: px.length = newlength; memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem); @@ -13551,11 +844,8 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. + return *cast(void[]*)px; } -- - /** - * - */ -@@ -1552,21 +1578,36 @@ + +@@ -1552,21 +1564,36 @@ return newcap; } @@ -13594,7 +884,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { // c could encode into from 1 to 4 characters char[4] buf = void; -@@ -1612,7 +1653,7 @@ +@@ -1612,7 +1639,7 @@ /** * Append dchar to wchar[] */ @@ -13603,19 +893,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { // c could encode into from 1 to 2 w characters wchar[2] buf = void; -@@ -1641,7 +1682,6 @@ - return _d_arrayappendT(typeid(shared wchar[]), cast(Array *)&x, appendthis); - } - -- - /** - * - */ -@@ -1794,11 +1834,10 @@ - void* ptr; - } - -- +@@ -1798,7 +1825,7 @@ /** * */ @@ -13624,7 +902,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. out (result) { auto sizeelem = ti.next.tsize(); // array element size -@@ -1819,7 +1858,7 @@ +@@ -1819,7 +1846,7 @@ r.length = a.length; memcpy(r.ptr, a.ptr, size); } @@ -13633,10 +911,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort.d druntime/src/rt/qsort.d ---- druntime-old/src/rt/qsort.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/qsort.d 2010-10-07 13:59:06.815253002 +0400 -@@ -44,7 +44,7 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/qsort.d druntime/src/rt/qsort.d +--- druntime-orig/src/rt/qsort.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/qsort.d 2010-10-07 13:59:06.815253002 +0400 +@@ -44,7 +44,7 @@ structures. The default value is optimized for a high cost for compares. */ @@ -13645,7 +923,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. { byte* base; byte*[40] stack; // stack -@@ -124,7 +124,7 @@ +@@ -124,7 +124,7 @@ limit = sp[1]; } else // else stack empty, all done @@ -13654,10 +932,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } assert(0); } -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort2.d druntime/src/rt/qsort2.d ---- druntime-old/src/rt/qsort2.d 2010-08-05 05:39:06.000000000 +0400 -+++ druntime/src/rt/qsort2.d 2010-10-07 14:01:41.359253001 +0400 -@@ -31,14 +31,14 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/qsort2.d druntime/src/rt/qsort2.d +--- druntime-orig/src/rt/qsort2.d 2010-08-05 05:39:06.000000000 +0400 ++++ druntime/src/rt/qsort2.d 2010-10-07 14:01:41.359253001 +0400 +@@ -31,14 +31,14 @@ return tiglobal.compare(p1, p2); } @@ -13674,10 +952,10 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. } -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/trace.d druntime/src/rt/trace.d ---- druntime-old/src/rt/trace.d 2010-08-07 09:46:06.000000000 +0400 -+++ druntime/src/rt/trace.d 2010-10-01 21:01:58.444892002 +0400 -@@ -855,7 +855,7 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-orig/src/rt/trace.d druntime/src/rt/trace.d +--- druntime-orig/src/rt/trace.d 2010-08-07 09:46:06.000000000 +0400 ++++ druntime/src/rt/trace.d 2010-10-01 21:01:58.444892002 +0400 +@@ -855,7 +855,7 @@ version (OSX) { // 16 byte align stack asm @@ -13686,7 +964,7 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. pushad ; sub ESP,12 ; } -@@ -870,7 +870,7 @@ +@@ -870,7 +870,7 @@ else { asm @@ -13695,219 +973,3 @@ diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch. pushad ; } trace_epi(); -diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/std/intrinsic.d druntime/src/std/intrinsic.d ---- druntime-old/src/std/intrinsic.d 1970-01-01 03:00:00.000000000 +0300 -+++ druntime/src/std/intrinsic.d 2010-10-03 20:07:21.183624002 +0400 -@@ -0,0 +1,212 @@ -+/* -+ * D phobos intrinsics for LDC -+ * -+ * From GDC ... public domain! -+ */ -+module std.intrinsic; -+ -+// Check for the right compiler -+version(LDC) -+{ -+ // OK -+} -+else -+{ -+ static assert(false, "This module is only valid for LDC"); -+} -+ -+/** -+ * Scans the bits in v starting with bit 0, looking -+ * for the first set bit. -+ * Returns: -+ * The bit number of the first bit set. -+ * The return value is undefined if v is zero. -+ */ -+nothrow int bsf(uint v) -+{ -+ uint m = 1; -+ uint i; -+ for (i = 0; i < 32; i++,m<<=1) { -+ if (v&m) -+ return i; -+ } -+ return i; // supposed to be undefined -+} -+ -+/** -+ * Scans the bits in v from the most significant bit -+ * to the least significant bit, looking -+ * for the first set bit. -+ * Returns: -+ * The bit number of the first bit set. -+ * The return value is undefined if v is zero. -+ * Example: -+ * --- -+ * import std.intrinsic; -+ * -+ * int main() -+ * { -+ * uint v; -+ * int x; -+ * -+ * v = 0x21; -+ * x = bsf(v); -+ * printf("bsf(x%x) = %d\n", v, x); -+ * x = bsr(v); -+ * printf("bsr(x%x) = %d\n", v, x); -+ * return 0; -+ * } -+ * --- -+ * Output: -+ * bsf(x21) = 0
-+ * bsr(x21) = 5 -+ */ -+nothrow int bsr(uint v) -+{ -+ uint m = 0x80000000; -+ uint i; -+ for (i = 32; i ; i--,m>>>=1) { -+ if (v&m) -+ return i-1; -+ } -+ return i; // supposed to be undefined -+} -+ -+ -+/** -+ * Tests the bit. -+ */ -+nothrow int bt(uint *p, uint bitnum) -+{ -+ return (p[bitnum / (uint.sizeof*8)] & (1<<(bitnum & ((uint.sizeof*8)-1)))) ? -1 : 0 ; -+} -+ -+ -+/** -+ * Tests and complements the bit. -+ */ -+nothrow int btc(uint *p, uint bitnum) -+{ -+ uint * q = p + (bitnum / (uint.sizeof*8)); -+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); -+ int result = *q & mask; -+ *q ^= mask; -+ return result ? -1 : 0; -+} -+ -+ -+/** -+ * Tests and resets (sets to 0) the bit. -+ */ -+nothrow int btr(uint *p, uint bitnum) -+{ -+ uint * q = p + (bitnum / (uint.sizeof*8)); -+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); -+ int result = *q & mask; -+ *q &= ~mask; -+ return result ? -1 : 0; -+} -+ -+ -+/** -+ * Tests and sets the bit. -+ * Params: -+ * p = a non-NULL pointer to an array of uints. -+ * index = a bit number, starting with bit 0 of p[0], -+ * and progressing. It addresses bits like the expression: -+--- -+p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) -+--- -+ * Returns: -+ * A non-zero value if the bit was set, and a zero -+ * if it was clear. -+ * -+ * Example: -+ * --- -+import std.intrinsic; -+ -+int main() -+{ -+ uint array[2]; -+ -+ array[0] = 2; -+ array[1] = 0x100; -+ -+ printf("btc(array, 35) = %d\n", btc(array, 35)); -+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -+ -+ printf("btc(array, 35) = %d\n", btc(array, 35)); -+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -+ -+ printf("bts(array, 35) = %d\n", bts(array, 35)); -+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -+ -+ printf("btr(array, 35) = %d\n", btr(array, 35)); -+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -+ -+ printf("bt(array, 1) = %d\n", bt(array, 1)); -+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); -+ -+ return 0; -+} -+ * --- -+ * Output: -+
-+btc(array, 35) = 0
-+array = [0]:x2, [1]:x108
-+btc(array, 35) = -1
-+array = [0]:x2, [1]:x100
-+bts(array, 35) = 0
-+array = [0]:x2, [1]:x108
-+btr(array, 35) = -1
-+array = [0]:x2, [1]:x100
-+bt(array, 1) = -1
-+array = [0]:x2, [1]:x100
-+
-+ */ -+nothrow int bts(uint *p, uint bitnum) -+{ -+ uint * q = p + (bitnum / (uint.sizeof*8)); -+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); -+ int result = *q & mask; -+ *q |= mask; -+ return result ? -1 : 0; -+} -+ -+/** -+ * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes -+ * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 -+ * becomes byte 0. -+ */ -+pragma(intrinsic, "llvm.bswap.i32") -+ uint bswap(uint val); -+ -+/** -+ * Reads I/O port at port_address. -+ */ -+ubyte inp(uint p) { throw new Exception("inp intrinsic not yet implemented"); } -+ -+/** -+ * ditto -+ */ -+ushort inpw(uint p) { throw new Exception("inpw intrinsic not yet implemented"); } -+ -+/** -+ * ditto -+ */ -+uint inpl(uint p) { throw new Exception("inpl intrinsic not yet implemented"); } -+ -+/** -+ * ditto -+ */ -+ubyte outp(uint p, ubyte v) { throw new Exception("outp intrinsic not yet implemented"); } -+ -+/** -+ * ditto -+ */ -+ushort outpw(uint p, ushort v) { throw new Exception("outpw intrinsic not yet implemented"); } -+ -+/** -+ * ditto -+ */ -+uint outpl(uint p, uint v) { throw new Exception("outpl intrinsic not yet implemented"); } diff --git a/phobos.patch b/phobos.patch new file mode 100644 index 00000000..a25ed120 --- /dev/null +++ b/phobos.patch @@ -0,0 +1,243 @@ +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/conv.d phobos/std/conv.d +--- phobos-orig/std/conv.d 2010-09-17 00:27:48.000000000 +0400 ++++ phobos/std/conv.d 2010-10-29 12:06:21.221035000 +0400 +@@ -1395,7 +1395,7 @@ + else // not hex + { + if (toupper(p.front) == 'N') +- { ++ { + // nan + enforce((p.popFront(), !p.empty && toupper(p.front) == 'A') + && (p.popFront(), !p.empty && toupper(p.front) == 'N'), +@@ -3191,6 +3191,11 @@ + T toImpl(T, S)(S d) if (is(Unqual!S == double) && isSomeString!(T)) + { + //alias Unqual!(ElementType!T) Char; ++ version(LDC) // FIXME: workarond for case when this function returns "-nan" ++ { ++ if (isnan(d)) ++ return "nan"; ++ } + char[20] buffer; + int len = sprintf(buffer.ptr, "%g", d); + return to!T(buffer[0 .. len].dup); +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/functional.d phobos/std/functional.d +--- phobos-orig/std/functional.d 2010-09-17 00:27:48.000000000 +0400 ++++ phobos/std/functional.d 2010-10-29 12:01:35.285035001 +0400 +@@ -713,6 +713,13 @@ + assert(dg_pure_nothrow() == 7); + //assert(dg_pure_nothrow_safe() == 8); + } ++ version (LDC) ++ { ++ // FIXME: ++ } ++ else ++ { ++ + /* test for linkage */ + { + struct S +@@ -724,4 +731,6 @@ + auto dg_xtrnD = toDelegate(&S.xtrnD); + static assert(! is(typeof(dg_xtrnC) == typeof(dg_xtrnD))); + } ++ ++ } + } +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/internal/math/biguintx86.d phobos/std/internal/math/biguintx86.d +--- phobos-orig/std/internal/math/biguintx86.d 2010-09-17 00:27:48.000000000 +0400 ++++ phobos/std/internal/math/biguintx86.d 2010-10-26 14:08:51.480925001 +0400 +@@ -733,7 +733,10 @@ + // EDI = dest + // ESI = src + +- enum string OP = (op=='+')? "add" : "sub"; ++ version(LDC) { ++ } else { ++ enum string OP = (op=='+')? "add" : "sub"; ++ } + version(D_PIC) { + enum { zero = 0 } + } else { +@@ -767,7 +770,10 @@ + jnz L_enter_odd; + } + // Main loop, with entry point for even length +-mixin(asmMulAdd_innerloop(OP, "ESP+LASTPARAM")); ++version(LDC) ++ mixin(asmMulAdd_innerloop((op=='+')? "add" : "sub", "ESP+LASTPARAM")); ++else ++ mixin(asmMulAdd_innerloop(OP, "ESP+LASTPARAM")); + asm { + mov EAX, EBP; // get final carry + pop EBP; +@@ -777,6 +783,9 @@ + ret 5*4; + } + L_enter_odd: ++version(LDC) ++ mixin(asmMulAdd_enter_odd((op=='+')? "add" : "sub", "ESP+LASTPARAM")); ++else + mixin(asmMulAdd_enter_odd(OP, "ESP+LASTPARAM")); + } + +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/math.d phobos/std/math.d +--- phobos-orig/std/math.d 2010-09-17 00:27:48.000000000 +0400 ++++ phobos/std/math.d 2010-10-29 12:08:18.925035001 +0400 +@@ -318,7 +318,10 @@ + * Results are undefined if |x| >= $(POWER 2,64). + */ + +-@safe pure nothrow real cos(real x); /* intrinsic */ ++version(LDC) ++ @safe pure nothrow real cos(real x) { return llvm_cos(x); } ++else ++ @safe pure nothrow real cos(real x); /* intrinsic */ + + /*********************************** + * Returns sine of x. x is in radians. +@@ -333,7 +336,10 @@ + * Results are undefined if |x| >= $(POWER 2,64). + */ + +-@safe pure nothrow real sin(real x); /* intrinsic */ ++version(LDC) ++ @safe pure nothrow real sin(real x) { return llvm_sin(x); } ++else ++ @safe pure nothrow real sin(real x); /* intrinsic */ + + + /*********************************** +@@ -831,6 +837,20 @@ + * ) + */ + ++version(LDC) ++{ ++ ++@safe pure nothrow ++{ ++ float sqrt(float x) { return llvm_sqrt(x); } ++ double sqrt(double x) { return llvm_sqrt(x); } ++ real sqrt(real x) { return llvm_sqrt(x); } ++} ++ ++} ++else ++{ ++ + @safe pure nothrow + { + float sqrt(float x); /* intrinsic */ +@@ -838,6 +858,8 @@ + real sqrt(real x); /* intrinsic */ /// ditto + } + ++} ++ + @trusted pure nothrow { // Should be @safe. See bugs 4628, 4630. + // Create explicit overloads for integer sqrts. No ddoc for these because + // hopefully a more elegant solution will eventually be found, so we don't +@@ -1413,9 +1435,22 @@ + * Compute n * 2$(SUP exp) + * References: frexp + */ ++version(LDC) ++{ ++ ++pure nothrow real ldexp(real n, int exp) ++{ ++ return core.stdc.math.ldexpl(n, exp); ++} ++ ++} ++else ++{ + + @safe pure nothrow real ldexp(real n, int exp); /* intrinsic */ + ++} ++ + unittest { + assert(ldexp(1, -16384) == 0x1p-16384L); + assert(ldexp(1, -16382) == 0x1p-16382L); +@@ -1608,7 +1643,31 @@ + * $(TR $(TD $(PLUSMN)$(INFIN)) $(TD +$(INFIN)) ) + * ) + */ +-@safe pure nothrow real fabs(real x); /* intrinsic */ ++version(LDC) { ++ version( FreeBSD ) ++ version (all) // < 8-CURRENT ++ private extern(C) real fabsl(real x) { return fabs(x); } ++ else ++ private extern(C) real fabsl(real x); ++ else ++ private extern(C) real fabsl(real x); ++ pure nothrow real fabs(real x) ++ { ++ version(D_InlineAsm_X86) ++ { ++ asm { ++ fld x; ++ fabs; ++ } ++ } ++ else ++ { ++ return fabsl(x); ++ } ++ } ++} else { ++ @safe pure nothrow real fabs(real x); /* intrinsic */ ++} + + + /*********************************************************************** +diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- phobos-orig/std/openrj.d phobos/std/openrj.d +--- phobos-orig/std/openrj.d 2009-09-03 12:01:40.000000000 +0400 ++++ phobos/std/openrj.d 2010-10-26 13:17:37.480925001 +0400 +@@ -620,11 +620,11 @@ + /** + * + */ +- int opApply(int delegate(inout Field field) dg) ++ int opApply(int delegate(ref Field field) dg) + { + int result = 0; + +- foreach (inout field; m_fields) ++ foreach (ref Field field; m_fields) + { + result = dg(field); + +@@ -1000,11 +1000,11 @@ + /** + * + */ +- int opApply(int delegate(inout Record record) dg) ++ int opApply(int delegate(ref Record record) dg) + { + int result = 0; + +- foreach(inout Record record; m_records) ++ foreach(ref Record record; m_records) + { + result = dg(record); + +@@ -1020,11 +1020,11 @@ + /** + * + */ +- int opApply(int delegate(inout Field field) dg) ++ int opApply(int delegate(ref Field field) dg) + { + int result = 0; + +- foreach(inout Field field; m_fields) ++ foreach(ref Field field; m_fields) + { + result = dg(field); + diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index f8b15d8d..686448fc 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -53,6 +53,9 @@ if(D_VERSION EQUAL 1) set(RUNTIME_INCLUDE ${RUNTIME_DC_DIR}) file(GLOB CORE_D ${RUNTIME_DIR}/lib/common/tango/core/*.d) file(GLOB CORE_C ${RUNTIME_DIR}/lib/common/tango/stdc/*.c) + file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d) + file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d) + file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c) elseif(D_VERSION EQUAL 2) set(RUNTIME_CC druntime-core) set(RUNTIME_GC druntime-gc-basic) @@ -63,6 +66,21 @@ elseif(D_VERSION EQUAL 2) set(RUNTIME_INCLUDE ${RUNTIME_DIR}/src) file(GLOB CORE_D ${RUNTIME_DIR}/src/core/*.d ) file(GLOB CORE_D_SYNC ${RUNTIME_DIR}/src/core/sync/*.d ) + file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d) + file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d) + list(REMOVE_ITEM DCRT_D + ${RUNTIME_DC_DIR}/arrayassign.d + ${RUNTIME_DC_DIR}/arraybyte.d + ${RUNTIME_DC_DIR}/arraycast.d + ${RUNTIME_DC_DIR}/arraycat.d + ${RUNTIME_DC_DIR}/arraydouble.d + ${RUNTIME_DC_DIR}/arrayfloat.d + ${RUNTIME_DC_DIR}/arrayreal.d + ${RUNTIME_DC_DIR}/arrayshort.d + ${RUNTIME_DC_DIR}/deh2.d + ) + file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c) + list(REMOVE_ITEM DCRT_C ${RUNTIME_DC_DIR}/deh.c) if(UNIX) file(GLOB CORE_D_SYS ${RUNTIME_DIR}/src/core/sys/posix/*.d) elseif(WIN32) @@ -140,10 +158,6 @@ if(D_VERSION EQUAL 2) endif(NOT PATCH_EXE) endif(D_VERSION EQUAL 2) -file(GLOB_RECURSE GC_D ${RUNTIME_GC_DIR}/*.d) -file(GLOB_RECURSE DCRT_D ${RUNTIME_DC_DIR}/*.d) -file(GLOB DCRT_C ${RUNTIME_DC_DIR}/*.c) - macro(dc INPUT_D OUTLIST_O OUTLIST_BC INCDIR MOREFLAGS PATH) if ("${PATH}" STREQUAL "") file(RELATIVE_PATH output ${RUNTIME_DIR} ${INPUT_D})