diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/cstdarg.di druntime/import/ldc/cstdarg.di --- druntime-old/import/ldc/cstdarg.di 1970-01-01 03:00:00.000000000 +0300 +++ druntime/import/ldc/cstdarg.di 2010-09-30 22:10:37.000000000 +0400 @@ -0,0 +1,29 @@ +/* + * vararg support for extern(C) functions + */ + +module ldc.cstdarg; + +// Check for the right compiler +version(LDC) +{ + // OK +} +else +{ + static assert(false, "This module is only valid for LDC"); +} + +alias void* va_list; + +pragma(va_start) + void va_start(T)(va_list ap, ref T); + +pragma(va_arg) + T va_arg(T)(va_list ap); + +pragma(va_end) + void va_end(va_list args); + +pragma(va_copy) + void va_copy(va_list dst, va_list src); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/intrinsics.di druntime/import/ldc/intrinsics.di --- druntime-old/import/ldc/intrinsics.di 1970-01-01 03:00:00.000000000 +0300 +++ druntime/import/ldc/intrinsics.di 2010-10-02 14:01:02.975890001 +0400 @@ -0,0 +1,413 @@ +/* + * This module holds declarations to LLVM intrinsics. + * + * See the LLVM language reference for more information: + * + * - http://llvm.org/docs/LangRef.html#intrinsics + * + */ + +module ldc.intrinsics; + +// Check for the right compiler +version(LDC) +{ + // OK +} +else +{ + static assert(false, "This module is only valid for LDC"); +} + +// +// CODE GENERATOR INTRINSICS +// + + +// The 'llvm.returnaddress' intrinsic attempts to compute a target-specific +// value indicating the return address of the current function or one of its +// callers. + +pragma(intrinsic, "llvm.returnaddress") + void* llvm_returnaddress(uint level); + + +// The 'llvm.frameaddress' intrinsic attempts to return the target-specific +// frame pointer value for the specified stack frame. + +pragma(intrinsic, "llvm.frameaddress") + void* llvm_frameaddress(uint level); + + +// The 'llvm.stacksave' intrinsic is used to remember the current state of the +// function stack, for use with llvm.stackrestore. This is useful for +// implementing language features like scoped automatic variable sized arrays +// in C99. + +pragma(intrinsic, "llvm.stacksave") + void* llvm_stacksave(); + + +// The 'llvm.stackrestore' intrinsic is used to restore the state of the +// function stack to the state it was in when the corresponding llvm.stacksave +// intrinsic executed. This is useful for implementing language features like +// scoped automatic variable sized arrays in C99. + +pragma(intrinsic, "llvm.stackrestore") + void llvm_stackrestore(void* ptr); + + +// The 'llvm.prefetch' intrinsic is a hint to the code generator to insert a +// prefetch instruction if supported; otherwise, it is a noop. Prefetches have +// no effect on the behavior of the program but can change its performance +// characteristics. + +pragma(intrinsic, "llvm.prefetch") + void llvm_prefetch(void* ptr, uint rw, uint locality); + + +// The 'llvm.pcmarker' intrinsic is a method to export a Program Counter (PC) +// in a region of code to simulators and other tools. 
The method is target +// specific, but it is expected that the marker will use exported symbols to +// transmit the PC of the marker. The marker makes no guarantees that it will +// remain with any specific instruction after optimizations. It is possible +// that the presence of a marker will inhibit optimizations. The intended use +// is to be inserted after optimizations to allow correlations of simulation +// runs. + +pragma(intrinsic, "llvm.pcmarker") + void llvm_pcmarker(uint id); + + +// The 'llvm.readcyclecounter' intrinsic provides access to the cycle counter +// register (or similar low latency, high accuracy clocks) on those targets that +// support it. On X86, it should map to RDTSC. On Alpha, it should map to RPCC. +// As the backing counters overflow quickly (on the order of 9 seconds on +// alpha), this should only be used for small timings. + +pragma(intrinsic, "llvm.readcyclecounter") + ulong readcyclecounter(); + + + + +// +// STANDARD C LIBRARY INTRINSICS +// + + +// The 'llvm.memcpy.*' intrinsics copy a block of memory from the source +// location to the destination location. +// Note that, unlike the standard libc function, the llvm.memcpy.* intrinsics do +// not return a value, and takes an extra alignment argument. + +pragma(intrinsic, "llvm.memcpy.i#") + void llvm_memcpy(T)(void* dst, void* src, T len, uint alignment); + +deprecated { + alias llvm_memcpy!(uint) llvm_memcpy_i32; + alias llvm_memcpy!(ulong) llvm_memcpy_i64; +} + + +// The 'llvm.memmove.*' intrinsics move a block of memory from the source +// location to the destination location. It is similar to the 'llvm.memcpy' +// intrinsic but allows the two memory locations to overlap. +// Note that, unlike the standard libc function, the llvm.memmove.* intrinsics +// do not return a value, and takes an extra alignment argument. + +pragma(intrinsic, "llvm.memmove.i#") + void llvm_memmove(T)(void* dst, void* src, T len, uint alignment); + +deprecated { + alias llvm_memmove!(uint) llvm_memmove_i32; + alias llvm_memmove!(ulong) llvm_memmove_i64; +} + + +// The 'llvm.memset.*' intrinsics fill a block of memory with a particular byte +// value. +// Note that, unlike the standard libc function, the llvm.memset intrinsic does +// not return a value, and takes an extra alignment argument. + +pragma(intrinsic, "llvm.memset.i#") + void llvm_memset(T)(void* dst, ubyte val, T len, uint alignment); + +deprecated { + alias llvm_memset!(uint) llvm_memset_i32; + alias llvm_memset!(ulong) llvm_memset_i64; +} + + +// The 'llvm.sqrt' intrinsics return the sqrt of the specified operand, +// returning the same value as the libm 'sqrt' functions would. Unlike sqrt in +// libm, however, llvm.sqrt has undefined behavior for negative numbers other +// than -0.0 (which allows for better optimization, because there is no need to +// worry about errno being set). llvm.sqrt(-0.0) is defined to return -0.0 like +// IEEE sqrt. + +pragma(intrinsic, "llvm.sqrt.f#") + T llvm_sqrt(T)(T val); + +deprecated { + alias llvm_sqrt!(float) llvm_sqrt_f32; + alias llvm_sqrt!(double) llvm_sqrt_f64; + alias llvm_sqrt!(real) llvm_sqrt_f80; // may not actually be .f80 +} + + +// The 'llvm.sin.*' intrinsics return the sine of the operand. + +pragma(intrinsic, "llvm.sin.f#") + T llvm_sin(T)(T val); + +deprecated { + alias llvm_sin!(float) llvm_sin_f32; + alias llvm_sin!(double) llvm_sin_f64; + alias llvm_sin!(real) llvm_sin_f80; // may not actually be .f80 +} + + +// The 'llvm.cos.*' intrinsics return the cosine of the operand. 
+ +pragma(intrinsic, "llvm.cos.f#") + T llvm_cos(T)(T val); + +deprecated { + alias llvm_cos!(float) llvm_cos_f32; + alias llvm_cos!(double) llvm_cos_f64; + alias llvm_cos!(real) llvm_cos_f80; // may not actually be .f80 +} + + +// The 'llvm.powi.*' intrinsics return the first operand raised to the specified +// (positive or negative) power. The order of evaluation of multiplications is +// not defined. When a vector of floating point type is used, the second +// argument remains a scalar integer value. + +pragma(intrinsic, "llvm.powi.f#") + T llvm_powi(T)(T val, int power); + +deprecated { + alias llvm_powi!(float) llvm_powi_f32; + alias llvm_powi!(double) llvm_powi_f64; + alias llvm_powi!(real) llvm_powi_f80; // may not actually be .f80 +} + + +// The 'llvm.pow.*' intrinsics return the first operand raised to the specified +// (positive or negative) power. + +pragma(intrinsic, "llvm.pow.f#") + T llvm_pow(T)(T val, T power); + +deprecated { + alias llvm_pow!(float) llvm_pow_f32; + alias llvm_pow!(double) llvm_pow_f64; + alias llvm_pow!(real) llvm_pow_f80; // may not actually be .f80 +} + + +// +// BIT MANIPULATION INTRINSICS +// + +// The 'llvm.bswap' family of intrinsics is used to byte swap integer values +// with an even number of bytes (positive multiple of 16 bits). These are +// useful for performing operations on data that is not in the target's native +// byte order. + +pragma(intrinsic, "llvm.bswap.i#.i#") + T llvm_bswap(T)(T val); + +deprecated { + alias llvm_bswap!(ushort) llvm_bswap_i16; + alias llvm_bswap!(uint) llvm_bswap_i32; + alias llvm_bswap!(ulong) llvm_bswap_i64; +} + + +// The 'llvm.ctpop' family of intrinsics counts the number of bits set in a +// value. + +pragma(intrinsic, "llvm.ctpop.i#") + T llvm_ctpop(T)(T src); + +deprecated { + alias llvm_ctpop!(ubyte) llvm_ctpop_i8; + alias llvm_ctpop!(ushort) llvm_ctpop_i16; + alias llvm_ctpop!(uint) llvm_ctpop_i32; + alias llvm_ctpop!(ulong) llvm_ctpop_i64; +} + + +// The 'llvm.ctlz' family of intrinsic functions counts the number of leading +// zeros in a variable. + +pragma(intrinsic, "llvm.ctlz.i#") + T llvm_ctlz(T)(T src); + +deprecated { + alias llvm_ctlz!(ubyte) llvm_ctlz_i8; + alias llvm_ctlz!(ushort) llvm_ctlz_i16; + alias llvm_ctlz!(uint) llvm_ctlz_i32; + alias llvm_ctlz!(ulong) llvm_ctlz_i64; +} + + +// The 'llvm.cttz' family of intrinsic functions counts the number of trailing +// zeros. + +pragma(intrinsic, "llvm.cttz.i#") + T llvm_cttz(T)(T src); + +deprecated { + alias llvm_cttz!(ubyte) llvm_cttz_i8; + alias llvm_cttz!(ushort) llvm_cttz_i16; + alias llvm_cttz!(uint) llvm_cttz_i32; + alias llvm_cttz!(ulong) llvm_cttz_i64; +} + + +// The 'llvm.part.select' family of intrinsic functions selects a range of bits +// from an integer value and returns them in the same bit width as the original +// value. + +pragma(intrinsic, "llvm.part.select.i#") + T llvm_part_select(T)(T val, uint loBit, uint hiBit); + +deprecated { + alias llvm_part_select!(ubyte) llvm_part_select_i; + alias llvm_part_select!(ushort) llvm_part_select_i; + alias llvm_part_select!(uint) llvm_part_select_i; + alias llvm_part_select!(ulong) llvm_part_select_i; +} + + +// The 'llvm.part.set' family of intrinsic functions replaces a range of bits +// in an integer value with another integer value. It returns the integer with +// the replaced bits. 
+ +// TODO +// declare i17 @llvm.part.set.i17.i9 (i17 %val, i9 %repl, i32 %lo, i32 %hi) +// declare i29 @llvm.part.set.i29.i9 (i29 %val, i9 %repl, i32 %lo, i32 %hi) + + + + +// +// ATOMIC OPERATIONS AND SYNCHRONIZATION INTRINSICS +// + +// The llvm.memory.barrier intrinsic guarantees ordering between specific +// pairs of memory access types. + +pragma(intrinsic, "llvm.memory.barrier") + void llvm_memory_barrier(bool ll, bool ls, bool sl, bool ss, bool device); + +// This loads a value in memory and compares it to a given value. If they are +// equal, it stores a new value into the memory. + +pragma(intrinsic, "llvm.atomic.cmp.swap.i#.p0i#") + T llvm_atomic_cmp_swap(T)(shared T* ptr, T cmp, T val); + +// This intrinsic loads the value stored in memory at ptr and yields the value +// from memory. It then stores the value in val in the memory at ptr. + +pragma(intrinsic, "llvm.atomic.swap.i#.p0i#") + T llvm_atomic_swap(T)(T* ptr, T val); + +// This intrinsic adds delta to the value stored in memory at ptr. It yields +// the original value at ptr. + +pragma(intrinsic, "llvm.atomic.load.add.i#.p0i#") + T llvm_atomic_load_add(T)(shared const T* ptr, T val); + +// This intrinsic subtracts delta to the value stored in memory at ptr. It +// yields the original value at ptr. + +pragma(intrinsic, "llvm.atomic.load.sub.i#.p0i#") + T llvm_atomic_load_sub(T)(T* ptr, T val); + +// These intrinsics bitwise the operation (and, nand, or, xor) delta to the +// value stored in memory at ptr. It yields the original value at ptr. + +pragma(intrinsic, "llvm.atomic.load.and.i#.p0i#") + T llvm_atomic_load_and(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.nand.i#.p0i#") + T llvm_atomic_load_nand(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.or.i#.p0i#") + T llvm_atomic_load_or(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.xor.i#.p0i#") + T llvm_atomic_load_xor(T)(T* ptr, T val); + +// These intrinsics takes the signed or unsigned minimum or maximum of delta +// and the value stored in memory at ptr. It yields the original value at ptr. + +pragma(intrinsic, "llvm.atomic.load.max.i#.p0i#") + T llvm_atomic_load_max(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.min.i#.p0i#") + T llvm_atomic_load_min(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.umax.i#.p0i#") + T llvm_atomic_load_umax(T)(T* ptr, T val); + +pragma(intrinsic, "llvm.atomic.load.umin.i#.p0i#") + T llvm_atomic_load_umin(T)(T* ptr, T val); + + +// +// ARITHMETIC-WITH-OVERFLOW INTRINSICS +// + +struct OverflowRet(T) { + static assert(is(T : int), T.stringof ~ " is not an integer type!"); + T result; + bool overflow; +} + +// Signed and unsigned addition +pragma(intrinsic, "llvm.sadd.with.overflow.i#") + OverflowRet!(T) llvm_sadd_with_overflow(T)(T lhs, T rhs); + +pragma(intrinsic, "llvm.uadd.with.overflow.i#") + OverflowRet!(T) llvm_uadd_with_overflow(T)(T lhs, T rhs); + + +// Signed and unsigned subtraction +pragma(intrinsic, "llvm.ssub.with.overflow.i#") + OverflowRet!(T) llvm_ssub_with_overflow(T)(T lhs, T rhs); + +pragma(intrinsic, "llvm.usub.with.overflow.i#") + OverflowRet!(T) llvm_usub_with_overflow(T)(T lhs, T rhs); + + +// Signed and unsigned multiplication +pragma(intrinsic, "llvm.smul.with.overflow.i#") + OverflowRet!(T) llvm_smul_with_overflow(T)(T lhs, T rhs); + +/* Note: LLVM documentations says: + * Warning: 'llvm.umul.with.overflow' is badly broken. + * It is actively being fixed, but it should not currently be used! 
+ * + * See: http://llvm.org/docs/LangRef.html#int_umul_overflow + */ +//pragma(intrinsic, "llvm.umul.with.overflow.i#") +// OverflowRet!(T) llvm_umul_with_overflow(T)(T lhs, T rhs); + + +// +// GENERAL INTRINSICS +// + + +// This intrinsics is lowered to the target dependent trap instruction. If the +// target does not have a trap instruction, this intrinsic will be lowered to +// the call of the abort() function. + +pragma(intrinsic, "llvm.trap") + void llvm_trap(); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/llvmasm.di druntime/import/ldc/llvmasm.di --- druntime-old/import/ldc/llvmasm.di 1970-01-01 03:00:00.000000000 +0300 +++ druntime/import/ldc/llvmasm.di 2010-09-30 22:10:37.000000000 +0400 @@ -0,0 +1,17 @@ +module ldc.llvmasm; + +struct __asmtuple_t(T...) +{ + T v; +} + +pragma(llvm_inline_asm) +{ + void __asm( )(char[] asmcode, char[] constraints, ...); + T __asm(T)(char[] asmcode, char[] constraints, ...); + + template __asmtuple(T...) + { + __asmtuple_t!(T) __asmtuple(char[] asmcode, char[] constraints, ...); + } +} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/vararg.d druntime/import/ldc/vararg.d --- druntime-old/import/ldc/vararg.d 1970-01-01 03:00:00.000000000 +0300 +++ druntime/import/ldc/vararg.d 2010-09-30 22:10:37.000000000 +0400 @@ -0,0 +1,43 @@ +/* + * This module holds the implementation of special vararg templates for D style var args. + * + * Provides the functions tango.core.Vararg expects to be present! + */ + +module ldc.Vararg; + +// Check for the right compiler +version(LDC) +{ + // OK +} +else +{ + static assert(false, "This module is only valid for LDC"); +} + +alias void* va_list; + +void va_start(T) ( out va_list ap, inout T parmn ) +{ + // not needed ! +} + +T va_arg(T)(ref va_list vp) +{ + T* arg = cast(T*) vp; + // ldc always aligns to size_t.sizeof in vararg lists + vp = cast(va_list) ( cast(void*) vp + ( ( T.sizeof + size_t.sizeof - 1 ) & ~( size_t.sizeof - 1 ) ) ); + return *arg; +} + +void va_end( va_list ap ) +{ + // not needed ! +} + +void va_copy( out va_list dst, va_list src ) +{ + // seems pretty useless ! 
+ dst = src; +} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/object.di druntime/import/object.di --- druntime-old/import/object.di 2010-09-03 12:28:52.000000000 +0400 +++ druntime/import/object.di 2010-10-05 12:47:24.873150000 +0400 @@ -130,7 +130,7 @@ Interface[] interfaces; TypeInfo_Class base; void* destructor; - void(*classInvariant)(Object); + void function(Object) classInvariant; uint m_flags; // 1: // is IUnknown or is derived from IUnknown // 2: // has no possible pointers into GC memory @@ -140,7 +140,7 @@ // 32: // has typeinfo member void* deallocator; OffsetTypeInfo[] m_offTi; - void* defaultConstructor; + void function(Object) defaultConstructor; // default Constructor const(MemberInfo[]) function(string) xgetMembers; static TypeInfo_Class find(in char[] classname); @@ -179,7 +179,7 @@ class TypeInfo_Const : TypeInfo { - TypeInfo next; + TypeInfo base; } class TypeInfo_Invariant : TypeInfo_Const @@ -288,7 +288,6 @@ interface TraceInfo { int opApply(scope int delegate(ref char[])); - string toString(); } string msg; diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/std/intrinsic.di druntime/import/std/intrinsic.di --- druntime-old/import/std/intrinsic.di 2010-08-05 05:39:08.000000000 +0400 +++ druntime/import/std/intrinsic.di 1970-01-01 03:00:00.000000000 +0300 @@ -1,176 +0,0 @@ -/** - * These functions are built-in intrinsics to the compiler. - * - * Intrinsic functions are functions built in to the compiler, usually to take - * advantage of specific CPU features that are inefficient to handle via - * external functions. The compiler's optimizer and code generator are fully - * integrated in with intrinsic functions, bringing to bear their full power on - * them. This can result in some surprising speedups. - * - * Copyright: Public Domain - * License: Public Domain - * Authors: Walter Bright - */ -module std.intrinsic; - - -/** - * Scans the bits in v starting with bit 0, looking - * for the first set bit. - * Returns: - * The bit number of the first bit set. - * The return value is undefined if v is zero. - */ -pure nothrow int bsf( uint v ); - - -/** - * Scans the bits in v from the most significant bit - * to the least significant bit, looking - * for the first set bit. - * Returns: - * The bit number of the first bit set. - * The return value is undefined if v is zero. - * Example: - * --- - * import std.intrinsic; - * - * int main() - * { - * uint v; - * int x; - * - * v = 0x21; - * x = bsf(v); - * printf("bsf(x%x) = %d\n", v, x); - * x = bsr(v); - * printf("bsr(x%x) = %d\n", v, x); - * return 0; - * } - * --- - * Output: - * bsf(x21) = 0
- * bsr(x21) = 5 - */ -pure nothrow int bsr( uint v ); - - -/** - * Tests the bit. - */ -pure nothrow int bt( in uint* p, uint bitnum ); - - -/** - * Tests and complements the bit. - */ -nothrow int btc( uint* p, uint bitnum ); - - -/** - * Tests and resets (sets to 0) the bit. - */ -nothrow int btr( uint* p, uint bitnum ); - - -/** - * Tests and sets the bit. - * Params: - * p = a non-NULL pointer to an array of uints. - * index = a bit number, starting with bit 0 of p[0], - * and progressing. It addresses bits like the expression: ---- -p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) ---- - * Returns: - * A non-zero value if the bit was set, and a zero - * if it was clear. - * - * Example: - * --- -import std.intrinsic; - -int main() -{ - uint array[2]; - - array[0] = 2; - array[1] = 0x100; - - printf("btc(array, 35) = %d\n", btc(array, 35)); - printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); - - printf("btc(array, 35) = %d\n", btc(array, 35)); - printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); - - printf("bts(array, 35) = %d\n", bts(array, 35)); - printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); - - printf("btr(array, 35) = %d\n", btr(array, 35)); - printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); - - printf("bt(array, 1) = %d\n", bt(array, 1)); - printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); - - return 0; -} - * --- - * Output: -
-btc(array, 35) = 0
-array = [0]:x2, [1]:x108
-btc(array, 35) = -1
-array = [0]:x2, [1]:x100
-bts(array, 35) = 0
-array = [0]:x2, [1]:x108
-btr(array, 35) = -1
-array = [0]:x2, [1]:x100
-bt(array, 1) = -1
-array = [0]:x2, [1]:x100
-
- */ -nothrow int bts( uint* p, uint bitnum ); - - -/** - * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes - * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 - * becomes byte 0. - */ -pure nothrow uint bswap( uint v ); - - -/** - * Reads I/O port at port_address. - */ -nothrow ubyte inp( uint port_address ); - - -/** - * ditto - */ -nothrow ushort inpw( uint port_address ); - - -/** - * ditto - */ -nothrow uint inpl( uint port_address ); - - -/** - * Writes and returns value to I/O port at port_address. - */ -nothrow ubyte outp( uint port_address, ubyte value ); - - -/** - * ditto - */ -nothrow ushort outpw( uint port_address, ushort value ); - - -/** - * ditto - */ -nothrow uint outpl( uint port_address, uint value ); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/core/atomic.d druntime/src/core/atomic.d --- druntime-old/src/core/atomic.d 2010-09-03 12:28:52.000000000 +0400 +++ druntime/src/core/atomic.d 2010-10-05 15:55:10.893150001 +0400 @@ -89,6 +89,117 @@ return false; } } + +//////////////////////////////////////////////////////////////////////////////// +// LDC Atomics Implementation +//////////////////////////////////////////////////////////////////////////////// + +else version( LDC ) +{ + import ldc.intrinsics; + + T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) + if( is( NakedType!(V1) == NakedType!(T) ) ) + { + // binary operators + // + // + - * / % ^^ & + // | ^ << >> >>> ~ in + // == != < <= > >= + static if( op == "+" || op == "-" || op == "*" || op == "/" || + op == "%" || op == "^^" || op == "&" || op == "|" || + op == "^" || op == "<<" || op == ">>" || op == ">>>" || + op == "~" || // skip "in" + op == "==" || op == "!=" || op == "<" || op == "<=" || + op == ">" || op == ">=" ) + { + T get = val; // compiler can do atomic load + mixin( "return get " ~ op ~ " mod;" ); + } + else + // assignment operators + // + // += -= *= /= %= ^^= &= + // |= ^= <<= >>= >>>= ~= + static if( op == "+=" || op == "-=" || op == "*=" || op == "/=" || + op == "%=" || op == "^^=" || op == "&=" || op == "|=" || + op == "^=" || op == "<<=" || op == ">>=" || op == ">>>=" ) // skip "~=" + { + T get, set; + + do + { + get = set = atomicLoad!(msync.raw)( val ); + mixin( "set " ~ op ~ " mod;" ); + } while( !cas( &val, get, set ) ); + return set; + } + else + { + static assert( false, "Operation not supported." 
); + } + } + + bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, const V2 writeThis ) + if( is( NakedType!(V1) == NakedType!(T) ) && + is( NakedType!(V2) == NakedType!(T) ) ) + + { + T oldval = void; + static if (is(T P == U*, U)) + { + oldval = cast(T)llvm_atomic_cmp_swap!(size_t)(cast(shared size_t*)&writeThis, cast(size_t)ifThis, cast(size_t)here); + } + else static if (is(T == bool)) + { + oldval = llvm_atomic_cmp_swap!(ubyte)(cast(shared ubyte*)&writeThis, ifThis?1:0, here?1:0)?0:1; + } + else + { + oldval = llvm_atomic_cmp_swap!(T)(here, ifThis, writeThis); + } + return oldval == ifThis; + } + + + private + { + enum msync + { + raw, /// not sequenced + acq, /// hoist-load + hoist-store barrier + rel, /// sink-load + sink-store barrier + seq, /// fully sequenced (acq + rel) + } + + T atomicLoad(msync ms = msync.seq, T)( const ref shared T val ) + { + llvm_memory_barrier( + ms == msync.acq || ms == msync.seq, + ms == msync.acq || ms == msync.seq, + ms == msync.rel || ms == msync.seq, + ms == msync.rel || ms == msync.seq, + false); + static if (is(T P == U*, U)) // pointer + { + return cast(T)llvm_atomic_load_add!(size_t)(cast(size_t*)&val, 0); + } + else static if (is(T == bool)) + { + return llvm_atomic_load_add!(ubyte)(cast(ubyte*)&val, cast(ubyte)0) ? 1 : 0; + } + else + { + return llvm_atomic_load_add!(T)(&val, cast(T)0); + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// x86_32 Atomic Function Implementation +//////////////////////////////////////////////////////////////////////////////// + else version( AsmX86_32 ) { T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) @@ -396,6 +507,12 @@ } } } + + +//////////////////////////////////////////////////////////////////////////////// +// x86_64 Atomic Function Implementation +//////////////////////////////////////////////////////////////////////////////// + else version( AsmX86_64 ) { T atomicOp(string op, T, V1)( ref shared T val, V1 mod ) diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gc.d druntime/src/gc/gc.d --- druntime-old/src/gc/gc.d 2010-08-05 05:39:08.000000000 +0400 +++ druntime/src/gc/gc.d 2010-10-04 16:54:06.837685001 +0400 @@ -100,7 +100,7 @@ version (GCCLASS) { void* p; ClassInfo ci = GC.classinfo; - + p = malloc(ci.init.length); (cast(byte*)p)[0 .. 
ci.init.length] = ci.init[]; _gc = cast(GC)p; diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcbits.d druntime/src/gc/gcbits.d --- druntime-old/src/gc/gcbits.d 2010-08-08 04:10:24.000000000 +0400 +++ druntime/src/gc/gcbits.d 2010-10-01 20:49:51.268892001 +0400 @@ -26,6 +26,10 @@ { version = bitops; } +else version (LDC) +{ + version = bitops; +} else version (GNU) { // use the unoptimized version diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcx.d druntime/src/gc/gcx.d --- druntime-old/src/gc/gcx.d 2010-08-27 01:23:26.000000000 +0400 +++ druntime/src/gc/gcx.d 2010-10-07 22:27:41.879253001 +0400 @@ -1464,7 +1464,8 @@ void initialize() - { int dummy; + { + int dummy; (cast(byte*)&this)[0 .. Gcx.sizeof] = 0; stackBottom = cast(char*)&dummy; @@ -2200,7 +2201,7 @@ if ((cast(size_t)p & ~(PAGESIZE-1)) == pcache) continue; - auto pool = findPool(p); + auto pool = findPool(p); if (pool) { size_t offset = cast(size_t)(p - pool.baseAddr); @@ -2270,80 +2271,129 @@ __builtin_unwind_init(); sp = & sp; } + else version(LDC) + { + version(X86) + { + uint eax,ecx,edx,ebx,ebp,esi,edi; + asm + { + mov eax[EBP], EAX ; + mov ecx[EBP], ECX ; + mov edx[EBP], EDX ; + mov ebx[EBP], EBX ; + mov ebp[EBP], EBP ; + mov esi[EBP], ESI ; + mov edi[EBP], EDI ; + mov sp[EBP], ESP ; + } + } + else version (X86_64) + { + ulong rax,rbx,rcx,rdx,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15; + asm + { + movq rax[RBP], RAX ; + movq rbx[RBP], RBX ; + movq rcx[RBP], RCX ; + movq rdx[RBP], RDX ; + movq rbp[RBP], RBP ; + movq rsi[RBP], RSI ; + movq rdi[RBP], RDI ; + movq r8 [RBP], R8 ; + movq r9 [RBP], R9 ; + movq r10[RBP], R10 ; + movq r11[RBP], R11 ; + movq r12[RBP], R12 ; + movq r13[RBP], R13 ; + movq r14[RBP], R14 ; + movq r15[RBP], R15 ; + movq sp[RBP], RSP ; + } + } + else + { + static assert( false, "Architecture not supported." ); + } + } else version( D_InlineAsm_X86 ) { - asm - { - pushad ; - mov sp[EBP],ESP ; - } + asm + { + pushad ; + mov sp[EBP],ESP ; + } + } + else version ( D_InlineAsm_X86_64 ) + { + asm + { + push RAX ; + push RBX ; + push RCX ; + push RDX ; + push RSI ; + push RDI ; + push RBP ; + push R8 ; + push R9 ; + push R10 ; + push R11 ; + push R12 ; + push R13 ; + push R14 ; + push R15 ; + push EAX ; // 16 byte align the stack + } + } + else + { + static assert( false, "Architecture not supported." ); } - else version ( D_InlineAsm_X86_64 ) - { - asm - { - push RAX ; - push RBX ; - push RCX ; - push RDX ; - push RSI ; - push RDI ; - push RBP ; - push R8 ; - push R9 ; - push R10 ; - push R11 ; - push R12 ; - push R13 ; - push R14 ; - push R15 ; - push EAX ; // 16 byte align the stack - } - } - else - { - static assert( false, "Architecture not supported." 
); - } result = fullcollect(sp); - version( GNU ) - { - // registers will be popped automatically - } - else version( D_InlineAsm_X86 ) - { - asm - { - popad; - } - } - else version ( D_InlineAsm_X86_64 ) - { - asm - { - pop EAX ; // 16 byte align the stack - pop R15 ; - pop R14 ; - pop R13 ; - pop R12 ; - pop R11 ; - pop R10 ; - pop R9 ; - pop R8 ; - pop RBP ; - pop RDI ; - pop RSI ; - pop RDX ; - pop RCX ; - pop RBX ; - pop RAX ; - } - } - else - { - static assert( false, "Architecture not supported." ); - } + version( GNU ) + { + // registers will be popped automatically + } + else version(LDC) + { + // nothing to do + } + else version( D_InlineAsm_X86 ) + { + asm + { + popad; + } + } + else version ( D_InlineAsm_X86_64 ) + { + asm + { + pop EAX ; // 16 byte align the stack + pop R15 ; + pop R14 ; + pop R13 ; + pop R12 ; + pop R11 ; + pop R10 ; + pop R9 ; + pop R8 ; + pop RBP ; + pop RDI ; + pop RSI ; + pop RDX ; + pop RCX ; + pop RBX ; + pop RAX ; + } + } + else + { + static assert( false, "Architecture not supported." ); + } return result; } @@ -2357,7 +2407,7 @@ Pool* pool; debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n"); - //printf("\tpool address range = %p .. %p\n", minAddr, maxAddr); + //printf("\tpool address range = %p .. %p\n", minAddr, maxAddr); thread_suspendAll(); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/object_.d druntime/src/object_.d --- druntime-old/src/object_.d 2010-09-03 12:28:52.000000000 +0400 +++ druntime/src/object_.d 2010-10-05 14:50:34.733150002 +0400 @@ -1073,7 +1073,7 @@ abstract class MemberInfo { - string name(); + string name() { return ""; }; // LDC: FIXME: } class MemberInfo_field : MemberInfo @@ -1663,7 +1663,6 @@ { int len = 0; ModuleReference *mr; - for (mr = _Dmodule_ref; mr; mr = mr.next) len++; _moduleinfo_array = new ModuleInfo*[len]; @@ -1802,7 +1801,10 @@ { debug(PRINTF) printf("_moduleTlsCtor()\n"); - void* p = alloca(_moduleinfo_array.length * ubyte.sizeof); + version( DMD ) + void* p = alloca(_moduleinfo_array.length * ubyte.sizeof); + else + void* p = malloc(_moduleinfo_array.length * ubyte.sizeof); auto flags = cast(ubyte[])p[0 .. _moduleinfo_array.length]; flags[] = 0; @@ -2025,7 +2027,6 @@ _d_monitor_create(h); m = getMonitor(h); } - IMonitor i = m.impl; if (i is null) @@ -2124,7 +2125,7 @@ size_t _aaLen(void* p); void* _aaGet(void** pp, TypeInfo keyti, size_t valuesize, ...); void* _aaGetRvalue(void* p, TypeInfo keyti, size_t valuesize, ...); - void* _aaIn(void* p, TypeInfo keyti); + void* _aaIn(void* p, TypeInfo keyti, ...); void _aaDel(void* p, TypeInfo keyti, ...); void[] _aaValues(void* p, size_t keysize, size_t valuesize); void[] _aaKeys(void* p, size_t keysize, size_t valuesize); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/adi.d druntime/src/rt/adi.d --- druntime-old/src/rt/adi.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/adi.d 2010-10-07 14:32:52.911253001 +0400 @@ -35,6 +35,14 @@ extern (C) void gc_free( void* p ); } +version (DMD) +{ + version (X86) + { + version = DMD_X86; + } +} + struct Array { @@ -48,7 +56,7 @@ * reversed. 
*/ -extern (C) long _adReverseChar(char[] a) +extern (C) char[] _adReverseChar(char[] a) { if (a.length > 1) { @@ -108,7 +116,7 @@ hi = hi - 1 + (stridehi - stridelo); } } - return *cast(long*)(&a); + return a; } unittest @@ -143,7 +151,7 @@ * reversed. */ -extern (C) long _adReverseWchar(wchar[] a) +extern (C) wchar[] _adReverseWchar(wchar[] a) { if (a.length > 1) { @@ -201,7 +209,7 @@ hi = hi - 1 + (stridehi - stridelo); } } - return *cast(long*)(&a); + return a; } unittest @@ -225,10 +233,10 @@ * Support for array.reverse property. */ -extern (C) long _adReverse(Array a, size_t szelem) +extern (C) void[] _adReverse(void[] a, size_t szelem) out (result) { - assert(result is *cast(long*)(&a)); + assert(result.ptr is a.ptr); } body { @@ -243,10 +251,10 @@ tmp = buffer.ptr; if (szelem > 16) { - //version (Windows) + version (Windows) tmp = cast(byte*) alloca(szelem); - //else - //tmp = gc_malloc(szelem); + else + tmp = cast(byte*) gc_malloc(szelem); } for (; lo < hi; lo += szelem, hi -= szelem) @@ -267,7 +275,7 @@ //gc_free(tmp); } } - return *cast(long*)(&a); + return a; } unittest @@ -311,7 +319,7 @@ * Sort array of chars. */ -extern (C) long _adSortChar(char[] a) +extern (C) char[] _adSortChar(char[] a) { if (a.length > 1) { @@ -326,14 +334,14 @@ } delete da; } - return *cast(long*)(&a); + return a; } /********************************************** * Sort array of wchars. */ -extern (C) long _adSortWchar(wchar[] a) +extern (C) wchar[] _adSortWchar(wchar[] a) { if (a.length > 1) { @@ -348,7 +356,7 @@ } delete da; } - return *cast(long*)(&a); + return a; } /*************************************** @@ -358,7 +366,7 @@ * 0 not equal */ -extern (C) int _adEq(Array a1, Array a2, TypeInfo ti) +extern (C) int _adEq(void[] a1, void[] a2, TypeInfo ti) { debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); if (a1.length != a2.length) @@ -379,7 +387,7 @@ return 1; // equal } -extern (C) int _adEq2(Array a1, Array a2, TypeInfo ti) +extern (C) int _adEq2(void[] a1, void[] a2, TypeInfo ti) { debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); if (a1.length != a2.length) @@ -405,7 +413,7 @@ * Support for array compare test. */ -extern (C) int _adCmp(Array a1, Array a2, TypeInfo ti) +extern (C) int _adCmp(void[] a1, void[] a2, TypeInfo ti) { debug(adi) printf("adCmp()\n"); auto len = a1.length; @@ -435,7 +443,7 @@ return (a1.length > a2.length) ? 1 : -1; } -extern (C) int _adCmp2(Array a1, Array a2, TypeInfo ti) +extern (C) int _adCmp2(void[] a1, void[] a2, TypeInfo ti) { debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length); return ti.compare(&a1, &a2); @@ -461,9 +469,9 @@ * Support for array compare test. 
*/ -extern (C) int _adCmpChar(Array a1, Array a2) +extern (C) int _adCmpChar(void[] a1, void[] a2) { - version (X86) + version (DMD_X86) { asm { naked ; @@ -569,8 +577,8 @@ ret ; } - } - else + } + else { int len; int c; diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayInit.d druntime/src/rt/arrayInit.d --- druntime-old/src/rt/arrayInit.d 1970-01-01 03:00:00.000000000 +0300 +++ druntime/src/rt/arrayInit.d 2010-10-03 20:41:52.223624001 +0400 @@ -0,0 +1,155 @@ +private import ldc.intrinsics; + +extern(C): + +int memcmp(void*,void*,size_t); +size_t strlen(char*); + +version(LLVM64) +alias llvm_memcpy_i64 llvm_memcpy; +else +alias llvm_memcpy_i32 llvm_memcpy; + +// per-element array init routines + +void _d_array_init_i16(ushort* a, size_t n, ushort v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_i32(uint* a, size_t n, uint v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_i64(ulong* a, size_t n, ulong v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_float(float* a, size_t n, float v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_double(double* a, size_t n, double v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_real(real* a, size_t n, real v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_cfloat(cfloat* a, size_t n, cfloat v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_cdouble(cdouble* a, size_t n, cdouble v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_creal(creal* a, size_t n, creal v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_pointer(void** a, size_t n, void* v) +{ + auto p = a; + auto end = a+n; + while (p !is end) + *p++ = v; +} + +void _d_array_init_mem(void* a, size_t na, void* v, size_t nv) +{ + auto p = a; + auto end = a + na*nv; + while (p !is end) { + llvm_memcpy(p,v,nv,0); + p += nv; + } +} + +/* +void _d_array_init(TypeInfo ti, void* a) +{ + auto initializer = ti.next.init(); + auto isize = initializer.length; + auto q = initializer.ptr; + + if (isize == 1) + memset(p, *cast(ubyte*)q, size); + else if (isize == int.sizeof) + { + int init = *cast(int*)q; + size /= int.sizeof; + for (size_t u = 0; u < size; u++) + { + (cast(int*)p)[u] = init; + } + } + else + { + for (size_t u = 0; u < size; u += isize) + { + memcpy(p + u, q, isize); + } + } +}*/ + +// for array cast +size_t _d_array_cast_len(size_t len, size_t elemsz, size_t newelemsz) +{ + if (newelemsz == 1) { + return len*elemsz; + } + else if ((len*elemsz) % newelemsz) { + throw new Exception("Bad array cast"); + } + return (len*elemsz)/newelemsz; +} + +// slice copy when assertions are enabled +void _d_array_slice_copy(void* dst, size_t dstlen, void* src, size_t srclen) +{ + assert(dst); + assert(src); + if (dstlen != srclen) + throw new Exception("lengths don't match for array copy"); + else if (dst+dstlen <= src || src+srclen <= dst) + llvm_memcpy(dst, src, dstlen, 0); + else + throw new Exception("overlapping array copy"); +} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release 
-x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayassign.d druntime/src/rt/arrayassign.d --- druntime-old/src/rt/arrayassign.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arrayassign.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,186 +0,0 @@ -/** - * Implementation of array assignment support routines. - * - * Copyright: Copyright Digital Mars 2000 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright - * - * Copyright Digital Mars 2000 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arrayassign; - -private -{ - import rt.util.string; - import core.stdc.string; - import core.stdc.stdlib; - debug(PRINTF) import core.stdc.stdio; -} - -/** - * Does array assignment (not construction) from another - * array of the same element type. - * ti is the element type. - * Handles overlapping copies. - */ -extern (C) void[] _d_arrayassign(TypeInfo ti, void[] from, void[] to) -{ - debug(PRINTF) printf("_d_arrayassign(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize()); - - if (to.length != from.length) - { - char[10] tmp = void; - string msg = "lengths don't match for array copy,"c; - msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length); - throw new Exception(msg); - } - - auto element_size = ti.tsize(); - - /* Need a temporary buffer tmp[] big enough to hold one element - */ - void[16] buf = void; - void[] tmp; - if (element_size > buf.sizeof) - tmp = alloca(element_size)[0 .. element_size]; - else - tmp = buf; - - - if (to.ptr <= from.ptr) - { - foreach (i; 0 .. to.length) - { - void* pto = to.ptr + i * element_size; - void* pfrom = from.ptr + i * element_size; - memcpy(tmp.ptr, pto, element_size); - memcpy(pto, pfrom, element_size); - ti.postblit(pto); - ti.destroy(tmp.ptr); - } - } - else - { - for (int i = to.length; i--; ) - { - void* pto = to.ptr + i * element_size; - void* pfrom = from.ptr + i * element_size; - memcpy(tmp.ptr, pto, element_size); - memcpy(pto, pfrom, element_size); - ti.postblit(pto); - ti.destroy(tmp.ptr); - } - } - return to; -} - -/** - * Does array initialization (not assignment) from another - * array of the same element type. - * ti is the element type. - */ -extern (C) void[] _d_arrayctor(TypeInfo ti, void[] from, void[] to) -{ - debug(PRINTF) printf("_d_arrayctor(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize()); - - if (to.length != from.length) - { - char[10] tmp = void; - string msg = "lengths don't match for array initialization,"c; - msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length); - throw new Exception(msg); - } - - auto element_size = ti.tsize(); - - int i; - try - { - for (i = 0; i < to.length; i++) - { - // Copy construction is defined as bit copy followed by postblit. - memcpy(to.ptr + i * element_size, from.ptr + i * element_size, element_size); - ti.postblit(to.ptr + i * element_size); - } - } - catch (Object o) - { - /* Destroy, in reverse order, what we've constructed so far - */ - while (i--) - { - ti.destroy(to.ptr + i * element_size); - } - - throw o; - } - return to; -} - - -/** - * Do assignment to an array. - * p[0 .. 
count] = value; - */ -extern (C) void* _d_arraysetassign(void* p, void* value, int count, TypeInfo ti) -{ - void* pstart = p; - - auto element_size = ti.tsize(); - - //Need a temporary buffer tmp[] big enough to hold one element - void[16] buf = void; - void[] tmp; - if (element_size > buf.sizeof) - { - tmp = alloca(element_size)[0 .. element_size]; - } - else - tmp = buf; - - foreach (i; 0 .. count) - { - memcpy(tmp.ptr, p, element_size); - memcpy(p, value, element_size); - ti.postblit(p); - ti.destroy(tmp.ptr); - p += element_size; - } - return pstart; -} - -/** - * Do construction of an array. - * ti[count] p = value; - */ -extern (C) void* _d_arraysetctor(void* p, void* value, int count, TypeInfo ti) -{ - void* pstart = p; - auto element_size = ti.tsize(); - - try - { - foreach (i; 0 .. count) - { - // Copy construction is defined as bit copy followed by postblit. - memcpy(p, value, element_size); - ti.postblit(p); - p += element_size; - } - } - catch (Object o) - { - // Destroy, in reverse order, what we've constructed so far - while (p > pstart) - { - p -= element_size; - ti.destroy(p); - } - - throw o; - } - return pstart; -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraybyte.d druntime/src/rt/arraybyte.d --- druntime-old/src/rt/arraybyte.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arraybyte.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,1893 +0,0 @@ -/** - * Contains SSE2 and MMX versions of certain operations for char, byte, and - * ubyte ('a', 'g' and 'h' suffixes). - * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. 
- * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arraybyte; - -import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 4; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.amd3dnow amd3dnow; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -alias byte T; - -extern (C): - -/* ======================================================================== */ - - -/*********************** - * Computes: - * a[] = b[] + value - */ - -T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_g(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_g(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpAddSliceAssign_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1088% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - uint l = cast(ubyte) value; - l |= (l << 8); - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startaddsse2u: - add ESI, 64; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - movdqu XMM2, [EAX+32]; - movdqu XMM3, [EAX+48]; - add EAX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM4; - paddb XMM2, XMM4; - paddb XMM3, XMM4; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startaddsse2a: - add ESI, 64; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - movdqa XMM2, [EAX+32]; - movdqa XMM3, [EAX+48]; - add EAX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM4; - paddb XMM2, XMM4; - paddb XMM3, XMM4; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 1000% faster - if (mmx() && a.length >= 32) - { - auto n = aptr + (a.length & ~31); - - uint l = cast(ubyte) value; - l |= (l << 8); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 4; - startaddmmx: - add ESI, 32; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - add EAX, 32; - paddb MM0, MM4; - paddb MM1, MM4; - paddb MM2, MM4; - paddb MM3, MM4; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startaddmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - /* trying to be 
fair and treat normal 32-bit cpu the same way as we do - * the SIMD units, with unrolled asm. There's not enough registers, - * really. - */ - else - if (a.length >= 4) - { - - auto n = aptr + (a.length & ~3); - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov CL, value; - - align 4; - startadd386: - add ESI, 4; - mov DX, [EAX]; - mov BX, [EAX+2]; - add EAX, 4; - add BL, CL; - add BH, CL; - add DL, CL; - add DH, CL; - mov [ESI -4], DX; - mov [ESI+2 -4], BX; - cmp ESI, EDI; - jb startadd386; - - mov aptr, ESI; - mov bptr, EAX; - } - - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ + value); - - return a; -} - -unittest -{ - printf("_arraySliceExpAddSliceAssign_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_g(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_g(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceAddSliceAssign_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 5739% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - version (log) printf("\tsse2 unaligned\n"); - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 8; - startaddlsse2u: - add ESI, 64; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - movdqu XMM2, [EAX+32]; - movdqu XMM3, [EAX+48]; - add EAX, 64; - movdqu XMM4, [ECX]; - movdqu XMM5, [ECX+16]; - movdqu XMM6, [ECX+32]; - movdqu XMM7, [ECX+48]; - add ECX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM5; - paddb XMM2, XMM6; - paddb XMM3, XMM7; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddlsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - version (log) printf("\tsse2 aligned\n"); - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 8; - startaddlsse2a: - add ESI, 64; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - movdqa XMM2, [EAX+32]; - movdqa XMM3, [EAX+48]; - add EAX, 64; - movdqa XMM4, [ECX]; - movdqa XMM5, [ECX+16]; - movdqa XMM6, [ECX+32]; - movdqa XMM7, [ECX+48]; - add ECX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM5; - paddb XMM2, XMM6; - paddb XMM3, XMM7; - movdqa [ESI -64], XMM0; 
- movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddlsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 4428% faster - if (mmx() && a.length >= 32) - { - version (log) printf("\tmmx\n"); - auto n = aptr + (a.length & ~31); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startaddlmmx: - add ESI, 32; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - add EAX, 32; - movq MM4, [ECX]; - movq MM5, [ECX+8]; - movq MM6, [ECX+16]; - movq MM7, [ECX+24]; - add ECX, 32; - paddb MM0, MM4; - paddb MM1, MM5; - paddb MM2, MM6; - paddb MM3, MM7; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startaddlmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - version (log) if (aptr < aend) printf("\tbase\n"); - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ + *cptr++); - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += value - */ - -T[] _arrayExpSliceAddass_a(T[] a, T value) -{ - return _arrayExpSliceAddass_g(a, value); -} - -T[] _arrayExpSliceAddass_h(T[] a, T value) -{ - return _arrayExpSliceAddass_g(a, value); -} - -T[] _arrayExpSliceAddass_g(T[] a, T value) -{ - //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1578% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - uint l = cast(ubyte) value; - l |= (l << 8); - l |= (l << 16); - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startaddasssse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - movdqu XMM2, [ESI+32]; - movdqu XMM3, [ESI+48]; - add ESI, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM4; - paddb XMM2, XMM4; - paddb XMM3, XMM4; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddasssse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startaddasssse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - movdqa XMM2, [ESI+32]; - movdqa XMM3, [ESI+48]; - add ESI, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM4; - paddb XMM2, XMM4; - paddb XMM3, XMM4; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb 
startaddasssse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 1721% faster - if (mmx() && a.length >= 32) - { - - auto n = aptr + (a.length & ~31); - - uint l = cast(ubyte) value; - l |= (l << 8); - - asm - { - mov ESI, aptr; - mov EDI, n; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 8; - startaddassmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - add ESI, 32; - paddb MM0, MM4; - paddb MM1, MM4; - paddb MM2, MM4; - paddb MM3, MM4; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startaddassmmx; - - emms; - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ += value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceAddass_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += b[] - */ - -T[] _arraySliceSliceAddass_a(T[] a, T[] b) -{ - return _arraySliceSliceAddass_g(a, b); -} - -T[] _arraySliceSliceAddass_h(T[] a, T[] b) -{ - return _arraySliceSliceAddass_g(a, b); -} - -T[] _arraySliceSliceAddass_g(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceAddass_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 4727% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startaddasslsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - movdqu XMM2, [ESI+32]; - movdqu XMM3, [ESI+48]; - add ESI, 64; - movdqu XMM4, [ECX]; - movdqu XMM5, [ECX+16]; - movdqu XMM6, [ECX+32]; - movdqu XMM7, [ECX+48]; - add ECX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM5; - paddb XMM2, XMM6; - paddb XMM3, XMM7; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddasslsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startaddasslsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - movdqa XMM2, [ESI+32]; - movdqa XMM3, [ESI+48]; - add ESI, 64; - movdqa XMM4, [ECX]; - movdqa XMM5, [ECX+16]; - movdqa XMM6, [ECX+32]; - movdqa XMM7, [ECX+48]; - add ECX, 64; - paddb XMM0, XMM4; - paddb XMM1, XMM5; - paddb XMM2, XMM6; - paddb XMM3, XMM7; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startaddasslsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 3059% faster - if (mmx() && a.length >= 32) - { - - auto n = aptr 
+ (a.length & ~31); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startaddasslmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - add ESI, 32; - movq MM4, [ECX]; - movq MM5, [ECX+8]; - movq MM6, [ECX+16]; - movq MM7, [ECX+24]; - add ECX, 32; - paddb MM0, MM4; - paddb MM1, MM5; - paddb MM2, MM6; - paddb MM3, MM7; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startaddasslmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ += *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddass_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - - -/*********************** - * Computes: - * a[] = b[] - value - */ - -T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_g(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_g(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMinSliceAssign_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1189% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - uint l = cast(ubyte) value; - l |= (l << 8); - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startsubsse2u: - add ESI, 64; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - movdqu XMM2, [EAX+32]; - movdqu XMM3, [EAX+48]; - add EAX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM4; - psubb XMM2, XMM4; - psubb XMM3, XMM4; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startsubsse2a: - add ESI, 64; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - movdqa XMM2, [EAX+32]; - movdqa XMM3, [EAX+48]; - add EAX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM4; - psubb XMM2, XMM4; - psubb XMM3, XMM4; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 1079% faster - if (mmx() && a.length >= 32) - { - auto n = aptr + (a.length & ~31); - - uint l = cast(ubyte) value; - l |= 
(l << 8); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 4; - startsubmmx: - add ESI, 32; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - add EAX, 32; - psubb MM0, MM4; - psubb MM1, MM4; - psubb MM2, MM4; - psubb MM3, MM4; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startsubmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really. - else - if (a.length >= 4) - { - auto n = aptr + (a.length & ~3); - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov CL, value; - - align 4; - startsub386: - add ESI, 4; - mov DX, [EAX]; - mov BX, [EAX+2]; - add EAX, 4; - sub BL, CL; - sub BH, CL; - sub DL, CL; - sub DH, CL; - mov [ESI -4], DX; - mov [ESI+2 -4], BX; - cmp ESI, EDI; - jb startsub386; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ - value); - - return a; -} - -unittest -{ - printf("_arraySliceExpMinSliceAssign_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] = b[] - 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(b[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, c[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = value - b[] - */ - -T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_g(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_g(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arrayExpSliceMinSliceAssign_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 8748% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - uint l = cast(ubyte) value; - l |= (l << 8); - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startsubrsse2u: - add ESI, 64; - movdqa XMM5, XMM4; - movdqa XMM6, XMM4; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - psubb XMM5, XMM0; - psubb XMM6, XMM1; - movdqu [ESI -64], XMM5; - movdqu [ESI+16-64], XMM6; - movdqa XMM5, XMM4; - movdqa XMM6, XMM4; - movdqu XMM2, [EAX+32]; - movdqu XMM3, [EAX+48]; - add EAX, 64; - psubb XMM5, XMM2; - psubb XMM6, XMM3; - movdqu [ESI+32-64], XMM5; - movdqu [ESI+48-64], XMM6; - cmp ESI, EDI; - jb startsubrsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, 
XMM4, 0; - - align 8; - startsubrsse2a: - add ESI, 64; - movdqa XMM5, XMM4; - movdqa XMM6, XMM4; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - psubb XMM5, XMM0; - psubb XMM6, XMM1; - movdqa [ESI -64], XMM5; - movdqa [ESI+16-64], XMM6; - movdqa XMM5, XMM4; - movdqa XMM6, XMM4; - movdqa XMM2, [EAX+32]; - movdqa XMM3, [EAX+48]; - add EAX, 64; - psubb XMM5, XMM2; - psubb XMM6, XMM3; - movdqa [ESI+32-64], XMM5; - movdqa [ESI+48-64], XMM6; - cmp ESI, EDI; - jb startsubrsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 7397% faster - if (mmx() && a.length >= 32) - { - auto n = aptr + (a.length & ~31); - - uint l = cast(ubyte) value; - l |= (l << 8); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 4; - startsubrmmx: - add ESI, 32; - movq MM5, MM4; - movq MM6, MM4; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - psubb MM5, MM0; - psubb MM6, MM1; - movq [ESI -32], MM5; - movq [ESI+8 -32], MM6; - movq MM5, MM4; - movq MM6, MM4; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - add EAX, 32; - psubb MM5, MM2; - psubb MM6, MM3; - movq [ESI+16-32], MM5; - movq [ESI+24-32], MM6; - cmp ESI, EDI; - jb startsubrmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - - } - - while (aptr < aend) - *aptr++ = cast(T)(value - *bptr++); - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinSliceAssign_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] = 6 - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(6 - b[i])) - { - printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_g(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_g(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 5756% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 8; - startsublsse2u: - add ESI, 64; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - movdqu XMM2, [EAX+32]; - movdqu XMM3, [EAX+48]; - add EAX, 64; - movdqu XMM4, [ECX]; - movdqu XMM5, [ECX+16]; - movdqu XMM6, [ECX+32]; - movdqu XMM7, [ECX+48]; - add ECX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM5; - psubb XMM2, XMM6; - psubb XMM3, XMM7; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp 
ESI, EDI; - jb startsublsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 8; - startsublsse2a: - add ESI, 64; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - movdqa XMM2, [EAX+32]; - movdqa XMM3, [EAX+48]; - add EAX, 64; - movdqa XMM4, [ECX]; - movdqa XMM5, [ECX+16]; - movdqa XMM6, [ECX+32]; - movdqa XMM7, [ECX+48]; - add ECX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM5; - psubb XMM2, XMM6; - psubb XMM3, XMM7; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsublsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 4428% faster - if (mmx() && a.length >= 32) - { - auto n = aptr + (a.length & ~31); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 8; - startsublmmx: - add ESI, 32; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - add EAX, 32; - movq MM4, [ECX]; - movq MM5, [ECX+8]; - movq MM6, [ECX+16]; - movq MM7, [ECX+24]; - add ECX, 32; - psubb MM0, MM4; - psubb MM1, MM5; - psubb MM2, MM6; - psubb MM3, MM7; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startsublmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ - *cptr++); - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= value - */ - -T[] _arrayExpSliceMinass_a(T[] a, T value) -{ - return _arrayExpSliceMinass_g(a, value); -} - -T[] _arrayExpSliceMinass_h(T[] a, T value) -{ - return _arrayExpSliceMinass_g(a, value); -} - -T[] _arrayExpSliceMinass_g(T[] a, T value) -{ - //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1577% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - uint l = cast(ubyte) value; - l |= (l << 8); - l |= (l << 16); - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startsubasssse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - movdqu XMM2, [ESI+32]; - movdqu XMM3, [ESI+48]; - add ESI, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM4; - psubb XMM2, XMM4; - psubb XMM3, XMM4; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubasssse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 8; - startsubasssse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - movdqa XMM2, [ESI+32]; - movdqa XMM3, [ESI+48]; - add ESI, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM4; - psubb XMM2, XMM4; - psubb XMM3, XMM4; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubasssse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 1577% faster - if (mmx() && a.length >= 32) - { - - auto n = aptr + (a.length & ~31); - - uint l = cast(ubyte) value; - l |= (l << 8); - - asm - { - mov ESI, aptr; - mov EDI, n; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 8; - startsubassmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - add ESI, 32; - psubb MM0, MM4; - psubb MM1, MM4; - psubb MM2, MM4; - psubb MM3, MM4; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startsubassmmx; - - emms; - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ -= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinass_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] - */ - -T[] _arraySliceSliceMinass_a(T[] a, T[] b) -{ - return _arraySliceSliceMinass_g(a, b); -} - -T[] _arraySliceSliceMinass_h(T[] a, T[] b) -{ - return _arraySliceSliceMinass_g(a, b); -} - -T[] _arraySliceSliceMinass_g(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMinass_g()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 4800% faster - if (sse2() && a.length >= 64) - { - auto n = aptr + (a.length & ~63); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startsubasslsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - movdqu XMM2, [ESI+32]; - movdqu XMM3, [ESI+48]; - add ESI, 64; - movdqu XMM4, [ECX]; - movdqu XMM5, [ECX+16]; - movdqu XMM6, [ECX+32]; - movdqu XMM7, [ECX+48]; - add ECX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM5; - psubb XMM2, XMM6; - psubb XMM3, XMM7; - movdqu [ESI -64], XMM0; - movdqu [ESI+16-64], XMM1; - movdqu [ESI+32-64], XMM2; - movdqu [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubasslsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startsubasslsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - movdqa XMM2, [ESI+32]; - movdqa XMM3, [ESI+48]; - add ESI, 64; - movdqa XMM4, [ECX]; - movdqa XMM5, [ECX+16]; - movdqa XMM6, [ECX+32]; - movdqa XMM7, [ECX+48]; - add ECX, 64; - psubb XMM0, XMM4; - psubb XMM1, XMM5; - psubb XMM2, XMM6; - psubb XMM3, XMM7; - movdqa [ESI -64], XMM0; - movdqa [ESI+16-64], XMM1; - movdqa [ESI+32-64], XMM2; - movdqa [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsubasslsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 3107% faster - if (mmx() && a.length >= 32) - { - - auto n = aptr + (a.length & ~31); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 8; - startsubasslmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - add ESI, 32; - movq MM4, [ECX]; - movq MM5, [ECX+8]; - movq MM6, [ECX+16]; - movq MM7, [ECX+24]; - add ECX, 32; - psubb MM0, MM4; - psubb MM1, MM5; - psubb MM2, MM6; - psubb MM3, MM7; - movq [ESI -32], MM0; - movq [ESI+8 -32], MM1; - movq [ESI+16-32], MM2; - movq [ESI+24-32], MM3; - cmp ESI, EDI; - jb startsubasslmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ -= *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinass_g unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycast.d druntime/src/rt/arraycast.d --- druntime-old/src/rt/arraycast.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arraycast.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,94 +0,0 @@ -/** - * Implementation of array cast support routines. - * - * Copyright: Copyright Digital Mars 2004 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, Sean Kelly - * - * Copyright Digital Mars 2004 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arraycast; - -/****************************************** - * Runtime helper to convert dynamic array of one - * type to dynamic array of another. - * Adjusts the length of the array. - * Throws exception if new length is not aligned. - */ - -extern (C) - -void[] _d_arraycast(size_t tsize, size_t fsize, void[] a) -{ - auto length = a.length; - - auto nbytes = length * fsize; - if (nbytes % tsize != 0) - { - throw new Exception("array cast misalignment"); - } - length = nbytes / tsize; - *cast(size_t *)&a = length; // jam new length - return a; -} - -unittest -{ - byte[int.sizeof * 3] b; - int[] i; - short[] s; - - i = cast(int[])b; - assert(i.length == 3); - - s = cast(short[])b; - assert(s.length == 6); - - s = cast(short[])i; - assert(s.length == 6); -} - -/****************************************** - * Runtime helper to convert dynamic array of bits - * dynamic array of another. - * Adjusts the length of the array. - * Throws exception if new length is not aligned. - */ - -version (none) -{ -extern (C) - -void[] _d_arraycast_frombit(uint tsize, void[] a) -{ - uint length = a.length; - - if (length & 7) - { - throw new Exception("bit[] array cast misalignment"); - } - length /= 8 * tsize; - *cast(size_t *)&a = length; // jam new length - return a; -} - -unittest -{ - version (D_Bits) - { - bit[int.sizeof * 3 * 8] b; - int[] i; - short[] s; - - i = cast(int[])b; - assert(i.length == 3); - - s = cast(short[])b; - assert(s.length == 6); - } -} - -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycat.d druntime/src/rt/arraycat.d --- druntime-old/src/rt/arraycat.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arraycat.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,42 +0,0 @@ -/** - * Implementation of array copy support routines. - * - * Copyright: Copyright Digital Mars 2004 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, Sean Kelly - * - * Copyright Digital Mars 2004 - 2009. - * Distributed under the Boost Software License, Version 1.0. 
- * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arraycat; - -private -{ - import core.stdc.string; - debug import core.stdc.stdio; -} - -extern (C): - -byte[] _d_arraycopy(size_t size, byte[] from, byte[] to) -{ - debug printf("f = %p,%d, t = %p,%d, size = %d\n", - from.ptr, from.length, to.ptr, to.length, size); - - if (to.length != from.length) - { - throw new Exception("lengths don't match for array copy"); - } - else if (to.ptr + to.length * size <= from.ptr || - from.ptr + from.length * size <= to.ptr) - { - memcpy(to.ptr, from.ptr, to.length * size); - } - else - { - throw new Exception("overlapping array copy"); - } - return to; -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraydouble.d druntime/src/rt/arraydouble.d --- druntime-old/src/rt/arraydouble.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arraydouble.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,1720 +0,0 @@ -/** - * Contains SSE2 and MMX versions of certain operations for double. - * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arraydouble; - -private import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 5; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.amd3dnow amd3dnow; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -/* Performance figures measured by Burton Radons - */ - -alias double T; - -extern (C): - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 333% faster - if (sse2() && b.length >= 16) - { - auto n = aptr + (b.length & ~15); - - // Unaligned case - asm - { - mov EAX, bptr; // left operand - mov ECX, cptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add ESI, 64; - addpd XMM0, XMM4; - addpd XMM1, XMM5; - addpd XMM2, XMM6; - addpd XMM3, XMM7; - add 
ECX, 64; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - // Handle remainder - while (aptr < aend) - *aptr++ = *bptr++ + *cptr++; - - return a; -} - - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 324% faster - if (sse2() && b.length >= 8) - { - auto n = aptr + (b.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; // left operand - mov ECX, cptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add ESI, 64; - subpd XMM0, XMM4; - subpd XMM1, XMM5; - subpd XMM2, XMM6; - subpd XMM3, XMM7; - add ECX, 64; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - // Handle remainder - while (aptr < aend) - *aptr++ = *bptr++ - *cptr++; - - return a; -} - - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + value - */ - -T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpAddSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 305% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - addpd XMM0, XMM4; - addpd XMM1, XMM4; - addpd XMM2, XMM4; - addpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ + value; - - return a; -} - -unittest -{ - printf("_arraySliceExpAddSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += value - */ - -T[] _arrayExpSliceAddass_d(T[] a, T value) -{ - //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 version is 114% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - if (aptr < n) - - // Unaligned case - asm - { - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloopa: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - addpd XMM0, XMM4; - addpd XMM1, XMM4; - addpd XMM2, XMM4; - addpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopa; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ += value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceAddass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. 
dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += b[] - */ - -T[] _arraySliceSliceAddass_d(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceAddass_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 183% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov ECX, bptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add ECX, 64; - addpd XMM0, XMM4; - addpd XMM1, XMM5; - addpd XMM2, XMM6; - addpd XMM3, XMM7; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ += *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - value - */ - -T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMinSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 305% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - subpd XMM0, XMM4; - subpd XMM1, XMM4; - subpd XMM2, XMM4; - subpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ - value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMinSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = value - b[] - */ - -T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arrayExpSliceMinSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 66% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movapd XMM5, XMM4; - movapd XMM6, XMM4; - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - subpd XMM5, XMM0; - subpd XMM6, XMM1; - movupd [ESI+ 0-64], XMM5; - movupd [ESI+16-64], XMM6; - movapd XMM5, XMM4; - movapd XMM6, XMM4; - subpd XMM5, XMM2; - subpd XMM6, XMM3; - movupd [ESI+32-64], XMM5; - movupd [ESI+48-64], XMM6; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = value - *bptr++; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = 6 - a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(6 - a[i])) - { - printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= value - */ - -T[] _arrayExpSliceMinass_d(T[] a, T value) -{ - //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 version is 115% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - if (aptr < n) - - // Unaligned case - asm - { - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloopa: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - subpd XMM0, XMM4; - subpd XMM1, XMM4; - subpd XMM2, XMM4; - subpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopa; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ -= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] - */ - -T[] _arraySliceSliceMinass_d(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMinass_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 183% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov ECX, bptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add ECX, 64; - subpd XMM0, XMM4; - subpd XMM1, XMM5; - subpd XMM2, XMM6; - subpd XMM3, XMM7; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ -= *bptr++; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] 
a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * value - */ - -T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMulSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 304% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM4; - mulpd XMM2, XMM4; - mulpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ * value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * c[] - */ - -T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceMulSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 329% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; // left operand - mov ECX, cptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add ESI, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add EAX, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM5; - mulpd XMM2, XMM6; - mulpd XMM3, XMM7; - add ECX, 64; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ * *cptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMulSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * b[i])) - { - printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= value - */ - -T[] _arrayExpSliceMulass_d(T[] a, T value) -{ - //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 version is 109% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - if (aptr < n) - - // Unaligned case - asm - { - mov ESI, aptr; - mov EDI, n; - movsd XMM4, value; - shufpd XMM4, XMM4, 0; - - align 8; - startsseloopa: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM4; - mulpd XMM2, XMM4; - mulpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopa; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ *= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMulass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= b[] - */ - -T[] _arraySliceSliceMulass_d(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMulass_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 version is 205% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov ECX, bptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - movupd XMM4, [ECX]; - movupd XMM5, [ECX+16]; - movupd XMM6, [ECX+32]; - movupd XMM7, [ECX+48]; - add ECX, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM5; - mulpd XMM2, XMM6; - mulpd XMM3, XMM7; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ *= *bptr++; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMulass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] / value - */ - -T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpDivSliceAssign_d()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - /* Multiplying by the reciprocal is faster, but does - * not produce as accurate an answer. 
- */ - T recip = cast(T)1 / value; - - version (D_InlineAsm_X86) - { - // SSE2 version is 299% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movsd XMM4, recip; - //movsd XMM4, value - //rcpsd XMM4, XMM4 - shufpd XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movupd XMM0, [EAX]; - movupd XMM1, [EAX+16]; - movupd XMM2, [EAX+32]; - movupd XMM3, [EAX+48]; - add EAX, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM4; - mulpd XMM2, XMM4; - mulpd XMM3, XMM4; - //divpd XMM0, XMM4; - //divpd XMM1, XMM4; - //divpd XMM2, XMM4; - //divpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - { - *aptr++ = *bptr++ / value; - //*aptr++ = *bptr++ * recip; - } - - return a; -} - -unittest -{ - printf("_arraySliceExpDivSliceAssign_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] / 8; - - for (int i = 0; i < dim; i++) - { - //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]); - if (c[i] != cast(T)(a[i] / 8)) - { - printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] /= value - */ - -T[] _arrayExpSliceDivass_d(T[] a, T value) -{ - //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - /* Multiplying by the reciprocal is faster, but does - * not produce as accurate an answer. - */ - T recip = cast(T)1 / value; - - version (D_InlineAsm_X86) - { - // SSE2 version is 65% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - // Unaligned case - asm - { - mov ESI, aptr; - mov EDI, n; - movsd XMM4, recip; - //movsd XMM4, value - //rcpsd XMM4, XMM4 - shufpd XMM4, XMM4, 0; - - align 8; - startsseloopa: - movupd XMM0, [ESI]; - movupd XMM1, [ESI+16]; - movupd XMM2, [ESI+32]; - movupd XMM3, [ESI+48]; - add ESI, 64; - mulpd XMM0, XMM4; - mulpd XMM1, XMM4; - mulpd XMM2, XMM4; - mulpd XMM3, XMM4; - //divpd XMM0, XMM4; - //divpd XMM1, XMM4; - //divpd XMM2, XMM4; - //divpd XMM3, XMM4; - movupd [ESI+ 0-64], XMM0; - movupd [ESI+16-64], XMM1; - movupd [ESI+32-64], XMM2; - movupd [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopa; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ *= recip; - - return a; -} - - -unittest -{ - printf("_arrayExpSliceDivass_d unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] /= 8; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] / 8)) - { - printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] * value - */ - -T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAddass_d(a, -value, b); -} - -/*********************** - * Computes: - * a[] += b[] * value - */ - -T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - // Handle remainder - while (aptr < aend) - *aptr++ += *bptr++ * value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAddass_d unittest\n"); - - cpuid = 1; - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 1; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] += a[] * 6; - - for (int i = 0; i < dim; i++) - { - //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); - if (c[i] != cast(T)(b[i] + a[i] * 6)) - { - printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayfloat.d druntime/src/rt/arrayfloat.d --- druntime-old/src/rt/arrayfloat.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arrayfloat.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,1435 +0,0 @@ -/** - * Contains SSE2 and MMX versions of certain operations for float. - * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. 
- * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arrayfloat; - -private import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 5; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.amd3dnow amd3dnow; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -alias float T; - -extern (C): - -/* ======================================================================== */ -/* ======================================================================== */ - -/* template for the case - * a[] = b[] ? c[] - * with some binary operator ? - */ -private template CodeGenSliceSliceOp(string opD, string opSSE, string op3DNow) -{ - const CodeGenSliceSliceOp = ` - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE version is 834% faster - if (sse() && b.length >= 16) - { - auto n = aptr + (b.length & ~15); - - // Unaligned case - asm - { - mov EAX, bptr; // left operand - mov ECX, cptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movups XMM0, [EAX]; - movups XMM1, [EAX+16]; - movups XMM2, [EAX+32]; - movups XMM3, [EAX+48]; - add EAX, 64; - movups XMM4, [ECX]; - movups XMM5, [ECX+16]; - movups XMM6, [ECX+32]; - movups XMM7, [ECX+48]; - add ESI, 64; - ` ~ opSSE ~ ` XMM0, XMM4; - ` ~ opSSE ~ ` XMM1, XMM5; - ` ~ opSSE ~ ` XMM2, XMM6; - ` ~ opSSE ~ ` XMM3, XMM7; - add ECX, 64; - movups [ESI+ 0-64], XMM0; - movups [ESI+16-64], XMM1; - movups [ESI+32-64], XMM2; - movups [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - // 3DNow! 
version is only 13% faster - if (amd3dnow() && b.length >= 8) - { - auto n = aptr + (b.length & ~7); - - asm - { - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - mov EAX, bptr; // left operand - mov ECX, cptr; // right operand - - align 4; - start3dnow: - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - ` ~ op3DNow ~ ` MM0, [ECX]; - ` ~ op3DNow ~ ` MM1, [ECX+8]; - ` ~ op3DNow ~ ` MM2, [ECX+16]; - ` ~ op3DNow ~ ` MM3, [ECX+24]; - movq [ESI], MM0; - movq [ESI+8], MM1; - movq [ESI+16], MM2; - movq [ESI+24], MM3; - add ECX, 32; - add ESI, 32; - add EAX, 32; - cmp ESI, EDI; - jb start3dnow; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - // Handle remainder - while (aptr < aend) - *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++; - - return a;`; -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd")); -} - - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub")); -} - - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * c[] - */ - -T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul")); -} - -unittest -{ - printf("_arraySliceSliceMulSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * b[i])) - { - printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/* template for the case - * a[] ?= value - * with some binary operator ? - */ -private template CodeGenExpSliceOpAssign(string opD, string opSSE, string op3DNow) -{ - const CodeGenExpSliceOpAssign = ` - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - if (sse() && a.length >= 16) - { - auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a - auto aaend = cast(T*)((cast(uint)aend) & ~15); // end of paragraph-aligned slice of a - - int numAligned = cast(int)(aaend - aabeg); // how many floats are in the aligned slice? - - // are there at least 16 floats in the paragraph-aligned slice? - // otherwise we can't do anything with SSE. - if (numAligned >= 16) - { - aaend = aabeg + (numAligned & ~15); // make sure the slice is actually a multiple of 16 floats long - - // process values up to aligned slice one by one - while (aptr < aabeg) - *aptr++ ` ~ opD ~ ` value; - - // process aligned slice with fast SSE operations - asm - { - mov ESI, aabeg; - mov EDI, aaend; - movss XMM4, value; - shufps XMM4, XMM4, 0; - - align 8; - startsseloopa: - movaps XMM0, [ESI]; - movaps XMM1, [ESI+16]; - movaps XMM2, [ESI+32]; - movaps XMM3, [ESI+48]; - add ESI, 64; - ` ~ opSSE ~ ` XMM0, XMM4; - ` ~ opSSE ~ ` XMM1, XMM4; - ` ~ opSSE ~ ` XMM2, XMM4; - ` ~ opSSE ~ ` XMM3, XMM4; - movaps [ESI+ 0-64], XMM0; - movaps [ESI+16-64], XMM1; - movaps [ESI+32-64], XMM2; - movaps [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopa; - } - aptr = aaend; - } - } - else - // 3DNow! 
version is 63% faster - if (amd3dnow() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - ulong w = *cast(uint *) &value; - ulong v = w | (w << 32L); - - asm - { - mov ESI, dword ptr [aptr]; - mov EDI, dword ptr [n]; - movq MM4, qword ptr [v]; - - align 8; - start: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - ` ~ op3DNow ~ ` MM0, MM4; - ` ~ op3DNow ~ ` MM1, MM4; - ` ~ op3DNow ~ ` MM2, MM4; - ` ~ op3DNow ~ ` MM3, MM4; - movq [ESI], MM0; - movq [ESI+8], MM1; - movq [ESI+16], MM2; - movq [ESI+24], MM3; - add ESI, 32; - cmp ESI, EDI; - jb start; - - emms; - mov dword ptr [aptr], ESI; - } - } - } - - while (aptr < aend) - *aptr++ ` ~ opD ~ ` value; - - return a;`; -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += value - */ - -T[] _arrayExpSliceAddass_f(T[] a, T value) -{ - mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd")); -} - -unittest -{ - printf("_arrayExpSliceAddass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= value - */ - -T[] _arrayExpSliceMinass_f(T[] a, T value) -{ - mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub")); -} - -unittest -{ - printf("_arrayExpSliceminass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= value - */ - -T[] _arrayExpSliceMulass_f(T[] a, T value) -{ - mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul")); -} - -unittest -{ - printf("_arrayExpSliceMulass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] /= value - */ - -T[] _arrayExpSliceDivass_f(T[] a, T value) -{ - return _arrayExpSliceMulass_f(a, 1f / value); -} - -unittest -{ - printf("_arrayExpSliceDivass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] /= 8; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] / 8)) - { - printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ -/* ======================================================================== */ - -/* template for the case - * a[] = b[] ? value - * with some binary operator ? - */ -private template CodeGenSliceExpOp(string opD, string opSSE, string op3DNow) -{ - const CodeGenSliceExpOp = ` - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE version is 665% faster - if (sse() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movss XMM4, value; - shufps XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movups XMM0, [EAX]; - movups XMM1, [EAX+16]; - movups XMM2, [EAX+32]; - movups XMM3, [EAX+48]; - add EAX, 64; - ` ~ opSSE ~ ` XMM0, XMM4; - ` ~ opSSE ~ ` XMM1, XMM4; - ` ~ opSSE ~ ` XMM2, XMM4; - ` ~ opSSE ~ ` XMM3, XMM4; - movups [ESI+ 0-64], XMM0; - movups [ESI+16-64], XMM1; - movups [ESI+32-64], XMM2; - movups [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - // 3DNow! 
version is 69% faster - if (amd3dnow() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - ulong w = *cast(uint *) &value; - ulong v = w | (w << 32L); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM4, qword ptr [v]; - - align 8; - start3dnow: - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - ` ~ op3DNow ~ ` MM0, MM4; - ` ~ op3DNow ~ ` MM1, MM4; - ` ~ op3DNow ~ ` MM2, MM4; - ` ~ op3DNow ~ ` MM3, MM4; - movq [ESI], MM0; - movq [ESI+8], MM1; - movq [ESI+16], MM2; - movq [ESI+24], MM3; - add ESI, 32; - add EAX, 32; - cmp ESI, EDI; - jb start3dnow; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ ` ~ opD ~ ` value; - - return a;`; -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + value - */ - -T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceExpOp!("+", "addps", "pfadd")); -} - -unittest -{ - printf("_arraySliceExpAddSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %g != %g + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - value - */ - -T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceExpOp!("-", "subps", "pfsub")); -} - -unittest -{ - printf("_arraySliceExpMinSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * value - */ - -T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul")); -} - -unittest -{ - printf("_arraySliceExpMulSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. 
dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] / value - */ - -T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAssign_f(a, 1f/value, b); -} - -unittest -{ - printf("_arraySliceExpDivSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] / 8; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] / 8)) - { - printf("[%d]: %g != %g / 8\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ -/* ======================================================================== */ - -private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow) -{ - const CodeGenSliceOpAssign = ` - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE version is 468% faster - if (sse() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - // Unaligned case - asm - { - mov ECX, bptr; // right operand - mov ESI, aptr; // destination operand - mov EDI, n; // end comparison - - align 8; - startsseloopb: - movups XMM0, [ESI]; - movups XMM1, [ESI+16]; - movups XMM2, [ESI+32]; - movups XMM3, [ESI+48]; - add ESI, 64; - movups XMM4, [ECX]; - movups XMM5, [ECX+16]; - movups XMM6, [ECX+32]; - movups XMM7, [ECX+48]; - add ECX, 64; - ` ~ opSSE ~ ` XMM0, XMM4; - ` ~ opSSE ~ ` XMM1, XMM5; - ` ~ opSSE ~ ` XMM2, XMM6; - ` ~ opSSE ~ ` XMM3, XMM7; - movups [ESI+ 0-64], XMM0; - movups [ESI+16-64], XMM1; - movups [ESI+32-64], XMM2; - movups [ESI+48-64], XMM3; - cmp ESI, EDI; - jb startsseloopb; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - // 3DNow! 
version is 57% faster - if (amd3dnow() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, dword ptr [aptr]; // destination operand - mov EDI, dword ptr [n]; // end comparison - mov ECX, dword ptr [bptr]; // right operand - - align 4; - start3dnow: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - movq MM2, [ESI+16]; - movq MM3, [ESI+24]; - ` ~ op3DNow ~ ` MM0, [ECX]; - ` ~ op3DNow ~ ` MM1, [ECX+8]; - ` ~ op3DNow ~ ` MM2, [ECX+16]; - ` ~ op3DNow ~ ` MM3, [ECX+24]; - movq [ESI], MM0; - movq [ESI+8], MM1; - movq [ESI+16], MM2; - movq [ESI+24], MM3; - add ESI, 32; - add ECX, 32; - cmp ESI, EDI; - jb start3dnow; - - emms; - mov dword ptr [aptr], ESI; - mov dword ptr [bptr], ECX; - } - } - } - - while (aptr < aend) - *aptr++ ` ~ opD ~ ` *bptr++; - - return a;`; -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += b[] - */ - -T[] _arraySliceSliceAddass_f(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd")); -} - -unittest -{ - printf("_arraySliceSliceAddass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] += b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] - */ - -T[] _arraySliceSliceMinass_f(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub")); -} - -unittest -{ - printf("_arrayExpSliceMinass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %g != %g - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= b[] - */ - -T[] _arraySliceSliceMulass_f(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul")); -} - -unittest -{ - printf("_arrayExpSliceMulass_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. 
dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - c[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %g != %g * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = value - b[] - */ - -T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arrayExpSliceMinSliceAssign_f()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE version is 690% faster - if (sse() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - // Unaligned case - asm - { - mov EAX, bptr; - mov ESI, aptr; - mov EDI, n; - movss XMM4, value; - shufps XMM4, XMM4, 0; - - align 8; - startsseloop: - add ESI, 64; - movaps XMM5, XMM4; - movaps XMM6, XMM4; - movups XMM0, [EAX]; - movups XMM1, [EAX+16]; - movups XMM2, [EAX+32]; - movups XMM3, [EAX+48]; - add EAX, 64; - subps XMM5, XMM0; - subps XMM6, XMM1; - movups [ESI+ 0-64], XMM5; - movups [ESI+16-64], XMM6; - movaps XMM5, XMM4; - movaps XMM6, XMM4; - subps XMM5, XMM2; - subps XMM6, XMM3; - movups [ESI+32-64], XMM5; - movups [ESI+48-64], XMM6; - cmp ESI, EDI; - jb startsseloop; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - // 3DNow! version is 67% faster - if (amd3dnow() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - ulong w = *cast(uint *) &value; - ulong v = w | (w << 32L); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM4, qword ptr [v]; - - align 8; - start3dnow: - movq MM0, [EAX]; - movq MM1, [EAX+8]; - movq MM2, [EAX+16]; - movq MM3, [EAX+24]; - pfsubr MM0, MM4; - pfsubr MM1, MM4; - pfsubr MM2, MM4; - pfsubr MM3, MM4; - movq [ESI], MM0; - movq [ESI+8], MM1; - movq [ESI+16], MM2; - movq [ESI+24], MM3; - add ESI, 32; - add EAX, 32; - cmp ESI, EDI; - jb start3dnow; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = value - *bptr++; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinSliceAssign_f unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = 6 - a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(6 - a[i])) - { - printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] * value - */ - -T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAddass_f(a, -value, b); -} - -/*********************** - * Computes: - * a[] += b[] * value - */ - -T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - // Handle remainder - while (aptr < aend) - *aptr++ += *bptr++ * value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAddass_f unittest\n"); - - cpuid = 1; - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 1; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] += a[] * 6; - - for (int i = 0; i < dim; i++) - { - //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); - if (c[i] != cast(T)(b[i] + a[i] * 6)) - { - printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayint.d druntime/src/rt/arrayint.d --- druntime-old/src/rt/arrayint.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arrayint.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,2430 +0,0 @@ -/** - * Contains MMX versions of certain operations for dchar, int, and uint ('w', - * 'i' and 'k' suffixes). - * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. 
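For reference, a scalar sketch (hypothetical helper names, plain D only) of the last two float routines removed above: the reversed subtraction a[] = value - b[] that pfsubr and the reordered subps operands implement, and the a[] += b[] * value case that has no SIMD path and runs the element loop directly:

// Sketch only; the removed runtime hooks are _arrayExpSliceMinSliceAssign_f
// and _arraySliceExpMulSliceAddass_f.
float[] reverseSubSketch(float[] a, float value, float[] b)
{
    assert(a.length == b.length);
    foreach (i, ref x; a)
        x = value - b[i];         // operands reversed vs. b[i] - value
    return a;
}

float[] mulAddassSketch(float[] a, float value, float[] b)
{
    assert(a.length == b.length);
    foreach (i, ref x; a)
        x += b[i] * value;        // per-element multiply-accumulate
    return a;
}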
- * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arrayint; - -private import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 4; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.amd3dnow amd3dnow; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -alias int T; - -extern (C): - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + value - */ - -T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpAddSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 380% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 298% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM2, l; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - paddd MM0, MM2; - paddd MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - else - if (a.length >= 2) - { - auto n = aptr + (a.length & ~1); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov EDX, value; - - align 4; - start386: - add ESI, 8; - mov EBX, [EAX]; - mov ECX, [EAX+4]; - add EAX, 8; - add EBX, EDX; - add ECX, EDX; - mov [ESI -8], EBX; - mov [ESI+4-8], ECX; - cmp ESI, EDI; - jb start386; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ + value; - - return a; -} - -unittest -{ - 
printf("_arraySliceExpAddSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceAddSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1710% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM2, [ECX]; - movdqu XMM1, [EAX+16]; - movdqu XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM2, [ECX]; - movdqa XMM1, [EAX+16]; - movdqa XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 995% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM2, [ECX]; - movq MM1, [EAX+8]; - movq MM3, [ECX+8]; - add EAX, 16; - add ECX, 16; - paddd MM0, MM2; - paddd MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - -normal: - while (aptr < aend) - *aptr++ = *bptr++ + *cptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. 
dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += value - */ - -T[] _arrayExpSliceAddass_w(T[] a, T value) -{ - return _arrayExpSliceAddass_i(a, value); -} - -T[] _arrayExpSliceAddass_k(T[] a, T value) -{ - return _arrayExpSliceAddass_i(a, value); -} - -T[] _arrayExpSliceAddass_i(T[] a, T value) -{ - //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 83% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 81% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - movq MM2, l; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - paddd MM0, MM2; - paddd MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - else - if (a.length >= 2) - { - auto n = aptr + (a.length & ~1); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EDX, value; - - align 4; - start386: - mov EBX, [ESI]; - mov ECX, [ESI+4]; - add ESI, 8; - add EBX, EDX; - add ECX, EDX; - mov [ESI -8], EBX; - mov [ESI+4-8], ECX; - cmp ESI, EDI; - jb start386; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ += value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceAddass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - a[] += 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(c[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += b[] - */ - -T[] _arraySliceSliceAddass_w(T[] a, T[] b) -{ - return _arraySliceSliceAddass_i(a, b); -} - -T[] _arraySliceSliceAddass_k(T[] a, T[] b) -{ - return _arraySliceSliceAddass_i(a, b); -} - -T[] _arraySliceSliceAddass_i(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceAddass_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 695% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM2, [ECX]; - movdqu XMM1, [ESI+16]; - movdqu XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM2, [ECX]; - movdqa XMM1, [ESI+16]; - movdqa XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - paddd XMM0, XMM2; - paddd XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 471% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM2, [ECX]; - movq MM1, [ESI+8]; - movq MM3, [ECX+8]; - add ESI, 16; - add ECX, 16; - paddd MM0, MM2; - paddd MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - -normal: - while (aptr < aend) - *aptr++ += *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] += a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(b[i] + a[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - value - */ - -T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMinSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 400% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 315% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM2, l; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - psubd MM0, MM2; - psubd MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - else - if (a.length >= 2) - { - auto n = aptr + (a.length & ~1); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov EDX, value; - - align 4; - start386: - add ESI, 8; - mov EBX, [EAX]; - mov ECX, [EAX+4]; - add EAX, 8; - sub EBX, EDX; - sub ECX, EDX; - mov [ESI -8], EBX; - mov [ESI+4-8], ECX; - cmp ESI, EDI; - jb start386; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ - value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMinSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = value - b[] - */ - -T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_i(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_i(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arrayExpSliceMinSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1812% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 4; - startaddsse2u: - add ESI, 32; - movdqu XMM2, [EAX]; - movdqu XMM3, [EAX+16]; - movdqa XMM0, XMM4; - movdqa XMM1, XMM4; - add EAX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM4, l; - pshufd XMM4, XMM4, 0; - - align 4; - startaddsse2a: - add ESI, 32; - movdqa XMM2, [EAX]; - movdqa XMM3, [EAX+16]; - movdqa XMM0, XMM4; - movdqa XMM1, XMM4; - add EAX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 1077% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM4, l; - - align 4; - startmmx: - add ESI, 16; - movq MM2, [EAX]; - movq MM3, [EAX+8]; - movq MM0, MM4; - movq MM1, MM4; - add EAX, 16; - psubd MM0, MM2; - psubd MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = value - *bptr++; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = 6 - a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(6 - a[i])) - { - printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1721% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM2, [ECX]; - movdqu XMM1, [EAX+16]; - movdqu XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM2, [ECX]; - movdqa XMM1, [EAX+16]; - movdqa XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 1002% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM2, [ECX]; - movq MM1, [EAX+8]; - movq MM3, [ECX+8]; - add EAX, 16; - add ECX, 16; - psubd MM0, MM2; - psubd MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ - *cptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= value - */ - -T[] _arrayExpSliceMinass_w(T[] a, T value) -{ - return _arrayExpSliceMinass_i(a, value); -} - -T[] _arrayExpSliceMinass_k(T[] a, T value) -{ - return _arrayExpSliceMinass_i(a, value); -} - -T[] _arrayExpSliceMinass_i(T[] a, T value) -{ - //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 81% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 81% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - movq MM2, l; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - psubd MM0, MM2; - psubd MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - else - if (a.length >= 2) - { - auto n = aptr + (a.length & ~1); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EDX, value; - - align 4; - start386: - mov EBX, [ESI]; - mov ECX, [ESI+4]; - add ESI, 8; - sub EBX, EDX; - sub ECX, EDX; - mov [ESI -8], EBX; - mov [ESI+4-8], ECX; - cmp ESI, EDI; - jb start386; - - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ -= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - a[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(c[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] - */ - -T[] _arraySliceSliceMinass_w(T[] a, T[] b) -{ - return _arraySliceSliceMinass_i(a, b); -} - -T[] _arraySliceSliceMinass_k(T[] a, T[] b) -{ - return _arraySliceSliceMinass_i(a, b); -} - -T[] _arraySliceSliceMinass_i(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMinass_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 731% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM2, [ECX]; - movdqu XMM1, [ESI+16]; - movdqu XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM2, [ECX]; - movdqa XMM1, [ESI+16]; - movdqa XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - psubd XMM0, XMM2; - psubd XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 441% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM2, [ECX]; - movq MM1, [ESI+8]; - movq MM3, [ECX+8]; - add ESI, 16; - add ECX, 16; - psubd MM0, MM2; - psubd MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ -= *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] -= a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(b[i] - a[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * value - */ - -T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAssign_i(a, value, b); -} - -T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMulSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (none) // multiplying a pair is not supported by MMX - { - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1380% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - { - // MMX version is 1380% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movq MM2, l; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - pmuludq MM0, MM2; // only multiplies low 32 bits - pmuludq MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ * value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * 6; - - for (int i = 0; i < dim; i++) - { - //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]); - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * c[] - */ - -T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMulSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMulSliceAssign_i(a, c, b); -} - -T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceMulSliceAssign_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (none) - { - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 1407% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM2, [ECX]; - movdqu XMM1, [EAX+16]; - movdqu XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM2, [ECX]; - movdqa XMM1, [EAX+16]; - movdqa XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 1029% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM2, [ECX]; - movq MM1, [EAX+8]; - movq MM3, [ECX+8]; - add EAX, 16; - add ECX, 16; - pmuludq MM0, MM2; - pmuludq MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - } - - while (aptr < aend) - *aptr++ = *bptr++ * *cptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMulSliceAssign_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * b[i])) - { - printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= value - */ - -T[] _arrayExpSliceMulass_w(T[] a, T value) -{ - return _arrayExpSliceMulass_i(a, value); -} - -T[] _arrayExpSliceMulass_k(T[] a, T value) -{ - return _arrayExpSliceMulass_i(a, value); -} - -T[] _arrayExpSliceMulass_i(T[] a, T value) -{ - //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (none) - { - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 400% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = value; - - if (((cast(uint) aptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 402% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32); - - asm - { - mov ESI, aptr; - mov EDI, n; - movq MM2, l; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - pmuludq MM0, MM2; - pmuludq MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - } - } - - while (aptr < aend) - *aptr++ *= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMulass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = a[]; - a[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(b[i] * 6)) - { - printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= b[] - */ - -T[] _arraySliceSliceMulass_w(T[] a, T[] b) -{ - return _arraySliceSliceMulass_i(a, b); -} - -T[] _arraySliceSliceMulass_k(T[] a, T[] b) -{ - return _arraySliceSliceMulass_i(a, b); -} - -T[] _arraySliceSliceMulass_i(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMulass_i()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (none) - { - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 873% faster - if (sse2() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM2, [ECX]; - movdqu XMM1, [ESI+16]; - movdqu XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM2, [ECX]; - movdqa XMM1, [ESI+16]; - movdqa XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - pmuludq XMM0, XMM2; - pmuludq XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } -/+ BUG: comment out this section until we figure out what is going - wrong with the invalid pshufd instructions. - - else - // MMX version is 573% faster - if (mmx() && a.length >= 4) - { - auto n = aptr + (a.length & ~3); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM2, [ECX]; - movq MM1, [ESI+8]; - movq MM3, [ECX+8]; - pxor MM4, MM4; - pxor MM5, MM5; - punpckldq MM4, MM0; - punpckldq MM5, MM2; - add ESI, 16; - add ECX, 16; - pmuludq MM4, MM5; - pshufd MM4, MM4, 8; // ? - movq [ESI -16], MM4; - pxor MM4, MM4; - pxor MM5, MM5; - punpckldq MM4, MM1; - punpckldq MM5, MM3; - pmuludq MM4, MM5; - pshufd MM4, MM4, 8; // ? - movq [ESI+8-16], MM4; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } -+/ - } - } - - while (aptr < aend) - *aptr++ *= *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMulass_i unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = a[]; - a[] *= c[]; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(b[i] * c[i])) - { - printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayreal.d druntime/src/rt/arrayreal.d --- druntime-old/src/rt/arrayreal.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arrayreal.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,241 +0,0 @@ -/** - * Contains SSE2 and MMX versions of certain operations for real. - * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arrayreal; - -import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 1; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.amd3dnow amd3dnow; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -alias real T; - -extern (C): - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_r(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - for (int i = 0; i < a.length; i++) - a[i] = b[i] + c[i]; - return a; -} - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_r unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %Lg != %Lg + %Lg\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_r(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - for (int i = 0; i < a.length; i++) - a[i] = b[i] - c[i]; - return a; -} - - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_r unittest\n"); - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %Lg != %Lg - %Lg\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] * value - */ - -T[] _arraySliceExpMulSliceMinass_r(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAddass_r(a, -value, b); -} - -/*********************** - * Computes: - * a[] += b[] * value - */ - -T[] _arraySliceExpMulSliceAddass_r(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - // Handle remainder - while (aptr < aend) - *aptr++ += *bptr++ * value; - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAddass_r unittest\n"); - - cpuid = 1; - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 1; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] += a[] * 6; - - for (int i = 0; i < dim; i++) - { - //printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]); - if (c[i] != cast(T)(b[i] + a[i] * 6)) - { - printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayshort.d druntime/src/rt/arrayshort.d --- druntime-old/src/rt/arrayshort.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/arrayshort.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,2303 +0,0 @@ -/** - * Contains SSE2 and MMX versions of certain operations for wchar, short, - * and ushort ('u', 's' and 't' suffixes). 
- * - * Copyright: Copyright Digital Mars 2008 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright, based on code originally written by Burton Radons - * - * Copyright Digital Mars 2008 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.arrayshort; - -private import core.cpuid; - -version (unittest) -{ - private import core.stdc.stdio : printf; - /* This is so unit tests will test every CPU variant - */ - int cpuid; - const int CPUID_MAX = 4; - bool mmx() { return cpuid == 1 && core.cpuid.mmx(); } - bool sse() { return cpuid == 2 && core.cpuid.sse(); } - bool sse2() { return cpuid == 3 && core.cpuid.sse2(); } - bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); } -} -else -{ - alias core.cpuid.mmx mmx; - alias core.cpuid.sse sse; - alias core.cpuid.sse2 sse2; - alias core.cpuid.sse2 sse2; -} - -//version = log; - -bool disjoint(T)(T[] a, T[] b) -{ - return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr); -} - -alias short T; - -extern (C): - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + value - */ - -T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b) -{ - return _arraySliceExpAddSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpAddSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 3343% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 3343% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - paddw MM0, MM2; - paddw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ + value); - - return a; -} - -unittest -{ - printf("_arraySliceExpAddSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - 
version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] + c[] - */ - -T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceAddSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceAddSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 3777% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - movdqu XMM2, [ECX]; - movdqu XMM3, [ECX+16]; - add ECX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - movdqa XMM2, [ECX]; - movdqa XMM3, [ECX+16]; - add ECX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 2068% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - movq MM2, [ECX]; - movq MM3, [ECX+8]; - add ECX, 16; - paddw MM0, MM2; - paddw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ + *cptr++); - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] + b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] + b[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += value - */ - -T[] _arrayExpSliceAddass_u(T[] a, T value) -{ - return _arrayExpSliceAddass_s(a, value); -} - -T[] _arrayExpSliceAddass_t(T[] a, T value) -{ - return _arrayExpSliceAddass_s(a, value); -} - -T[] _arrayExpSliceAddass_s(T[] a, T value) -{ - //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 832% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= (l << 16); - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 826% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - paddw MM0, MM2; - paddw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ += value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceAddass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - a[] += 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(c[i] + 6)) - { - printf("[%d]: %d != %d + 6\n", i, a[i], c[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] += b[] - */ - -T[] _arraySliceSliceAddass_u(T[] a, T[] b) -{ - return _arraySliceSliceAddass_s(a, b); -} - -T[] _arraySliceSliceAddass_t(T[] a, T[] b) -{ - return _arraySliceSliceAddass_s(a, b); -} - -T[] _arraySliceSliceAddass_s(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceAddass_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 2085% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - movdqu XMM2, [ECX]; - movdqu XMM3, [ECX+16]; - add ECX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - movdqa XMM2, [ECX]; - movdqa XMM3, [ECX+16]; - add ECX, 32; - paddw XMM0, XMM2; - paddw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 1022% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - start: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - movq MM2, [ECX]; - movq MM3, [ECX+8]; - add ECX, 16; - paddw MM0, MM2; - paddw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb start; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ += *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceAddass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] += a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(b[i] + a[i])) - { - printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - value - */ - -T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b) -{ - return _arraySliceExpMinSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMinSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 3695% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 3049% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - psubw MM0, MM2; - psubw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ - value); - - return a; -} - -unittest -{ - printf("_arraySliceExpMinSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = value - b[] - */ - -T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_s(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value) -{ - return _arrayExpSliceMinSliceAssign_s(a, b, value); -} - -T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arrayExpSliceMinSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 4995% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= (l << 16); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - - align 4; - startaddsse2u: - movd XMM2, l; - pshufd XMM2, XMM2, 0; - movd XMM3, l; - pshufd XMM3, XMM3, 0; - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - psubw XMM2, XMM0; - psubw XMM3, XMM1; - movdqu [ESI -32], XMM2; - movdqu [ESI+16-32], XMM3; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - - align 4; - startaddsse2a: - movd XMM2, l; - pshufd XMM2, XMM2, 0; - movd XMM3, l; - pshufd XMM3, XMM3, 0; - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - psubw XMM2, XMM0; - psubw XMM3, XMM1; - movdqa [ESI -32], XMM2; - movdqa [ESI+16-32], XMM3; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 4562% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM4, l; - pshufw MM4, MM4, 0; - - align 4; - startmmx: - add ESI, 16; - movq MM2, [EAX]; - movq MM3, [EAX+8]; - movq MM0, MM4; - movq MM1, MM4; - add EAX, 16; - psubw MM0, MM2; - psubw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(value - *bptr++); - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = 6 - a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(6 - a[i])) - { - printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] - c[] - */ - -T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMinSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 4129% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - movdqu XMM2, [ECX]; - movdqu XMM3, [ECX+16]; - add ECX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - movdqa XMM2, [ECX]; - movdqa XMM3, [ECX+16]; - add ECX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 2018% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - movq MM2, [ECX]; - movq MM3, [ECX+8]; - add ECX, 16; - psubw MM0, MM2; - psubw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ - *cptr++); - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] - b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] - b[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= value - */ - -T[] _arrayExpSliceMinass_u(T[] a, T value) -{ - return _arrayExpSliceMinass_s(a, value); -} - -T[] _arrayExpSliceMinass_t(T[] a, T value) -{ - return _arrayExpSliceMinass_s(a, value); -} - -T[] _arrayExpSliceMinass_s(T[] a, T value) -{ - //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 835% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= (l << 16); - - if (((cast(uint) aptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2u; - - mov aptr, ESI; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startaddsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startaddsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 835% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - psubw MM0, MM2; - psubw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ -= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMinass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - a[] = c[]; - a[] -= 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(c[i] - 6)) - { - printf("[%d]: %d != %d - 6\n", i, a[i], c[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] -= b[] - */ - -T[] _arraySliceSliceMinass_u(T[] a, T[] b) -{ - return _arraySliceSliceMinass_s(a, b); -} - -T[] _arraySliceSliceMinass_t(T[] a, T[] b) -{ - return _arraySliceSliceMinass_s(a, b); -} - -T[] _arraySliceSliceMinass_s(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMinass_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 2121% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm // unaligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - movdqu XMM2, [ECX]; - movdqu XMM3, [ECX+16]; - add ECX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm // aligned case - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - movdqa XMM2, [ECX]; - movdqa XMM3, [ECX+16]; - add ECX, 32; - psubw XMM0, XMM2; - psubw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 1116% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - start: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - movq MM2, [ECX]; - movq MM3, [ECX+8]; - add ECX, 16; - psubw MM0, MM2; - psubw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb start; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ -= *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMinass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = c[]; - c[] -= a[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(b[i] - a[i])) - { - printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * value - */ - -T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b) -{ - return _arraySliceExpMulSliceAssign_s(a, value, b); -} - -T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b) -in -{ - assert(a.length == b.length); - assert(disjoint(a, b)); -} -body -{ - //printf("_arraySliceExpMulSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 3733% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= l << 16; - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM1, [EAX+16]; - add EAX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM1, [EAX+16]; - add EAX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - } - } - } - else - // MMX version is 3733% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM1, [EAX+8]; - add EAX, 16; - pmullw MM0, MM2; - pmullw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ * value); - - return a; -} - -unittest -{ - printf("_arraySliceExpMulSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * 6; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * 6)) - { - printf("[%d]: %d != %d * 6\n", i, c[i], a[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] = b[] * c[] - */ - -T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMulSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b) -{ - return _arraySliceSliceMulSliceAssign_s(a, c, b); -} - -T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b) -in -{ - assert(a.length == b.length && b.length == c.length); - assert(disjoint(a, b)); - assert(disjoint(a, c)); - assert(disjoint(b, c)); -} -body -{ - //printf("_arraySliceSliceMulSliceAssign_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - auto cptr = c.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 2515% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2u: - add ESI, 32; - movdqu XMM0, [EAX]; - movdqu XMM2, [ECX]; - movdqu XMM1, [EAX+16]; - movdqu XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startsse2a: - add ESI, 32; - movdqa XMM0, [EAX]; - movdqa XMM2, [ECX]; - movdqa XMM1, [EAX+16]; - movdqa XMM3, [ECX+16]; - add EAX, 32; - add ECX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - else - // MMX version is 2515% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov EAX, bptr; - mov ECX, cptr; - - align 4; - startmmx: - add ESI, 16; - movq MM0, [EAX]; - movq MM2, [ECX]; - movq MM1, [EAX+8]; - movq MM3, [ECX+8]; - add EAX, 16; - add ECX, 16; - pmullw MM0, MM2; - pmullw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, EAX; - mov cptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ = cast(T)(*bptr++ * *cptr++); - - return a; -} - -unittest -{ - printf("_arraySliceSliceMulSliceAssign_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - c[] = a[] * b[]; - - for (int i = 0; i < dim; i++) - { - if (c[i] != cast(T)(a[i] * b[i])) - { - printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= value - */ - -T[] _arrayExpSliceMulass_u(T[] a, T value) -{ - return _arrayExpSliceMulass_s(a, value); -} - -T[] _arrayExpSliceMulass_t(T[] a, T value) -{ - return _arrayExpSliceMulass_s(a, value); -} - -T[] _arrayExpSliceMulass_s(T[] a, T value) -{ - //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value); - auto aptr = a.ptr; - auto aend = aptr + a.length; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 2044% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - uint l = cast(ushort) value; - l |= l << 16; - - if (((cast(uint) aptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM1, [ESI+16]; - add ESI, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM2; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - movd XMM2, l; - pshufd XMM2, XMM2, 0; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM1, [ESI+16]; - add ESI, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM2; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - } - } - } - else - // MMX version is 2056% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - uint l = cast(ushort) value; - - asm - { - mov ESI, aptr; - mov EDI, n; - movd MM2, l; - pshufw MM2, MM2, 0; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM1, [ESI+8]; - add ESI, 16; - pmullw MM0, MM2; - pmullw MM1, MM2; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - } - } - } - - while (aptr < aend) - *aptr++ *= value; - - return a; -} - -unittest -{ - printf("_arrayExpSliceMulass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = a[]; - a[] *= 6; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(b[i] * 6)) - { - printf("[%d]: %d != %d * 6\n", i, a[i], b[i]); - assert(0); - } - } - } - } -} - - -/* ======================================================================== */ - -/*********************** - * Computes: - * a[] *= b[] - */ - -T[] _arraySliceSliceMulass_u(T[] a, T[] b) -{ - return _arraySliceSliceMulass_s(a, b); -} - -T[] _arraySliceSliceMulass_t(T[] a, T[] b) -{ - return _arraySliceSliceMulass_s(a, b); -} - -T[] _arraySliceSliceMulass_s(T[] a, T[] b) -in -{ - assert (a.length == b.length); - assert (disjoint(a, b)); -} -body -{ - //printf("_arraySliceSliceMulass_s()\n"); - auto aptr = a.ptr; - auto aend = aptr + a.length; - auto bptr = b.ptr; - - version (D_InlineAsm_X86) - { - // SSE2 aligned version is 2519% faster - if (sse2() && a.length >= 16) - { - auto n = aptr + (a.length & ~15); - - if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0) - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2u: - movdqu XMM0, [ESI]; - movdqu XMM2, [ECX]; - movdqu XMM1, [ESI+16]; - movdqu XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM3; - movdqu [ESI -32], XMM0; - movdqu [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2u; - - mov aptr, ESI; - mov bptr, ECX; - } - } - else - { - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startsse2a: - movdqa XMM0, [ESI]; - movdqa XMM2, [ECX]; - movdqa XMM1, [ESI+16]; - movdqa XMM3, [ECX+16]; - add ESI, 32; - add ECX, 32; - pmullw XMM0, XMM2; - pmullw XMM1, XMM3; - movdqa [ESI -32], XMM0; - movdqa [ESI+16-32], XMM1; - cmp ESI, EDI; - jb startsse2a; - - mov aptr, ESI; - mov bptr, ECX; - } - } - } - else - // MMX version is 1712% faster - if (mmx() && a.length >= 8) - { - auto n = aptr + (a.length & ~7); - - asm - { - mov ESI, aptr; - mov EDI, n; - mov ECX, bptr; - - align 4; - startmmx: - movq MM0, [ESI]; - movq MM2, [ECX]; - movq MM1, [ESI+8]; - movq MM3, [ECX+8]; - add ESI, 16; - add ECX, 16; - pmullw MM0, MM2; - pmullw MM1, MM3; - movq [ESI -16], MM0; - movq [ESI+8-16], MM1; - cmp ESI, EDI; - jb startmmx; - - emms; - mov aptr, ESI; - mov bptr, ECX; - } - } - } - - while (aptr < aend) - *aptr++ *= *bptr++; - - return a; -} - -unittest -{ - printf("_arraySliceSliceMulass_s unittest\n"); - - for (cpuid = 0; cpuid < CPUID_MAX; cpuid++) - { - version (log) printf(" cpuid %d\n", cpuid); - - for (int j = 0; j < 2; j++) - { - const int dim = 67; - T[] a = new T[dim + j]; // aligned on 16 byte boundary - a = a[j .. dim + j]; // misalign for second iteration - T[] b = new T[dim + j]; - b = b[j .. dim + j]; - T[] c = new T[dim + j]; - c = c[j .. 
dim + j]; - - for (int i = 0; i < dim; i++) - { a[i] = cast(T)i; - b[i] = cast(T)(i + 7); - c[i] = cast(T)(i * 2); - } - - b[] = a[]; - a[] *= c[]; - - for (int i = 0; i < dim; i++) - { - if (a[i] != cast(T)(b[i] * c[i])) - { - printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]); - assert(0); - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh.c druntime/src/rt/deh.c --- druntime-old/src/rt/deh.c 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/deh.c 1970-01-01 03:00:00.000000000 +0300 @@ -1,734 +0,0 @@ -/** - * Implementation of exception handling support routines for Windows. - * - * Copyright: Copyright Digital Mars 1999 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright - * - * Copyright Digital Mars 1999 - 2009. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -#include -#include -#include -#include - -/* ======================== Win32 =============================== */ - -#if _WIN32 - -#include -#include - -//#include "\sc\src\include\ehsup.h" - -/*** From Digital Mars C runtime library ***/ -EXCEPTION_DISPOSITION __cdecl _local_except_handler (EXCEPTION_RECORD *ExceptionRecord, - void* EstablisherFrame, - void *ContextRecord, - void *DispatcherContext - ); -void __cdecl _global_unwind(void *frame,EXCEPTION_RECORD *eRecord); -#define EXCEPTION_UNWIND 6 // Flag to indicate if the system is unwinding - -extern DWORD _except_list; -/*** ***/ - -#include "mars.h" - -extern ClassInfo D6object9Throwable7__ClassZ; -#define _Class_9Throwable D6object9Throwable7__ClassZ; - -extern ClassInfo D6object5Error7__ClassZ; -#define _Class_5Error D6object5Error7__ClassZ - -typedef int (__pascal *fp_t)(); // function pointer in ambient memory model - -void _d_setunhandled(Object*); - -// The layout of DEstablisherFrame is the same for C++ - -struct DEstablisherFrame -{ - void *prev; // pointer to previous exception list - void *handler; // pointer to routine for exception handler - DWORD table_index; // current index into handler_info[] - DWORD ebp; // this is EBP of routine -}; - -struct DHandlerInfo -{ - int prev_index; // previous table index - unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) - void *finally_code; // pointer to finally code to execute - // (!=0 if try-finally) -}; - -// Address of DHandlerTable is passed in EAX to _d_framehandler() - -struct DHandlerTable -{ - void *fptr; // pointer to start of function - unsigned espoffset; // offset of ESP from EBP - unsigned retoffset; // offset from start of function to return code - struct DHandlerInfo handler_info[1]; -}; - -struct DCatchBlock -{ - ClassInfo *type; // catch type - unsigned bpoffset; // EBP offset of catch var - void *code; // catch handler code -}; - -// Create one of these for each try-catch -struct DCatchInfo -{ - unsigned ncatches; // number of catch blocks - struct DCatchBlock catch_block[1]; // data for each catch block -}; - -// Macro to make our own exception code -#define MAKE_EXCEPTION_CODE(severity, facility, exception) \ - (((severity) << 30) | (1 << 29) | (0 << 28) | ((facility) << 16) | (exception)) - -#define STATUS_DIGITAL_MARS_D_EXCEPTION MAKE_EXCEPTION_CODE(3,'D',1) - -Object 
*_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record); -void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, struct DEstablisherFrame *frame, int stop_index); - - -/*********************************** - * The frame handler, this is called for each frame that has been registered - * in the OS except_list. - * Input: - * EAX the handler table for the frame - */ - -EXCEPTION_DISPOSITION _d_framehandler( - EXCEPTION_RECORD *exception_record, - struct DEstablisherFrame *frame, - CONTEXT *context, - void *dispatcher_context) -{ - struct DHandlerTable *handler_table; - - __asm { mov handler_table,EAX } - - if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) - { - // Call all the finally blocks in this frame - _d_local_unwind(handler_table, frame, -1); - } - else - { - // Jump to catch block if matching one is found - - int ndx,prev_ndx,i; - struct DHandlerInfo *phi; - struct DCatchInfo *pci; - struct DCatchBlock *pcb; - unsigned ncatches; // number of catches in the current handler - Object *pti; - ClassInfo *ci; - - ci = NULL; // only compute it if we need it - - // walk through handler table, checking each handler - // with an index smaller than the current table_index - for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx) - { - phi = &handler_table->handler_info[ndx]; - prev_ndx = phi->prev_index; - if (phi->cioffset) - { - // this is a catch handler (no finally) - pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); - ncatches = pci->ncatches; - for (i = 0; i < ncatches; i++) - { - pcb = &pci->catch_block[i]; - - if (!ci) - { - // This code must match the translation code - if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION) - { - //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]); - ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]); - } - else - ci = &_Class_9Throwable; - } - - if (_d_isbaseof(ci, pcb->type)) - { - // Matched the catch type, so we've found the handler. - int regebp; - - pti = _d_translate_se_to_d_exception(exception_record); - - // Initialize catch variable - regebp = (int)&frame->ebp; // EBP for this frame - *(void **)(regebp + (pcb->bpoffset)) = pti; - - _d_setunhandled(pti); - - // Have system call all finally blocks in intervening frames - _global_unwind(frame, exception_record); - - // Call all the finally blocks skipped in this frame - _d_local_unwind(handler_table, frame, ndx); - - _d_setunhandled(NULL); - - frame->table_index = prev_ndx; // we are out of this handler - - // Jump to catch block. Does not return. - { - unsigned catch_esp; - fp_t catch_addr; - - catch_addr = (fp_t)(pcb->code); - catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); - _asm - { - mov EAX,catch_esp - mov ECX,catch_addr - mov [EAX],ECX - mov EBP,regebp - mov ESP,EAX // reset stack - ret // jump to catch block - } - } - } - } - } - } - } - return ExceptionContinueSearch; -} - -/*********************************** - * Exception filter for use in __try..__except block - * surrounding call to Dmain() - */ - -int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs, - int retval, - Object **exception_object) -{ - *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord); - return retval; -} - -/*********************************** - * Throw a D object. 
- */ - -void __stdcall _d_throw(Object *h) -{ - //printf("_d_throw(h = %p, &h = %p)\n", h, &h); - //printf("\tvptr = %p\n", *(void **)h); - RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION, - EXCEPTION_NONCONTINUABLE, - 1, (DWORD *)&h); -} - -/*********************************** - * Create an exception object - */ - -Object *_d_create_exception_object(ClassInfo *ci, char *msg) -{ - Throwable *exc; - - exc = (Throwable *)_d_newclass(ci); - // BUG: what if _d_newclass() throws an out of memory exception? - - if (msg) - { - exc->msglen = strlen(msg); - exc->msg = msg; - } - return (Object *)exc; -} - -/*********************************** - * Converts a Windows Structured Exception code to a D Exception Object. - */ - -Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record) -{ - Object *pti; - - switch (exception_record->ExceptionCode) { - case STATUS_DIGITAL_MARS_D_EXCEPTION: - // Generated D exception - pti = (Object *)(exception_record->ExceptionInformation[0]); - break; - - case STATUS_INTEGER_DIVIDE_BY_ZERO: - pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero"); - break; - - case STATUS_FLOAT_DIVIDE_BY_ZERO: - pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero"); - break; - - case STATUS_ACCESS_VIOLATION: - pti = _d_create_exception_object(&_Class_5Error, "Access Violation"); - break; - - case STATUS_STACK_OVERFLOW: - pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow"); - break; - - case STATUS_DATATYPE_MISALIGNMENT: - pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment"); - break; - - case STATUS_ARRAY_BOUNDS_EXCEEDED: - pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded"); - break; - - case STATUS_FLOAT_INVALID_OPERATION: - pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation"); - break; - - case STATUS_FLOAT_DENORMAL_OPERAND: - pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand"); - break; - - case STATUS_FLOAT_INEXACT_RESULT: - pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result"); - break; - - case STATUS_FLOAT_OVERFLOW: - pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow"); - break; - - case STATUS_FLOAT_UNDERFLOW: - pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow"); - break; - - case STATUS_FLOAT_STACK_CHECK: - pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check"); - break; - - case STATUS_PRIVILEGED_INSTRUCTION: - if (*((unsigned char *)(exception_record->ExceptionAddress))==0xF4) { // HLT - pti = _d_create_exception_object(&_Class_5Error, "assert(0) or HLT instruction"); - } else { - pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction"); - } - break; - - case STATUS_ILLEGAL_INSTRUCTION: - pti = _d_create_exception_object(&_Class_5Error, "Illegal Instruction"); - break; - - case STATUS_BREAKPOINT: - pti = _d_create_exception_object(&_Class_5Error, "Breakpoint"); - break; - - case STATUS_IN_PAGE_ERROR: - pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception"); - break; -/* - case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor - case STATUS_INVALID_DISPOSITION: - case STATUS_NONCONTINUABLE_EXCEPTION: - case STATUS_SINGLE_STEP: - case DBG_CONTROL_C: // only when a debugger is attached - // In DMC, but not in Microsoft docs - case STATUS_GUARD_PAGE_VIOLATION: - case STATUS_INVALID_HANDLE: -*/ - // convert all other exception codes into a Win32Exception 
- default: - pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception"); - break; - } - - return pti; -} - -/************************************** - * Call finally blocks in the current stack frame until stop_index. - * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c - */ - -void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, - struct DEstablisherFrame *frame, int stop_index) -{ - struct DHandlerInfo *phi; - struct DCatchInfo *pci; - int i; - - // Set up a special exception handler to catch double-fault exceptions. - __asm - { - push dword ptr -1 - push dword ptr 0 - push offset _local_except_handler // defined in src\win32\ehsup.c - push dword ptr fs:_except_list - mov FS:_except_list,ESP - } - - for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index) - { - phi = &handler_table->handler_info[i]; - if (phi->finally_code) - { - // Note that it is unnecessary to adjust the ESP, as the finally block - // accesses all items on the stack as relative to EBP. - - DWORD *catch_ebp = &frame->ebp; - void *blockaddr = phi->finally_code; - - _asm - { - push EBX - mov EBX,blockaddr - push EBP - mov EBP,catch_ebp - call EBX - pop EBP - pop EBX - } - } - } - - _asm - { - pop FS:_except_list - add ESP,12 - } -} - -/*********************************** - * external version of the unwinder - */ - -__declspec(naked) void __cdecl _d_local_unwind2() -{ - __asm - { - jmp _d_local_unwind - } -} - -/*********************************** - * The frame handler, this is called for each frame that has been registered - * in the OS except_list. - * Input: - * EAX the handler table for the frame - */ - -EXCEPTION_DISPOSITION _d_monitor_handler( - EXCEPTION_RECORD *exception_record, - struct DEstablisherFrame *frame, - CONTEXT *context, - void *dispatcher_context) -{ - if (exception_record->ExceptionFlags & EXCEPTION_UNWIND) - { - _d_monitorexit((Object *)frame->table_index); - } - else - { - } - return ExceptionContinueSearch; -} - -/*********************************** - */ - -void _d_monitor_prolog(void *x, void *y, Object *h) -{ - __asm - { - push EAX - } - //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h); - _d_monitorenter(h); - __asm - { - pop EAX - } -} - -/*********************************** - */ - -void _d_monitor_epilog(void *x, void *y, Object *h) -{ - //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h); - __asm - { - push EAX - push EDX - } - _d_monitorexit(h); - __asm - { - pop EDX - pop EAX - } -} - -#endif - -/* ======================== linux =============================== */ - -#if linux - -#include "mars.h" - -extern ClassInfo D6object9Throwable7__ClassZ; -#define _Class_9Throwable D6object9Throwable7__ClassZ; - -extern ClassInfo D6object5Error7__ClassZ; -#define _Class_5Error D6object5Error7__ClassZ - -typedef int (*fp_t)(); // function pointer in ambient memory model - -struct DHandlerInfo -{ - unsigned offset; // offset from function address to start of guarded section - int prev_index; // previous table index - unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) - void *finally_code; // pointer to finally code to execute - // (!=0 if try-finally) -}; - -// Address of DHandlerTable, searched for by eh_finddata() - -struct DHandlerTable -{ - void *fptr; // pointer to start of function - unsigned espoffset; // offset of ESP from EBP - unsigned retoffset; // offset from start of function to return code - unsigned nhandlers; // dimension of handler_info[] - struct DHandlerInfo 
handler_info[1]; -}; - -struct DCatchBlock -{ - ClassInfo *type; // catch type - unsigned bpoffset; // EBP offset of catch var - void *code; // catch handler code -}; - -// Create one of these for each try-catch -struct DCatchInfo -{ - unsigned ncatches; // number of catch blocks - struct DCatchBlock catch_block[1]; // data for each catch block -}; - -// One of these is generated for each function with try-catch or try-finally - -struct FuncTable -{ - void *fptr; // pointer to start of function - struct DHandlerTable *handlertable; // eh data for this function - unsigned size; // size of function in bytes -}; - -extern struct FuncTable *table_start; -extern struct FuncTable *table_end; - -void terminate() -{ -// _asm -// { -// hlt -// } -} - -/******************************************* - * Given address that is inside a function, - * figure out which function it is in. - * Return DHandlerTable if there is one, NULL if not. - */ - -struct DHandlerTable *__eh_finddata(void *address) -{ - struct FuncTable *ft; - - for (ft = (struct FuncTable *)table_start; - ft < (struct FuncTable *)table_end; - ft++) - { - if (ft->fptr <= address && - address < (void *)((char *)ft->fptr + ft->size)) - { - return ft->handlertable; - } - } - return NULL; -} - - -/****************************** - * Given EBP, find return address to caller, and caller's EBP. - * Input: - * regbp Value of EBP for current function - * *pretaddr Return address - * Output: - * *pretaddr return address to caller - * Returns: - * caller's EBP - */ - -unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr) -{ - unsigned bp = *(unsigned *)regbp; - - if (bp) // if not end of call chain - { - // Perform sanity checks on new EBP. - // If it is screwed up, terminate() hopefully before we do more damage. - if (bp <= regbp) - // stack should grow to smaller values - terminate(); - - *pretaddr = *(unsigned *)(regbp + sizeof(int)); - } - return bp; -} - -/*********************************** - * Throw a D object. 
- */ - -void __stdcall _d_throw(Object *h) -{ - unsigned regebp; - - //printf("_d_throw(h = %p, &h = %p)\n", h, &h); - //printf("\tvptr = %p\n", *(void **)h); - - regebp = _EBP; - - while (1) // for each function on the stack - { - struct DHandlerTable *handler_table; - struct FuncTable *pfunc; - struct DHandlerInfo *phi; - unsigned retaddr; - unsigned funcoffset; - unsigned spoff; - unsigned retoffset; - int index; - int dim; - int ndx; - int prev_ndx; - - regebp = __eh_find_caller(regebp,&retaddr); - if (!regebp) - // if end of call chain - break; - - handler_table = __eh_finddata((void *)retaddr); // find static data associated with function - if (!handler_table) // if no static data - { - continue; - } - funcoffset = (unsigned)handler_table->fptr; - spoff = handler_table->espoffset; - retoffset = handler_table->retoffset; - -#ifdef DEBUG - printf("retaddr = x%x\n",(unsigned)retaddr); - printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", - regebp,funcoffset,spoff,retoffset); -#endif - - // Find start index for retaddr in static data - dim = handler_table->nhandlers; - index = -1; - for (int i = 0; i < dim; i++) - { - phi = &handler_table->handler_info[i]; - - if ((unsigned)retaddr >= funcoffset + phi->offset) - index = i; - } - - // walk through handler table, checking each handler - // with an index smaller than the current table_index - for (ndx = index; ndx != -1; ndx = prev_ndx) - { - phi = &handler_table->handler_info[ndx]; - prev_ndx = phi->prev_index; - if (phi->cioffset) - { - // this is a catch handler (no finally) - struct DCatchInfo *pci; - int ncatches; - int i; - - pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset); - ncatches = pci->ncatches; - for (i = 0; i < ncatches; i++) - { - struct DCatchBlock *pcb; - ClassInfo *ci = **(ClassInfo ***)h; - - pcb = &pci->catch_block[i]; - - if (_d_isbaseof(ci, pcb->type)) - { // Matched the catch type, so we've found the handler. - - // Initialize catch variable - *(void **)(regebp + (pcb->bpoffset)) = h; - - // Jump to catch block. Does not return. - { - unsigned catch_esp; - fp_t catch_addr; - - catch_addr = (fp_t)(pcb->code); - catch_esp = regebp - handler_table->espoffset - sizeof(fp_t); - _asm - { - mov EAX,catch_esp - mov ECX,catch_addr - mov [EAX],ECX - mov EBP,regebp - mov ESP,EAX // reset stack - ret // jump to catch block - } - } - } - } - } - else if (phi->finally_code) - { // Call finally block - // Note that it is unnecessary to adjust the ESP, as the finally block - // accesses all items on the stack as relative to EBP. - - void *blockaddr = phi->finally_code; - - _asm - { - push EBX - mov EBX,blockaddr - push EBP - mov EBP,regebp - call EBX - pop EBP - pop EBX - } - } - } - } -} - - -#endif diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh2.d druntime/src/rt/deh2.d --- druntime-old/src/rt/deh2.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/deh2.d 1970-01-01 03:00:00.000000000 +0300 @@ -1,322 +0,0 @@ -/** - * Implementation of exception handling support routines for Posix. - * - * Copyright: Copyright Digital Mars 2000 - 2009. - * License: Boost License 1.0. - * Authors: Walter Bright - * - * Copyright Digital Mars 2000 - 2009. - * Distributed under the Boost Software License, Version 1.0. 
- * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -module rt.deh2; - -//debug=1; - -extern (C) -{ - extern __gshared - { - void* _deh_beg; - void* _deh_end; - } - - int _d_isbaseof(ClassInfo oc, ClassInfo c); - - void _d_setunhandled(Object* o); -} - -alias int (*fp_t)(); // function pointer in ambient memory model - -struct DHandlerInfo -{ - uint offset; // offset from function address to start of guarded section - uint endoffset; // offset of end of guarded section - int prev_index; // previous table index - uint cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch) - void *finally_code; // pointer to finally code to execute - // (!=0 if try-finally) -} - -// Address of DHandlerTable, searched for by eh_finddata() - -struct DHandlerTable -{ - void *fptr; // pointer to start of function - uint espoffset; // offset of ESP from EBP - uint retoffset; // offset from start of function to return code - uint nhandlers; // dimension of handler_info[] - DHandlerInfo handler_info[1]; -} - -struct DCatchBlock -{ - ClassInfo type; // catch type - uint bpoffset; // EBP offset of catch var - void *code; // catch handler code -} - -// Create one of these for each try-catch -struct DCatchInfo -{ - uint ncatches; // number of catch blocks - DCatchBlock catch_block[1]; // data for each catch block -} - -// One of these is generated for each function with try-catch or try-finally - -struct FuncTable -{ - void *fptr; // pointer to start of function - DHandlerTable *handlertable; // eh data for this function - uint fsize; // size of function in bytes -} - -void terminate() -{ - asm - { - hlt ; - } -} - -/******************************************* - * Given address that is inside a function, - * figure out which function it is in. - * Return DHandlerTable if there is one, NULL if not. - */ - -DHandlerTable *__eh_finddata(void *address) -{ - FuncTable *ft; - -// debug printf("__eh_finddata(address = x%x)\n", address); -// debug printf("_deh_beg = x%x, _deh_end = x%x\n", &_deh_beg, &_deh_end); - for (ft = cast(FuncTable *)&_deh_beg; - ft < cast(FuncTable *)&_deh_end; - ft++) - { -// debug printf("\tfptr = x%x, fsize = x%03x, handlertable = x%x\n", -// ft.fptr, ft.fsize, ft.handlertable); - - if (ft.fptr <= address && - address < cast(void *)(cast(char *)ft.fptr + ft.fsize)) - { -// debug printf("\tfound handler table\n"); - return ft.handlertable; - } - } -// debug printf("\tnot found\n"); - return null; -} - - -/****************************** - * Given EBP, find return address to caller, and caller's EBP. - * Input: - * regbp Value of EBP for current function - * *pretaddr Return address - * Output: - * *pretaddr return address to caller - * Returns: - * caller's EBP - */ - -uint __eh_find_caller(uint regbp, uint *pretaddr) -{ - uint bp = *cast(uint *)regbp; - - if (bp) // if not end of call chain - { - // Perform sanity checks on new EBP. - // If it is screwed up, terminate() hopefully before we do more damage. - if (bp <= regbp) - // stack should grow to smaller values - terminate(); - - *pretaddr = *cast(uint *)(regbp + int.sizeof); - } - return bp; -} - -/*********************************** - * Throw a D object. 
- */ - -extern (Windows) void _d_throw(Object *h) -{ - uint regebp; - - debug - { - printf("_d_throw(h = %p, &h = %p)\n", h, &h); - printf("\tvptr = %p\n", *cast(void **)h); - } - - asm - { - mov regebp,EBP ; - } - - _d_setunhandled(h); - -//static uint abc; -//if (++abc == 2) *(char *)0=0; - -//int count = 0; - while (1) // for each function on the stack - { - DHandlerTable *handler_table; - FuncTable *pfunc; - DHandlerInfo *phi; - uint retaddr; - uint funcoffset; - uint spoff; - uint retoffset; - int index; - int dim; - int ndx; - int prev_ndx; - - regebp = __eh_find_caller(regebp,&retaddr); - if (!regebp) - { // if end of call chain - debug printf("end of call chain\n"); - break; - } - - debug printf("found caller, EBP = x%x, retaddr = x%x\n", regebp, retaddr); -//if (++count == 12) *(char*)0=0; - handler_table = __eh_finddata(cast(void *)retaddr); // find static data associated with function - if (!handler_table) // if no static data - { - debug printf("no handler table\n"); - continue; - } - funcoffset = cast(uint)handler_table.fptr; - spoff = handler_table.espoffset; - retoffset = handler_table.retoffset; - - debug - { - printf("retaddr = x%x\n",cast(uint)retaddr); - printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n", - regebp,funcoffset,spoff,retoffset); - } - - // Find start index for retaddr in static data - dim = handler_table.nhandlers; - - debug - { - printf("handler_info[]:\n"); - for (int i = 0; i < dim; i++) - { - phi = &handler_table.handler_info[i]; - printf("\t[%d]: offset = x%04x, endoffset = x%04x, prev_index = %d, cioffset = x%04x, finally_code = %x\n", - i, phi.offset, phi.endoffset, phi.prev_index, phi.cioffset, phi.finally_code); - } - } - - index = -1; - for (int i = 0; i < dim; i++) - { - phi = &handler_table.handler_info[i]; - - debug printf("i = %d, phi.offset = %04x\n", i, funcoffset + phi.offset); - if (cast(uint)retaddr > funcoffset + phi.offset && - cast(uint)retaddr <= funcoffset + phi.endoffset) - index = i; - } - debug printf("index = %d\n", index); - - // walk through handler table, checking each handler - // with an index smaller than the current table_index - for (ndx = index; ndx != -1; ndx = prev_ndx) - { - phi = &handler_table.handler_info[ndx]; - prev_ndx = phi.prev_index; - if (phi.cioffset) - { - // this is a catch handler (no finally) - DCatchInfo *pci; - int ncatches; - int i; - - pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset); - ncatches = pci.ncatches; - for (i = 0; i < ncatches; i++) - { - DCatchBlock *pcb; - ClassInfo ci = **cast(ClassInfo **)h; - - pcb = &pci.catch_block[i]; - - if (_d_isbaseof(ci, pcb.type)) - { // Matched the catch type, so we've found the handler. - - _d_setunhandled(null); - - // Initialize catch variable - *cast(void **)(regebp + (pcb.bpoffset)) = h; - - // Jump to catch block. Does not return. - { - uint catch_esp; - fp_t catch_addr; - - catch_addr = cast(fp_t)(pcb.code); - catch_esp = regebp - handler_table.espoffset - fp_t.sizeof; - asm - { - mov EAX,catch_esp ; - mov ECX,catch_addr ; - mov [EAX],ECX ; - mov EBP,regebp ; - mov ESP,EAX ; // reset stack - ret ; // jump to catch block - } - } - } - } - } - else if (phi.finally_code) - { // Call finally block - // Note that it is unnecessary to adjust the ESP, as the finally block - // accesses all items on the stack as relative to EBP. 
- - void *blockaddr = phi.finally_code; - - version (OSX) - { - asm - { - sub ESP,4 ; // align stack to 16 - push EBX ; - mov EBX,blockaddr ; - push EBP ; - mov EBP,regebp ; - call EBX ; - pop EBP ; - pop EBX ; - add ESP,4 ; - } - } - else - { - asm - { - push EBX ; - mov EBX,blockaddr ; - push EBP ; - mov EBP,regebp ; - call EBX ; - pop EBP ; - pop EBX ; - } - } - } - } - } -} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/eh.d druntime/src/rt/eh.d --- druntime-old/src/rt/eh.d 1970-01-01 03:00:00.000000000 +0300 +++ druntime/src/rt/eh.d 2010-10-03 18:29:58.099624002 +0400 @@ -0,0 +1,428 @@ +/** + * This module contains functions and structures required for + * exception handling. + */ +module eh; + +private import core.stdc.stdio; +private import core.stdc.stdlib; +private import rt.util.console; +private import ldc.cstdarg; + +// debug = EH_personality; +// debug = EH_personality_verbose; + +// current EH implementation works on x86 +// if it has a working unwind runtime +version(X86) { + version(linux) version=X86_UNWIND; + version(darwin) version=X86_UNWIND; + version(solaris) version=X86_UNWIND; +} +version(X86_64) { + version(linux) version=X86_UNWIND; + version(darwin) version=X86_UNWIND; + version(solaris) version=X86_UNWIND; +} + +//version = HP_LIBUNWIND; + +// D runtime functions +extern(C) { + int _d_isbaseof(ClassInfo oc, ClassInfo c); +} + +// libunwind headers +extern(C) +{ + enum _Unwind_Reason_Code : int + { + NO_REASON = 0, + FOREIGN_EXCEPTION_CAUGHT = 1, + FATAL_PHASE2_ERROR = 2, + FATAL_PHASE1_ERROR = 3, + NORMAL_STOP = 4, + END_OF_STACK = 5, + HANDLER_FOUND = 6, + INSTALL_CONTEXT = 7, + CONTINUE_UNWIND = 8 + } + + enum _Unwind_Action : int + { + SEARCH_PHASE = 1, + CLEANUP_PHASE = 2, + HANDLER_FRAME = 4, + FORCE_UNWIND = 8 + } + + alias void* _Unwind_Context_Ptr; + + alias void function(_Unwind_Reason_Code, _Unwind_Exception*) _Unwind_Exception_Cleanup_Fn; + + struct _Unwind_Exception + { + ulong exception_class; + _Unwind_Exception_Cleanup_Fn exception_cleanup; + ptrdiff_t private_1; + ptrdiff_t private_2; + } + +// interface to HP's libunwind from http://www.nongnu.org/libunwind/ +version(HP_LIBUNWIND) +{ + void __libunwind_Unwind_Resume(_Unwind_Exception *); + _Unwind_Reason_Code __libunwind_Unwind_RaiseException(_Unwind_Exception *); + ptrdiff_t __libunwind_Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr + context); + ptrdiff_t __libunwind_Unwind_GetIP(_Unwind_Context_Ptr context); + ptrdiff_t __libunwind_Unwind_SetIP(_Unwind_Context_Ptr context, + ptrdiff_t new_value); + ptrdiff_t __libunwind_Unwind_SetGR(_Unwind_Context_Ptr context, int index, + ptrdiff_t new_value); + ptrdiff_t __libunwind_Unwind_GetRegionStart(_Unwind_Context_Ptr context); + + alias __libunwind_Unwind_Resume _Unwind_Resume; + alias __libunwind_Unwind_RaiseException _Unwind_RaiseException; + alias __libunwind_Unwind_GetLanguageSpecificData + _Unwind_GetLanguageSpecificData; + alias __libunwind_Unwind_GetIP _Unwind_GetIP; + alias __libunwind_Unwind_SetIP _Unwind_SetIP; + alias __libunwind_Unwind_SetGR _Unwind_SetGR; + alias __libunwind_Unwind_GetRegionStart _Unwind_GetRegionStart; +} +else version(X86_UNWIND) +{ + void _Unwind_Resume(_Unwind_Exception*); + _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*); + ptrdiff_t 
_Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr context); + ptrdiff_t _Unwind_GetIP(_Unwind_Context_Ptr context); + ptrdiff_t _Unwind_SetIP(_Unwind_Context_Ptr context, ptrdiff_t new_value); + ptrdiff_t _Unwind_SetGR(_Unwind_Context_Ptr context, int index, + ptrdiff_t new_value); + ptrdiff_t _Unwind_GetRegionStart(_Unwind_Context_Ptr context); +} +else +{ + // runtime calls these directly + void _Unwind_Resume(_Unwind_Exception*) + { + console("_Unwind_Resume is not implemented on this platform.\n"); + } + _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*) + { + console("_Unwind_RaiseException is not implemented on this platform.\n"); + return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; + } +} + +} + +// error and exit +extern(C) private void fatalerror(in char* format, ...) +{ + va_list args; + va_start(args, format); + printf("Fatal error in EH code: "); + vprintf(format, args); + printf("\n"); + abort(); +} + + +// helpers for reading certain DWARF data +private ubyte* get_uleb128(ubyte* addr, ref size_t res) +{ + res = 0; + size_t bitsize = 0; + + // read as long as high bit is set + while(*addr & 0x80) { + res |= (*addr & 0x7f) << bitsize; + bitsize += 7; + addr += 1; + if(bitsize >= size_t.sizeof*8) + fatalerror("tried to read uleb128 that exceeded size of size_t"); + } + // read last + if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize) + fatalerror("Fatal error in EH code: tried to read uleb128 that exceeded size of size_t"); + res |= (*addr) << bitsize; + + return addr + 1; +} + +private ubyte* get_sleb128(ubyte* addr, ref ptrdiff_t res) +{ + res = 0; + size_t bitsize = 0; + + // read as long as high bit is set + while(*addr & 0x80) { + res |= (*addr & 0x7f) << bitsize; + bitsize += 7; + addr += 1; + if(bitsize >= size_t.sizeof*8) + fatalerror("tried to read sleb128 that exceeded size of size_t"); + } + // read last + if(bitsize != 0 && *addr >= 1 << size_t.sizeof*8 - bitsize) + fatalerror("tried to read sleb128 that exceeded size of size_t"); + res |= (*addr) << bitsize; + + // take care of sign + if(bitsize < size_t.sizeof*8 && ((*addr) & 0x40)) + res |= cast(ptrdiff_t)(-1) ^ ((1 << (bitsize+7)) - 1); + + return addr + 1; +} + + +// exception struct used by the runtime. +// _d_throw allocates a new instance and passes the address of its +// _Unwind_Exception member to the unwind call. The personality +// routine is then able to get the whole struct by looking at the data +// surrounding the unwind info. +struct _d_exception +{ + Object exception_object; + _Unwind_Exception unwind_info; +} + +// the 8-byte string identifying the type of exception +// the first 4 are for vendor, the second 4 for language +//TODO: This may be the wrong way around +char[8] _d_exception_class = "LLDCD1\0\0"; + + +// +// x86 unwind specific implementation of personality function +// and helpers +// +version(X86_UNWIND) +{ + +// the personality routine gets called by the unwind handler and is responsible for +// reading the EH tables and deciding what to do +extern(C) _Unwind_Reason_Code _d_eh_personality(int ver, _Unwind_Action actions, ulong exception_class, _Unwind_Exception* exception_info, _Unwind_Context_Ptr context) +{ + debug(EH_personality_verbose) printf("entering personality function. 
context: %p\n", context); + // check ver: the C++ Itanium ABI only allows ver == 1 + if(ver != 1) + return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; + + // check exceptionClass + //TODO: Treat foreign exceptions with more respect + if((cast(char*)&exception_class)[0..8] != _d_exception_class) + return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; + + // find call site table, action table and classinfo table + // Note: callsite and action tables do not contain static-length + // data and will be parsed as needed + // Note: classinfo_table points past the end of the table + ubyte* callsite_table; + ubyte* action_table; + ClassInfo* classinfo_table; + _d_getLanguageSpecificTables(context, callsite_table, action_table, classinfo_table); + if (callsite_table is null) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + + /* + find landing pad and action table index belonging to ip by walking + the callsite_table + */ + ubyte* callsite_walker = callsite_table; + + // get the instruction pointer + // will be used to find the right entry in the callsite_table + // -1 because it will point past the last instruction + ptrdiff_t ip = _Unwind_GetIP(context) - 1; + + // address block_start is relative to + ptrdiff_t region_start = _Unwind_GetRegionStart(context); + + // table entries + uint block_start_offset, block_size; + ptrdiff_t landing_pad; + size_t action_offset; + + while(true) { + // if we've gone through the list and found nothing... + if(callsite_walker >= action_table) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + + block_start_offset = *cast(uint*)callsite_walker; + block_size = *(cast(uint*)callsite_walker + 1); + landing_pad = *(cast(uint*)callsite_walker + 2); + if(landing_pad) + landing_pad += region_start; + callsite_walker = get_uleb128(callsite_walker + 3*uint.sizeof, action_offset); + + debug(EH_personality_verbose) printf("ip=%llx %d %d %llx\n", ip, block_start_offset, block_size, landing_pad); + + // since the list is sorted, as soon as we're past the ip + // there's no handler to be found + if(ip < region_start + block_start_offset) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + + // if we've found our block, exit + if(ip < region_start + block_start_offset + block_size) + break; + } + + debug(EH_personality) printf("Found correct landing pad and actionOffset %d\n", action_offset); + + // now we need the exception's classinfo to find a handler + // the exception_info is actually a member of a larger _d_exception struct + // the runtime allocated. 
get that now + _d_exception* exception_struct = cast(_d_exception*)(cast(ubyte*)exception_info - _d_exception.unwind_info.offsetof); + + // if there's no action offset and no landing pad, continue unwinding + if(!action_offset && !landing_pad) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + + // if there's no action offset but a landing pad, this is a cleanup handler + else if(!action_offset && landing_pad) + return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context); + + /* + walk action table chain, comparing classinfos using _d_isbaseof + */ + ubyte* action_walker = action_table + action_offset - 1; + + ptrdiff_t ti_offset, next_action_offset; + while(true) { + action_walker = get_sleb128(action_walker, ti_offset); + // it is intentional that we not modify action_walker here + // next_action_offset is from current action_walker position + get_sleb128(action_walker, next_action_offset); + + // negative are 'filters' which we don't use + if(!(ti_offset >= 0)) + fatalerror("Filter actions are unsupported"); + + // zero means cleanup, which we require to be the last action + if(ti_offset == 0) { + if(!(next_action_offset == 0)) + fatalerror("Cleanup action must be last in chain"); + return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context); + } + + // get classinfo for action and check if the one in the + // exception structure is a base + ClassInfo catch_ci = *(classinfo_table - ti_offset); + debug(EH_personality) printf("Comparing catch %s to exception %s\n", catch_ci.name.ptr, exception_struct.exception_object.classinfo.name.ptr); + if(_d_isbaseof(exception_struct.exception_object.classinfo, catch_ci)) + return _d_eh_install_catch_context(actions, ti_offset, landing_pad, exception_struct, context); + + // we've walked through all actions and found nothing... + if(next_action_offset == 0) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + else + action_walker += next_action_offset; + } + + fatalerror("reached unreachable"); + return _Unwind_Reason_Code.FATAL_PHASE1_ERROR; +} + +// These are the register numbers for SetGR that +// llvm's eh.exception and eh.selector intrinsics +// will pick up. +// Hints for these can be found by looking at the +// EH_RETURN_DATA_REGNO macro in GCC, careful testing +// is required though. 
+version (X86_64) +{ + private int eh_exception_regno = 0; + private int eh_selector_regno = 1; +} else { + private int eh_exception_regno = 0; + private int eh_selector_regno = 2; +} + +private _Unwind_Reason_Code _d_eh_install_catch_context(_Unwind_Action actions, ptrdiff_t switchval, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context) +{ + debug(EH_personality) printf("Found catch clause!\n"); + + if(actions & _Unwind_Action.SEARCH_PHASE) + return _Unwind_Reason_Code.HANDLER_FOUND; + + else if(actions & _Unwind_Action.CLEANUP_PHASE) + { + debug(EH_personality) printf("Setting switch value to: %d!\n", switchval); + _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)cast(void*)(exception_struct.exception_object)); + _Unwind_SetGR(context, eh_selector_regno, cast(ptrdiff_t)switchval); + _Unwind_SetIP(context, landing_pad); + return _Unwind_Reason_Code.INSTALL_CONTEXT; + } + + fatalerror("reached unreachable"); + return _Unwind_Reason_Code.FATAL_PHASE2_ERROR; +} + +private _Unwind_Reason_Code _d_eh_install_finally_context(_Unwind_Action actions, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context) +{ + // if we're merely in search phase, continue + if(actions & _Unwind_Action.SEARCH_PHASE) + return _Unwind_Reason_Code.CONTINUE_UNWIND; + + debug(EH_personality) printf("Calling cleanup routine...\n"); + + _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)exception_struct); + _Unwind_SetGR(context, eh_selector_regno, 0); + _Unwind_SetIP(context, landing_pad); + return _Unwind_Reason_Code.INSTALL_CONTEXT; +} + +private void _d_getLanguageSpecificTables(_Unwind_Context_Ptr context, ref ubyte* callsite, ref ubyte* action, ref ClassInfo* ci) +{ + ubyte* data = cast(ubyte*)_Unwind_GetLanguageSpecificData(context); + if (data is null) + { + //printf("language specific data was null\n"); + callsite = null; + action = null; + ci = null; + return; + } + + //TODO: Do proper DWARF reading here + if(*data++ != 0xff) + fatalerror("DWARF header has unexpected format 1"); + + if(*data++ != 0x00) + fatalerror("DWARF header has unexpected format 2"); + size_t cioffset; + data = get_uleb128(data, cioffset); + ci = cast(ClassInfo*)(data + cioffset); + + if(*data++ != 0x03) + fatalerror("DWARF header has unexpected format 3"); + size_t callsitelength; + data = get_uleb128(data, callsitelength); + action = data + callsitelength; + + callsite = data; +} + +} // end of x86 Linux specific implementation + + +extern(C) void _d_throw_exception(Object e) +{ + if (e !is null) + { + _d_exception* exc_struct = new _d_exception; + exc_struct.unwind_info.exception_class = *cast(ulong*)_d_exception_class.ptr; + exc_struct.exception_object = e; + _Unwind_Reason_Code ret = _Unwind_RaiseException(&exc_struct.unwind_info); + console("_Unwind_RaiseException failed with reason code: ")(ret)("\n"); + } + abort(); +} + +extern(C) void _d_eh_resume_unwind(_d_exception* exception_struct) +{ + _Unwind_Resume(&exception_struct.unwind_info); +} diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/lifetime.d druntime/src/rt/lifetime.d --- druntime-old/src/rt/lifetime.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/lifetime.d 2010-10-08 14:55:56.581547002 +0400 @@ -81,6 +81,28 @@ MAXSMALLSIZE = 256-SMALLPAD, MAXMEDSIZE = (PAGESIZE / 2) - MEDPAD } 
+ + version( LDC ) + { + size_t length_adjust(size_t sizeelem, size_t newlength) + { + size_t newsize = void; + static if (size_t.sizeof < ulong.sizeof) + { + ulong s = cast(ulong)sizeelem * cast(ulong)newlength; + if (s > size_t.max) + onOutOfMemoryError(); + newsize = cast(size_t)s; + } + else + { + newsize = sizeelem * newlength; + if (newsize / newlength != sizeelem) + onOutOfMemoryError(); + } + return newsize; + } + } } @@ -92,6 +114,13 @@ return gc_malloc(sz); } +/** + * for allocating a single POD value + */ +extern (C) void* _d_allocmemoryT(TypeInfo ti) +{ + return gc_malloc(ti.tsize(), !(ti.flags() & 1) ? BlkAttr.NO_SCAN : 0); +} /** * @@ -670,7 +699,7 @@ * ti is the type of the resulting array, or pointer to element. * (For when the array is initialized to 0) */ -extern (C) ulong _d_newarrayT(TypeInfo ti, size_t length) +extern (C) void[] _d_newarrayT(TypeInfo ti, size_t length) { ulong result; auto size = ti.next.tsize(); // array element size @@ -702,7 +731,7 @@ __setArrayAllocLength(info, size, isshared); result = cast(ulong)length + (cast(ulong)cast(size_t)arrstart << 32); } - return result; + return *cast(void[]*)&result; Loverflow: onOutOfMemoryError(); @@ -711,7 +740,7 @@ /** * For when the array has a non-zero initializer. */ -extern (C) ulong _d_newarrayiT(TypeInfo ti, size_t length) +extern (C) void[] _d_newarrayiT(TypeInfo ti, size_t length) { ulong result; auto size = ti.next.tsize(); // array element size @@ -764,7 +793,7 @@ __setArrayAllocLength(info, size, isshared); result = cast(ulong)length + (cast(ulong)cast(uint)arrstart << 32); } - return result; + return *cast(void[]*)&result; Loverflow: onOutOfMemoryError(); @@ -773,7 +802,7 @@ /** * */ -extern (C) ulong _d_newarraymT(TypeInfo ti, int ndims, ...) +extern (C) void[] _d_newarraymT(TypeInfo ti, int ndims, ...) { ulong result; @@ -823,14 +852,14 @@ } va_end(q); } - return result; + return *cast(void[]*)&result; } /** * */ -extern (C) ulong _d_newarraymiT(TypeInfo ti, int ndims, ...) +extern (C) void[] _d_newarraymiT(TypeInfo ti, int ndims, ...) { ulong result; @@ -881,10 +910,9 @@ } va_end(q); } - return result; + return *cast(void[]*)&result; } - /** * */ @@ -1046,7 +1074,7 @@ /** * Resize dynamic arrays with 0 initializers. */ -extern (C) byte[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p) +extern (C) void[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p) in { assert(ti); @@ -1206,7 +1234,7 @@ * initsize size of initializer * ... initializer */ -extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) +extern (C) void[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p) in { assert(!p.length || p.data); @@ -1376,12 +1404,11 @@ onOutOfMemoryError(); } - /** * Append y[] to array pointed to by px * size is size of each array element. */ -extern (C) long _d_arrayappendT(TypeInfo ti, Array *px, byte[] y) +extern (C) void[] _d_arrayappendT(TypeInfo ti, Array *px, byte[] y) { // only optimize array append where ti is not a shared type auto sizeelem = ti.next.tsize(); // array element size @@ -1468,10 +1495,9 @@ L1: px.length = newlength; memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem); - return *cast(long*)px; + return *cast(void[]*)px; } - /** * */ @@ -1552,21 +1578,36 @@ return newcap; } +version (LDC) +{ + +/** + * Appends a single element to an array. 
+ */ +extern (C) void[] _d_arrayappendcT(TypeInfo ti, byte[] *x, byte *argp) +{ + return _d_arrayappendT(ti, cast(Array*)x, argp[0..1]); +} + +} +else +{ /** * */ -extern (C) long _d_arrayappendcT(TypeInfo ti, Array *x, ...) +extern (C) void[] _d_arrayappendcT(TypeInfo ti, Array *x, ...) { byte *argp = cast(byte*)(&ti + 2); return _d_arrayappendT(ti, x, argp[0..1]); } +} /** * Append dchar to char[] */ -extern (C) long _d_arrayappendcd(ref char[] x, dchar c) +extern (C) void[] _d_arrayappendcd(ref char[] x, dchar c) { // c could encode into from 1 to 4 characters char[4] buf = void; @@ -1612,7 +1653,7 @@ /** * Append dchar to wchar[] */ -extern (C) long _d_arrayappendwd(ref wchar[] x, dchar c) +extern (C) void[] _d_arrayappendwd(ref wchar[] x, dchar c) { // c could encode into from 1 to 2 w characters wchar[2] buf = void; @@ -1641,7 +1682,6 @@ return _d_arrayappendT(typeid(shared wchar[]), cast(Array *)&x, appendthis); } - /** * */ @@ -1794,11 +1834,10 @@ void* ptr; } - /** * */ -extern (C) long _adDupT(TypeInfo ti, Array2 a) +extern (C) void[] _adDupT(TypeInfo ti, void[] a) out (result) { auto sizeelem = ti.next.tsize(); // array element size @@ -1819,7 +1858,7 @@ r.length = a.length; memcpy(r.ptr, a.ptr, size); } - return *cast(long*)(&r); + return *cast(void[]*)(&r); } diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort.d druntime/src/rt/qsort.d --- druntime-old/src/rt/qsort.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/qsort.d 2010-10-07 13:59:06.815253002 +0400 @@ -44,7 +44,7 @@ structures. The default value is optimized for a high cost for compares. 
*/ -extern (C) long _adSort(Array a, TypeInfo ti) +extern (C) void[] _adSort(void[] a, TypeInfo ti) { byte* base; byte*[40] stack; // stack @@ -124,7 +124,7 @@ limit = sp[1]; } else // else stack empty, all done - return *cast(long*)(&a); + return a; } assert(0); } diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort2.d druntime/src/rt/qsort2.d --- druntime-old/src/rt/qsort2.d 2010-08-05 05:39:06.000000000 +0400 +++ druntime/src/rt/qsort2.d 2010-10-07 14:01:41.359253001 +0400 @@ -31,14 +31,14 @@ return tiglobal.compare(p1, p2); } -extern (C) long _adSort(Array a, TypeInfo ti) +extern (C) void[] _adSort(void[] a, TypeInfo ti) { synchronized { tiglobal = ti; qsort(a.ptr, a.length, cast(size_t)ti.tsize(), &cmp); } - return *cast(long*)(&a); + return a; } diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/trace.d druntime/src/rt/trace.d --- druntime-old/src/rt/trace.d 2010-08-07 09:46:06.000000000 +0400 +++ druntime/src/rt/trace.d 2010-10-01 21:01:58.444892002 +0400 @@ -855,7 +855,7 @@ version (OSX) { // 16 byte align stack asm - { naked ; + { pushad ; sub ESP,12 ; } @@ -870,7 +870,7 @@ else { asm - { naked ; + { pushad ; } trace_epi(); diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/std/intrinsic.d druntime/src/std/intrinsic.d --- druntime-old/src/std/intrinsic.d 1970-01-01 03:00:00.000000000 +0300 +++ druntime/src/std/intrinsic.d 2010-10-03 20:07:21.183624002 +0400 @@ -0,0 +1,212 @@ +/* + * D phobos intrinsics for LDC + * + * From GDC ... public domain! + */ +module std.intrinsic; + +// Check for the right compiler +version(LDC) +{ + // OK +} +else +{ + static assert(false, "This module is only valid for LDC"); +} + +/** + * Scans the bits in v starting with bit 0, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + */ +nothrow int bsf(uint v) +{ + uint m = 1; + uint i; + for (i = 0; i < 32; i++,m<<=1) { + if (v&m) + return i; + } + return i; // supposed to be undefined +} + +/** + * Scans the bits in v from the most significant bit + * to the least significant bit, looking + * for the first set bit. + * Returns: + * The bit number of the first bit set. + * The return value is undefined if v is zero. + * Example: + * --- + * import std.intrinsic; + * + * int main() + * { + * uint v; + * int x; + * + * v = 0x21; + * x = bsf(v); + * printf("bsf(x%x) = %d\n", v, x); + * x = bsr(v); + * printf("bsr(x%x) = %d\n", v, x); + * return 0; + * } + * --- + * Output: + * bsf(x21) = 0
+ * bsr(x21) = 5 + */ +nothrow int bsr(uint v) +{ + uint m = 0x80000000; + uint i; + for (i = 32; i ; i--,m>>>=1) { + if (v&m) + return i-1; + } + return i; // supposed to be undefined +} + + +/** + * Tests the bit. + */ +nothrow int bt(uint *p, uint bitnum) +{ + return (p[bitnum / (uint.sizeof*8)] & (1<<(bitnum & ((uint.sizeof*8)-1)))) ? -1 : 0 ; +} + + +/** + * Tests and complements the bit. + */ +nothrow int btc(uint *p, uint bitnum) +{ + uint * q = p + (bitnum / (uint.sizeof*8)); + uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); + int result = *q & mask; + *q ^= mask; + return result ? -1 : 0; +} + + +/** + * Tests and resets (sets to 0) the bit. + */ +nothrow int btr(uint *p, uint bitnum) +{ + uint * q = p + (bitnum / (uint.sizeof*8)); + uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); + int result = *q & mask; + *q &= ~mask; + return result ? -1 : 0; +} + + +/** + * Tests and sets the bit. + * Params: + * p = a non-NULL pointer to an array of uints. + * index = a bit number, starting with bit 0 of p[0], + * and progressing. It addresses bits like the expression: +--- +p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1))) +--- + * Returns: + * A non-zero value if the bit was set, and a zero + * if it was clear. + * + * Example: + * --- +import std.intrinsic; + +int main() +{ + uint array[2]; + + array[0] = 2; + array[1] = 0x100; + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btc(array, 35) = %d\n", btc(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bts(array, 35) = %d\n", bts(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("btr(array, 35) = %d\n", btr(array, 35)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + printf("bt(array, 1) = %d\n", bt(array, 1)); + printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]); + + return 0; +} + * --- + * Output: +
+btc(array, 35) = 0
+array = [0]:x2, [1]:x108
+btc(array, 35) = -1
+array = [0]:x2, [1]:x100
+bts(array, 35) = 0
+array = [0]:x2, [1]:x108
+btr(array, 35) = -1
+array = [0]:x2, [1]:x100
+bt(array, 1) = -1
+array = [0]:x2, [1]:x100
+
+ */ +nothrow int bts(uint *p, uint bitnum) +{ + uint * q = p + (bitnum / (uint.sizeof*8)); + uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); + int result = *q & mask; + *q |= mask; + return result ? -1 : 0; +} + +/** + * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes + * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3 + * becomes byte 0. + */ +pragma(intrinsic, "llvm.bswap.i32") + uint bswap(uint val); + +/** + * Reads I/O port at port_address. + */ +ubyte inp(uint p) { throw new Exception("inp intrinsic not yet implemented"); } + +/** + * ditto + */ +ushort inpw(uint p) { throw new Exception("inpw intrinsic not yet implemented"); } + +/** + * ditto + */ +uint inpl(uint p) { throw new Exception("inpl intrinsic not yet implemented"); } + +/** + * ditto + */ +ubyte outp(uint p, ubyte v) { throw new Exception("outp intrinsic not yet implemented"); } + +/** + * ditto + */ +ushort outpw(uint p, ushort v) { throw new Exception("outpw intrinsic not yet implemented"); } + +/** + * ditto + */ +uint outpl(uint p, uint v) { throw new Exception("outpl intrinsic not yet implemented"); }
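The std.intrinsic fallbacks above are plain D loops, so their documented behaviour can be verified with an ordinary test. Below is a minimal illustrative sketch (not part of the patch; the test values simply mirror the Ddoc examples, and an all-zero argument to bsf/bsr is left alone because its result is documented as undefined):

---
// Illustrative check of the portable bsf/bsr/bt/bts/btr fallbacks.
// Not part of the patch; it only exercises behaviour documented above.
import std.intrinsic;

void main()
{
    // bsf/bsr on 0x21 == 10_0001b, matching the Ddoc example output
    assert(bsf(0x21) == 0);
    assert(bsr(0x21) == 5);

    // bit 35 lives in bit 3 of the second uint
    uint[2] a;
    a[0] = 2;
    a[1] = 0x100;

    assert(bts(a.ptr, 35) == 0);    // was clear, now set
    assert(a[1] == 0x108);
    assert(btr(a.ptr, 35) == -1);   // was set, now cleared
    assert(a[1] == 0x100);
    assert(bt(a.ptr, 1) == -1);     // bit 1 of a[0] is set
}
---

On LDC one would eventually expect bsf/bsr to map to the llvm.cttz/llvm.ctlz intrinsics in the same way bswap maps to llvm.bswap.i32, but this patch deliberately keeps the portable loops.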