mirror of
https://github.com/xomboverlord/ldc.git
synced 2026-01-12 02:43:14 +01:00
13914 lines
396 KiB
Diff
13914 lines
396 KiB
Diff
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/cstdarg.di druntime/import/ldc/cstdarg.di
|
|
--- druntime-old/import/ldc/cstdarg.di 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/import/ldc/cstdarg.di 2010-09-30 22:10:37.000000000 +0400
|
|
@@ -0,0 +1,29 @@
|
|
+/*
|
|
+ * vararg support for extern(C) functions
|
|
+ */
|
|
+
|
|
+module ldc.cstdarg;
|
|
+
|
|
+// Check for the right compiler
|
|
+version(LDC)
|
|
+{
|
|
+ // OK
|
|
+}
|
|
+else
|
|
+{
|
|
+ static assert(false, "This module is only valid for LDC");
|
|
+}
|
|
+
|
|
+alias void* va_list;
|
|
+
|
|
+pragma(va_start)
|
|
+ void va_start(T)(va_list ap, ref T);
|
|
+
|
|
+pragma(va_arg)
|
|
+ T va_arg(T)(va_list ap);
|
|
+
|
|
+pragma(va_end)
|
|
+ void va_end(va_list args);
|
|
+
|
|
+pragma(va_copy)
|
|
+ void va_copy(va_list dst, va_list src);
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/intrinsics.di druntime/import/ldc/intrinsics.di
|
|
--- druntime-old/import/ldc/intrinsics.di 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/import/ldc/intrinsics.di 2010-10-02 14:01:02.975890001 +0400
|
|
@@ -0,0 +1,413 @@
|
|
+/*
|
|
+ * This module holds declarations to LLVM intrinsics.
|
|
+ *
|
|
+ * See the LLVM language reference for more information:
|
|
+ *
|
|
+ * - http://llvm.org/docs/LangRef.html#intrinsics
|
|
+ *
|
|
+ */
|
|
+
|
|
+module ldc.intrinsics;
|
|
+
|
|
+// Check for the right compiler
|
|
+version(LDC)
|
|
+{
|
|
+ // OK
|
|
+}
|
|
+else
|
|
+{
|
|
+ static assert(false, "This module is only valid for LDC");
|
|
+}
|
|
+
|
|
+//
|
|
+// CODE GENERATOR INTRINSICS
|
|
+//
|
|
+
|
|
+
|
|
+// The 'llvm.returnaddress' intrinsic attempts to compute a target-specific
|
|
+// value indicating the return address of the current function or one of its
|
|
+// callers.
|
|
+
|
|
+pragma(intrinsic, "llvm.returnaddress")
|
|
+ void* llvm_returnaddress(uint level);
|
|
+
|
|
+
|
|
+// The 'llvm.frameaddress' intrinsic attempts to return the target-specific
|
|
+// frame pointer value for the specified stack frame.
|
|
+
|
|
+pragma(intrinsic, "llvm.frameaddress")
|
|
+ void* llvm_frameaddress(uint level);
|
|
+
|
|
+
|
|
+// The 'llvm.stacksave' intrinsic is used to remember the current state of the
|
|
+// function stack, for use with llvm.stackrestore. This is useful for
|
|
+// implementing language features like scoped automatic variable sized arrays
|
|
+// in C99.
|
|
+
|
|
+pragma(intrinsic, "llvm.stacksave")
|
|
+ void* llvm_stacksave();
|
|
+
|
|
+
|
|
+// The 'llvm.stackrestore' intrinsic is used to restore the state of the
|
|
+// function stack to the state it was in when the corresponding llvm.stacksave
|
|
+// intrinsic executed. This is useful for implementing language features like
|
|
+// scoped automatic variable sized arrays in C99.
|
|
+
|
|
+pragma(intrinsic, "llvm.stackrestore")
|
|
+ void llvm_stackrestore(void* ptr);
|
|
+
|
|
+
|
|
+// The 'llvm.prefetch' intrinsic is a hint to the code generator to insert a
|
|
+// prefetch instruction if supported; otherwise, it is a noop. Prefetches have
|
|
+// no effect on the behavior of the program but can change its performance
|
|
+// characteristics.
|
|
+
|
|
+pragma(intrinsic, "llvm.prefetch")
|
|
+ void llvm_prefetch(void* ptr, uint rw, uint locality);
|
|
+
|
|
+
|
|
+// The 'llvm.pcmarker' intrinsic is a method to export a Program Counter (PC)
|
|
+// in a region of code to simulators and other tools. The method is target
|
|
+// specific, but it is expected that the marker will use exported symbols to
|
|
+// transmit the PC of the marker. The marker makes no guarantees that it will
|
|
+// remain with any specific instruction after optimizations. It is possible
|
|
+// that the presence of a marker will inhibit optimizations. The intended use
|
|
+// is to be inserted after optimizations to allow correlations of simulation
|
|
+// runs.
|
|
+
|
|
+pragma(intrinsic, "llvm.pcmarker")
|
|
+ void llvm_pcmarker(uint id);
|
|
+
|
|
+
|
|
+// The 'llvm.readcyclecounter' intrinsic provides access to the cycle counter
|
|
+// register (or similar low latency, high accuracy clocks) on those targets that
|
|
+// support it. On X86, it should map to RDTSC. On Alpha, it should map to RPCC.
|
|
+// As the backing counters overflow quickly (on the order of 9 seconds on
|
|
+// alpha), this should only be used for small timings.
|
|
+
|
|
+pragma(intrinsic, "llvm.readcyclecounter")
|
|
+ ulong readcyclecounter();
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+//
|
|
+// STANDARD C LIBRARY INTRINSICS
|
|
+//
|
|
+
|
|
+
|
|
+// The 'llvm.memcpy.*' intrinsics copy a block of memory from the source
|
|
+// location to the destination location.
|
|
+// Note that, unlike the standard libc function, the llvm.memcpy.* intrinsics do
|
|
+// not return a value, and take an extra alignment argument.
|
|
+
|
|
+pragma(intrinsic, "llvm.memcpy.i#")
|
|
+ void llvm_memcpy(T)(void* dst, void* src, T len, uint alignment);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_memcpy!(uint) llvm_memcpy_i32;
|
|
+ alias llvm_memcpy!(ulong) llvm_memcpy_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.memmove.*' intrinsics move a block of memory from the source
|
|
+// location to the destination location. It is similar to the 'llvm.memcpy'
|
|
+// intrinsic but allows the two memory locations to overlap.
|
|
+// Note that, unlike the standard libc function, the llvm.memmove.* intrinsics
|
|
+// do not return a value, and take an extra alignment argument.
|
|
+
|
|
+pragma(intrinsic, "llvm.memmove.i#")
|
|
+ void llvm_memmove(T)(void* dst, void* src, T len, uint alignment);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_memmove!(uint) llvm_memmove_i32;
|
|
+ alias llvm_memmove!(ulong) llvm_memmove_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.memset.*' intrinsics fill a block of memory with a particular byte
|
|
+// value.
|
|
+// Note that, unlike the standard libc function, the llvm.memset intrinsic does
|
|
+// not return a value, and takes an extra alignment argument.
|
|
+
|
|
+pragma(intrinsic, "llvm.memset.i#")
|
|
+ void llvm_memset(T)(void* dst, ubyte val, T len, uint alignment);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_memset!(uint) llvm_memset_i32;
|
|
+ alias llvm_memset!(ulong) llvm_memset_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.sqrt' intrinsics return the sqrt of the specified operand,
|
|
+// returning the same value as the libm 'sqrt' functions would. Unlike sqrt in
|
|
+// libm, however, llvm.sqrt has undefined behavior for negative numbers other
|
|
+// than -0.0 (which allows for better optimization, because there is no need to
|
|
+// worry about errno being set). llvm.sqrt(-0.0) is defined to return -0.0 like
|
|
+// IEEE sqrt.
|
|
+
|
|
+pragma(intrinsic, "llvm.sqrt.f#")
|
|
+ T llvm_sqrt(T)(T val);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_sqrt!(float) llvm_sqrt_f32;
|
|
+ alias llvm_sqrt!(double) llvm_sqrt_f64;
|
|
+ alias llvm_sqrt!(real) llvm_sqrt_f80; // may not actually be .f80
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.sin.*' intrinsics return the sine of the operand.
|
|
+
|
|
+pragma(intrinsic, "llvm.sin.f#")
|
|
+ T llvm_sin(T)(T val);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_sin!(float) llvm_sin_f32;
|
|
+ alias llvm_sin!(double) llvm_sin_f64;
|
|
+ alias llvm_sin!(real) llvm_sin_f80; // may not actually be .f80
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.cos.*' intrinsics return the cosine of the operand.
|
|
+
|
|
+pragma(intrinsic, "llvm.cos.f#")
|
|
+ T llvm_cos(T)(T val);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_cos!(float) llvm_cos_f32;
|
|
+ alias llvm_cos!(double) llvm_cos_f64;
|
|
+ alias llvm_cos!(real) llvm_cos_f80; // may not actually be .f80
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.powi.*' intrinsics return the first operand raised to the specified
|
|
+// (positive or negative) power. The order of evaluation of multiplications is
|
|
+// not defined. When a vector of floating point type is used, the second
|
|
+// argument remains a scalar integer value.
|
|
+
|
|
+pragma(intrinsic, "llvm.powi.f#")
|
|
+ T llvm_powi(T)(T val, int power);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_powi!(float) llvm_powi_f32;
|
|
+ alias llvm_powi!(double) llvm_powi_f64;
|
|
+ alias llvm_powi!(real) llvm_powi_f80; // may not actually be .f80
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.pow.*' intrinsics return the first operand raised to the specified
|
|
+// (positive or negative) power.
|
|
+
|
|
+pragma(intrinsic, "llvm.pow.f#")
|
|
+ T llvm_pow(T)(T val, T power);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_pow!(float) llvm_pow_f32;
|
|
+ alias llvm_pow!(double) llvm_pow_f64;
|
|
+ alias llvm_pow!(real) llvm_pow_f80; // may not actually be .f80
|
|
+}
|
|
+
|
|
+
|
|
+//
|
|
+// BIT MANIPULATION INTRINSICS
|
|
+//
|
|
+
|
|
+// The 'llvm.bswap' family of intrinsics is used to byte swap integer values
|
|
+// with an even number of bytes (positive multiple of 16 bits). These are
|
|
+// useful for performing operations on data that is not in the target's native
|
|
+// byte order.
|
|
+
|
|
+pragma(intrinsic, "llvm.bswap.i#.i#")
|
|
+ T llvm_bswap(T)(T val);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_bswap!(ushort) llvm_bswap_i16;
|
|
+ alias llvm_bswap!(uint) llvm_bswap_i32;
|
|
+ alias llvm_bswap!(ulong) llvm_bswap_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.ctpop' family of intrinsics counts the number of bits set in a
|
|
+// value.
|
|
+
|
|
+pragma(intrinsic, "llvm.ctpop.i#")
|
|
+ T llvm_ctpop(T)(T src);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_ctpop!(ubyte) llvm_ctpop_i8;
|
|
+ alias llvm_ctpop!(ushort) llvm_ctpop_i16;
|
|
+ alias llvm_ctpop!(uint) llvm_ctpop_i32;
|
|
+ alias llvm_ctpop!(ulong) llvm_ctpop_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.ctlz' family of intrinsic functions counts the number of leading
|
|
+// zeros in a variable.
|
|
+
|
|
+pragma(intrinsic, "llvm.ctlz.i#")
|
|
+ T llvm_ctlz(T)(T src);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_ctlz!(ubyte) llvm_ctlz_i8;
|
|
+ alias llvm_ctlz!(ushort) llvm_ctlz_i16;
|
|
+ alias llvm_ctlz!(uint) llvm_ctlz_i32;
|
|
+ alias llvm_ctlz!(ulong) llvm_ctlz_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.cttz' family of intrinsic functions counts the number of trailing
|
|
+// zeros.
|
|
+
|
|
+pragma(intrinsic, "llvm.cttz.i#")
|
|
+ T llvm_cttz(T)(T src);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_cttz!(ubyte) llvm_cttz_i8;
|
|
+ alias llvm_cttz!(ushort) llvm_cttz_i16;
|
|
+ alias llvm_cttz!(uint) llvm_cttz_i32;
|
|
+ alias llvm_cttz!(ulong) llvm_cttz_i64;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.part.select' family of intrinsic functions selects a range of bits
|
|
+// from an integer value and returns them in the same bit width as the original
|
|
+// value.
|
|
+
|
|
+pragma(intrinsic, "llvm.part.select.i#")
|
|
+ T llvm_part_select(T)(T val, uint loBit, uint hiBit);
|
|
+
|
|
+deprecated {
|
|
+ alias llvm_part_select!(ubyte) llvm_part_select_i;
|
|
+ alias llvm_part_select!(ushort) llvm_part_select_i;
|
|
+ alias llvm_part_select!(uint) llvm_part_select_i;
|
|
+ alias llvm_part_select!(ulong) llvm_part_select_i;
|
|
+}
|
|
+
|
|
+
|
|
+// The 'llvm.part.set' family of intrinsic functions replaces a range of bits
|
|
+// in an integer value with another integer value. It returns the integer with
|
|
+// the replaced bits.
|
|
+
|
|
+// TODO
|
|
+// declare i17 @llvm.part.set.i17.i9 (i17 %val, i9 %repl, i32 %lo, i32 %hi)
|
|
+// declare i29 @llvm.part.set.i29.i9 (i29 %val, i9 %repl, i32 %lo, i32 %hi)
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+//
|
|
+// ATOMIC OPERATIONS AND SYNCHRONIZATION INTRINSICS
|
|
+//
|
|
+
|
|
+// The llvm.memory.barrier intrinsic guarantees ordering between specific
|
|
+// pairs of memory access types.
|
|
+
|
|
+pragma(intrinsic, "llvm.memory.barrier")
|
|
+ void llvm_memory_barrier(bool ll, bool ls, bool sl, bool ss, bool device);
|
|
+
|
|
+// This loads a value in memory and compares it to a given value. If they are
|
|
+// equal, it stores a new value into the memory.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.cmp.swap.i#.p0i#")
|
|
+ T llvm_atomic_cmp_swap(T)(shared T* ptr, T cmp, T val);
|
|
+
|
|
+// This intrinsic loads the value stored in memory at ptr and yields the value
|
|
+// from memory. It then stores the value in val in the memory at ptr.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.swap.i#.p0i#")
|
|
+ T llvm_atomic_swap(T)(T* ptr, T val);
|
|
+
|
|
+// This intrinsic adds delta to the value stored in memory at ptr. It yields
|
|
+// the original value at ptr.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.add.i#.p0i#")
|
|
+ T llvm_atomic_load_add(T)(shared const T* ptr, T val);
|
|
+
|
|
+// This intrinsic subtracts delta from the value stored in memory at ptr. It
|
|
+// yields the original value at ptr.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.sub.i#.p0i#")
|
|
+ T llvm_atomic_load_sub(T)(T* ptr, T val);
|
|
+
|
|
+// These intrinsics apply the bitwise operation (and, nand, or, xor) with delta to the
|
|
+// value stored in memory at ptr. It yields the original value at ptr.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.and.i#.p0i#")
|
|
+ T llvm_atomic_load_and(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.nand.i#.p0i#")
|
|
+ T llvm_atomic_load_nand(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.or.i#.p0i#")
|
|
+ T llvm_atomic_load_or(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.xor.i#.p0i#")
|
|
+ T llvm_atomic_load_xor(T)(T* ptr, T val);
|
|
+
|
|
+// These intrinsics take the signed or unsigned minimum or maximum of delta
|
|
+// and the value stored in memory at ptr. It yields the original value at ptr.
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.max.i#.p0i#")
|
|
+ T llvm_atomic_load_max(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.min.i#.p0i#")
|
|
+ T llvm_atomic_load_min(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.umax.i#.p0i#")
|
|
+ T llvm_atomic_load_umax(T)(T* ptr, T val);
|
|
+
|
|
+pragma(intrinsic, "llvm.atomic.load.umin.i#.p0i#")
|
|
+ T llvm_atomic_load_umin(T)(T* ptr, T val);
|
|
+
|
|
+
|
|
+//
|
|
+// ARITHMETIC-WITH-OVERFLOW INTRINSICS
|
|
+//
|
|
+
|
|
+struct OverflowRet(T) {
|
|
+ static assert(is(T : int), T.stringof ~ " is not an integer type!");
|
|
+ T result;
|
|
+ bool overflow;
|
|
+}
|
|
+
|
|
+// Signed and unsigned addition
|
|
+pragma(intrinsic, "llvm.sadd.with.overflow.i#")
|
|
+ OverflowRet!(T) llvm_sadd_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+pragma(intrinsic, "llvm.uadd.with.overflow.i#")
|
|
+ OverflowRet!(T) llvm_uadd_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+
|
|
+// Signed and unsigned subtraction
|
|
+pragma(intrinsic, "llvm.ssub.with.overflow.i#")
|
|
+ OverflowRet!(T) llvm_ssub_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+pragma(intrinsic, "llvm.usub.with.overflow.i#")
|
|
+ OverflowRet!(T) llvm_usub_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+
|
|
+// Signed and unsigned multiplication
|
|
+pragma(intrinsic, "llvm.smul.with.overflow.i#")
|
|
+ OverflowRet!(T) llvm_smul_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+/* Note: LLVM documentation says:
|
|
+ * Warning: 'llvm.umul.with.overflow' is badly broken.
|
|
+ * It is actively being fixed, but it should not currently be used!
|
|
+ *
|
|
+ * See: http://llvm.org/docs/LangRef.html#int_umul_overflow
|
|
+ */
|
|
+//pragma(intrinsic, "llvm.umul.with.overflow.i#")
|
|
+// OverflowRet!(T) llvm_umul_with_overflow(T)(T lhs, T rhs);
|
|
+
|
|
+
|
|
+//
|
|
+// GENERAL INTRINSICS
|
|
+//
|
|
+
|
|
+
|
|
+// This intrinsic is lowered to the target dependent trap instruction. If the
|
|
+// target does not have a trap instruction, this intrinsic will be lowered to
|
|
+// the call of the abort() function.
|
|
+
|
|
+pragma(intrinsic, "llvm.trap")
|
|
+ void llvm_trap();
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/llvmasm.di druntime/import/ldc/llvmasm.di
|
|
--- druntime-old/import/ldc/llvmasm.di 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/import/ldc/llvmasm.di 2010-09-30 22:10:37.000000000 +0400
|
|
@@ -0,0 +1,17 @@
|
|
+module ldc.llvmasm;
|
|
+
|
|
+struct __asmtuple_t(T...)
|
|
+{
|
|
+ T v;
|
|
+}
|
|
+
|
|
+pragma(llvm_inline_asm)
|
|
+{
|
|
+ void __asm( )(char[] asmcode, char[] constraints, ...);
|
|
+ T __asm(T)(char[] asmcode, char[] constraints, ...);
|
|
+
|
|
+ template __asmtuple(T...)
|
|
+ {
|
|
+ __asmtuple_t!(T) __asmtuple(char[] asmcode, char[] constraints, ...);
|
|
+ }
|
|
+}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/ldc/vararg.d druntime/import/ldc/vararg.d
|
|
--- druntime-old/import/ldc/vararg.d 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/import/ldc/vararg.d 2010-09-30 22:10:37.000000000 +0400
|
|
@@ -0,0 +1,43 @@
|
|
+/*
|
|
+ * This module holds the implementation of special vararg templates for D style var args.
|
|
+ *
|
|
+ * Provides the functions tango.core.Vararg expects to be present!
|
|
+ */
|
|
+
|
|
+module ldc.Vararg;
|
|
+
|
|
+// Check for the right compiler
|
|
+version(LDC)
|
|
+{
|
|
+ // OK
|
|
+}
|
|
+else
|
|
+{
|
|
+ static assert(false, "This module is only valid for LDC");
|
|
+}
|
|
+
|
|
+alias void* va_list;
|
|
+
|
|
+void va_start(T) ( out va_list ap, inout T parmn )
|
|
+{
|
|
+ // not needed !
|
|
+}
|
|
+
|
|
+T va_arg(T)(ref va_list vp) // returns the value at vp read as a T, and advances vp past it
|
|
+{
|
|
+ T* arg = cast(T*) vp; // remember the current position before vp is moved
|
|
+ // ldc always aligns to size_t.sizeof in vararg lists
|
|
+ vp = cast(va_list) ( cast(void*) vp + ( ( T.sizeof + size_t.sizeof - 1 ) & ~( size_t.sizeof - 1 ) ) ); // step by T.sizeof rounded up to a multiple of size_t.sizeof
|
|
+ return *arg; // value read from the position vp had on entry
|
|
+}
|
|
+
|
|
+void va_end( va_list ap )
|
|
+{
|
|
+ // not needed !
|
|
+}
|
|
+
|
|
+void va_copy( out va_list dst, va_list src )
|
|
+{
|
|
+ // seems pretty useless !
|
|
+ dst = src;
|
|
+}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/object.di druntime/import/object.di
|
|
--- druntime-old/import/object.di 2010-09-03 12:28:52.000000000 +0400
|
|
+++ druntime/import/object.di 2010-10-05 12:47:24.873150000 +0400
|
|
@@ -130,7 +130,7 @@
|
|
Interface[] interfaces;
|
|
TypeInfo_Class base;
|
|
void* destructor;
|
|
- void(*classInvariant)(Object);
|
|
+ void function(Object) classInvariant;
|
|
uint m_flags;
|
|
// 1: // is IUnknown or is derived from IUnknown
|
|
// 2: // has no possible pointers into GC memory
|
|
@@ -140,7 +140,7 @@
|
|
// 32: // has typeinfo member
|
|
void* deallocator;
|
|
OffsetTypeInfo[] m_offTi;
|
|
- void* defaultConstructor;
|
|
+ void function(Object) defaultConstructor; // default Constructor
|
|
const(MemberInfo[]) function(string) xgetMembers;
|
|
|
|
static TypeInfo_Class find(in char[] classname);
|
|
@@ -179,7 +179,7 @@
|
|
|
|
class TypeInfo_Const : TypeInfo
|
|
{
|
|
- TypeInfo next;
|
|
+ TypeInfo base;
|
|
}
|
|
|
|
class TypeInfo_Invariant : TypeInfo_Const
|
|
@@ -288,7 +288,6 @@
|
|
interface TraceInfo
|
|
{
|
|
int opApply(scope int delegate(ref char[]));
|
|
- string toString();
|
|
}
|
|
|
|
string msg;
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/import/std/intrinsic.di druntime/import/std/intrinsic.di
|
|
--- druntime-old/import/std/intrinsic.di 2010-08-05 05:39:08.000000000 +0400
|
|
+++ druntime/import/std/intrinsic.di 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,176 +0,0 @@
|
|
-/**
|
|
- * These functions are built-in intrinsics to the compiler.
|
|
- *
|
|
- * Intrinsic functions are functions built in to the compiler, usually to take
|
|
- * advantage of specific CPU features that are inefficient to handle via
|
|
- * external functions. The compiler's optimizer and code generator are fully
|
|
- * integrated in with intrinsic functions, bringing to bear their full power on
|
|
- * them. This can result in some surprising speedups.
|
|
- *
|
|
- * Copyright: Public Domain
|
|
- * License: Public Domain
|
|
- * Authors: Walter Bright
|
|
- */
|
|
-module std.intrinsic;
|
|
-
|
|
-
|
|
-/**
|
|
- * Scans the bits in v starting with bit 0, looking
|
|
- * for the first set bit.
|
|
- * Returns:
|
|
- * The bit number of the first bit set.
|
|
- * The return value is undefined if v is zero.
|
|
- */
|
|
-pure nothrow int bsf( uint v );
|
|
-
|
|
-
|
|
-/**
|
|
- * Scans the bits in v from the most significant bit
|
|
- * to the least significant bit, looking
|
|
- * for the first set bit.
|
|
- * Returns:
|
|
- * The bit number of the first bit set.
|
|
- * The return value is undefined if v is zero.
|
|
- * Example:
|
|
- * ---
|
|
- * import std.intrinsic;
|
|
- *
|
|
- * int main()
|
|
- * {
|
|
- * uint v;
|
|
- * int x;
|
|
- *
|
|
- * v = 0x21;
|
|
- * x = bsf(v);
|
|
- * printf("bsf(x%x) = %d\n", v, x);
|
|
- * x = bsr(v);
|
|
- * printf("bsr(x%x) = %d\n", v, x);
|
|
- * return 0;
|
|
- * }
|
|
- * ---
|
|
- * Output:
|
|
- * bsf(x21) = 0<br>
|
|
- * bsr(x21) = 5
|
|
- */
|
|
-pure nothrow int bsr( uint v );
|
|
-
|
|
-
|
|
-/**
|
|
- * Tests the bit.
|
|
- */
|
|
-pure nothrow int bt( in uint* p, uint bitnum );
|
|
-
|
|
-
|
|
-/**
|
|
- * Tests and complements the bit.
|
|
- */
|
|
-nothrow int btc( uint* p, uint bitnum );
|
|
-
|
|
-
|
|
-/**
|
|
- * Tests and resets (sets to 0) the bit.
|
|
- */
|
|
-nothrow int btr( uint* p, uint bitnum );
|
|
-
|
|
-
|
|
-/**
|
|
- * Tests and sets the bit.
|
|
- * Params:
|
|
- * p = a non-NULL pointer to an array of uints.
|
|
- * index = a bit number, starting with bit 0 of p[0],
|
|
- * and progressing. It addresses bits like the expression:
|
|
----
|
|
-p[index / (uint.sizeof*8)] & (1 << (index & ((uint.sizeof*8) - 1)))
|
|
----
|
|
- * Returns:
|
|
- * A non-zero value if the bit was set, and a zero
|
|
- * if it was clear.
|
|
- *
|
|
- * Example:
|
|
- * ---
|
|
-import std.intrinsic;
|
|
-
|
|
-int main()
|
|
-{
|
|
- uint array[2];
|
|
-
|
|
- array[0] = 2;
|
|
- array[1] = 0x100;
|
|
-
|
|
- printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
|
|
- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
-
|
|
- printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
|
|
- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
-
|
|
- printf("bts(array, 35) = %d\n", <b>bts</b>(array, 35));
|
|
- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
-
|
|
- printf("btr(array, 35) = %d\n", <b>btr</b>(array, 35));
|
|
- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
-
|
|
- printf("bt(array, 1) = %d\n", <b>bt</b>(array, 1));
|
|
- printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
-
|
|
- return 0;
|
|
-}
|
|
- * ---
|
|
- * Output:
|
|
-<pre>
|
|
-btc(array, 35) = 0
|
|
-array = [0]:x2, [1]:x108
|
|
-btc(array, 35) = -1
|
|
-array = [0]:x2, [1]:x100
|
|
-bts(array, 35) = 0
|
|
-array = [0]:x2, [1]:x108
|
|
-btr(array, 35) = -1
|
|
-array = [0]:x2, [1]:x100
|
|
-bt(array, 1) = -1
|
|
-array = [0]:x2, [1]:x100
|
|
-</pre>
|
|
- */
|
|
-nothrow int bts( uint* p, uint bitnum );
|
|
-
|
|
-
|
|
-/**
|
|
- * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes
|
|
- * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3
|
|
- * becomes byte 0.
|
|
- */
|
|
-pure nothrow uint bswap( uint v );
|
|
-
|
|
-
|
|
-/**
|
|
- * Reads I/O port at port_address.
|
|
- */
|
|
-nothrow ubyte inp( uint port_address );
|
|
-
|
|
-
|
|
-/**
|
|
- * ditto
|
|
- */
|
|
-nothrow ushort inpw( uint port_address );
|
|
-
|
|
-
|
|
-/**
|
|
- * ditto
|
|
- */
|
|
-nothrow uint inpl( uint port_address );
|
|
-
|
|
-
|
|
-/**
|
|
- * Writes and returns value to I/O port at port_address.
|
|
- */
|
|
-nothrow ubyte outp( uint port_address, ubyte value );
|
|
-
|
|
-
|
|
-/**
|
|
- * ditto
|
|
- */
|
|
-nothrow ushort outpw( uint port_address, ushort value );
|
|
-
|
|
-
|
|
-/**
|
|
- * ditto
|
|
- */
|
|
-nothrow uint outpl( uint port_address, uint value );
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/core/atomic.d druntime/src/core/atomic.d
|
|
--- druntime-old/src/core/atomic.d 2010-09-03 12:28:52.000000000 +0400
|
|
+++ druntime/src/core/atomic.d 2010-10-05 15:55:10.893150001 +0400
|
|
@@ -89,6 +89,117 @@
|
|
return false;
|
|
}
|
|
}
|
|
+
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+// LDC Atomics Implementation
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+
|
|
+else version( LDC )
|
|
+{
|
|
+ import ldc.intrinsics;
|
|
+
|
|
+ T atomicOp(string op, T, V1)( ref shared T val, V1 mod )
|
|
+ if( is( NakedType!(V1) == NakedType!(T) ) )
|
|
+ {
|
|
+ // binary operators
|
|
+ //
|
|
+ // + - * / % ^^ &
|
|
+ // | ^ << >> >>> ~ in
|
|
+ // == != < <= > >=
|
|
+ static if( op == "+" || op == "-" || op == "*" || op == "/" ||
|
|
+ op == "%" || op == "^^" || op == "&" || op == "|" ||
|
|
+ op == "^" || op == "<<" || op == ">>" || op == ">>>" ||
|
|
+ op == "~" || // skip "in"
|
|
+ op == "==" || op == "!=" || op == "<" || op == "<=" ||
|
|
+ op == ">" || op == ">=" )
|
|
+ {
|
|
+ T get = val; // compiler can do atomic load
|
|
+ mixin( "return get " ~ op ~ " mod;" );
|
|
+ }
|
|
+ else
|
|
+ // assignment operators
|
|
+ //
|
|
+ // += -= *= /= %= ^^= &=
|
|
+ // |= ^= <<= >>= >>>= ~=
|
|
+ static if( op == "+=" || op == "-=" || op == "*=" || op == "/=" ||
|
|
+ op == "%=" || op == "^^=" || op == "&=" || op == "|=" ||
|
|
+ op == "^=" || op == "<<=" || op == ">>=" || op == ">>>=" ) // skip "~="
|
|
+ {
|
|
+ T get, set;
|
|
+
|
|
+ do
|
|
+ {
|
|
+ get = set = atomicLoad!(msync.raw)( val );
|
|
+ mixin( "set " ~ op ~ " mod;" );
|
|
+ } while( !cas( &val, get, set ) );
|
|
+ return set;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ static assert( false, "Operation not supported." );
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bool cas(T,V1,V2)( shared(T)* here, const V1 ifThis, const V2 writeThis )
|
|
+ if( is( NakedType!(V1) == NakedType!(T) ) &&
|
|
+ is( NakedType!(V2) == NakedType!(T) ) )
|
|
+ // Stores writeThis into *here only if *here equals ifThis; returns true when the value found at *here was ifThis.
|
|
+ {
|
|
+ T oldval = void; // every static-if branch below assigns it
|
|
+ static if (is(T P == U*, U)) // pointer: compare-and-swap the address as a size_t
|
|
+ {
|
|
+ oldval = cast(T)llvm_atomic_cmp_swap!(size_t)(cast(shared size_t*)here, cast(size_t)ifThis, cast(size_t)writeThis); // target is *here, not the local writeThis
|
|
+ }
|
|
+ else static if (is(T == bool)) // bool: compare-and-swap as a ubyte holding 0 or 1
|
|
+ {
|
|
+ oldval = llvm_atomic_cmp_swap!(ubyte)(cast(shared ubyte*)here, ifThis?1:0, writeThis?1:0) != 0; // map the previous byte back to bool without inverting it
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ oldval = llvm_atomic_cmp_swap!(T)(here, ifThis, writeThis);
|
|
+ }
|
|
+ return oldval == ifThis; // the swap took place exactly when the previous value matched
|
|
+ }
|
|
+
|
|
+
|
|
+ private
|
|
+ {
|
|
+ enum msync
|
|
+ {
|
|
+ raw, /// not sequenced
|
|
+ acq, /// hoist-load + hoist-store barrier
|
|
+ rel, /// sink-load + sink-store barrier
|
|
+ seq, /// fully sequenced (acq + rel)
|
|
+ }
|
|
+
|
|
+ T atomicLoad(msync ms = msync.seq, T)( const ref shared T val ) // returns the value of val, read after a barrier chosen by ms
|
|
+ {
|
|
+ llvm_memory_barrier( // parameters are declared as (ll, ls, sl, ss, device)
|
|
+ ms == msync.acq || ms == msync.seq, // ll: set for acq and seq
|
|
+ ms == msync.acq || ms == msync.seq, // ls: set for acq and seq
|
|
+ ms == msync.rel || ms == msync.seq, // sl: set for rel and seq
|
|
+ ms == msync.rel || ms == msync.seq, // ss: set for rel and seq
|
|
+ false); // device: never requested; with msync.raw all five flags are false
|
|
+ static if (is(T P == U*, U)) // pointer
|
|
+ {
|
|
+ return cast(T)llvm_atomic_load_add!(size_t)(cast(size_t*)&val, 0); // add 0 to the address-sized slot; the intrinsic yields the original value
|
|
+ }
|
|
+ else static if (is(T == bool)) // bool is read through a ubyte view
|
|
+ {
|
|
+ return llvm_atomic_load_add!(ubyte)(cast(ubyte*)&val, cast(ubyte)0) ? 1 : 0; // any non-zero byte is reported as 1
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ return llvm_atomic_load_add!(T)(&val, cast(T)0); // add 0 so the stored value is unchanged and the original is returned
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+// x86_32 Atomic Function Implementation
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+
|
|
else version( AsmX86_32 )
|
|
{
|
|
T atomicOp(string op, T, V1)( ref shared T val, V1 mod )
|
|
@@ -396,6 +507,12 @@
|
|
}
|
|
}
|
|
}
|
|
+
|
|
+
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+// x86_64 Atomic Function Implementation
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
+
|
|
else version( AsmX86_64 )
|
|
{
|
|
T atomicOp(string op, T, V1)( ref shared T val, V1 mod )
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gc.d druntime/src/gc/gc.d
|
|
--- druntime-old/src/gc/gc.d 2010-08-05 05:39:08.000000000 +0400
|
|
+++ druntime/src/gc/gc.d 2010-10-04 16:54:06.837685001 +0400
|
|
@@ -100,7 +100,7 @@
|
|
version (GCCLASS)
|
|
{ void* p;
|
|
ClassInfo ci = GC.classinfo;
|
|
-
|
|
+
|
|
p = malloc(ci.init.length);
|
|
(cast(byte*)p)[0 .. ci.init.length] = ci.init[];
|
|
_gc = cast(GC)p;
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcbits.d druntime/src/gc/gcbits.d
|
|
--- druntime-old/src/gc/gcbits.d 2010-08-08 04:10:24.000000000 +0400
|
|
+++ druntime/src/gc/gcbits.d 2010-10-01 20:49:51.268892001 +0400
|
|
@@ -26,6 +26,10 @@
|
|
{
|
|
version = bitops;
|
|
}
|
|
+else version (LDC)
|
|
+{
|
|
+ version = bitops;
|
|
+}
|
|
else version (GNU)
|
|
{
|
|
// use the unoptimized version
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/gc/gcx.d druntime/src/gc/gcx.d
|
|
--- druntime-old/src/gc/gcx.d 2010-08-27 01:23:26.000000000 +0400
|
|
+++ druntime/src/gc/gcx.d 2010-10-07 22:27:41.879253001 +0400
|
|
@@ -1464,7 +1464,8 @@
|
|
|
|
|
|
void initialize()
|
|
- { int dummy;
|
|
+ {
|
|
+ int dummy;
|
|
|
|
(cast(byte*)&this)[0 .. Gcx.sizeof] = 0;
|
|
stackBottom = cast(char*)&dummy;
|
|
@@ -2200,7 +2201,7 @@
|
|
if ((cast(size_t)p & ~(PAGESIZE-1)) == pcache)
|
|
continue;
|
|
|
|
- auto pool = findPool(p);
|
|
+ auto pool = findPool(p);
|
|
if (pool)
|
|
{
|
|
size_t offset = cast(size_t)(p - pool.baseAddr);
|
|
@@ -2270,80 +2271,129 @@
|
|
__builtin_unwind_init();
|
|
sp = & sp;
|
|
}
|
|
+ else version(LDC)
|
|
+ {
|
|
+ version(X86)
|
|
+ {
|
|
+ uint eax,ecx,edx,ebx,ebp,esi,edi;
|
|
+ asm
|
|
+ {
|
|
+ mov eax[EBP], EAX ;
|
|
+ mov ecx[EBP], ECX ;
|
|
+ mov edx[EBP], EDX ;
|
|
+ mov ebx[EBP], EBX ;
|
|
+ mov ebp[EBP], EBP ;
|
|
+ mov esi[EBP], ESI ;
|
|
+ mov edi[EBP], EDI ;
|
|
+ mov sp[EBP], ESP ;
|
|
+ }
|
|
+ }
|
|
+ else version (X86_64)
|
|
+ {
|
|
+ ulong rax,rbx,rcx,rdx,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15;
|
|
+ asm
|
|
+ {
|
|
+ movq rax[RBP], RAX ;
|
|
+ movq rbx[RBP], RBX ;
|
|
+ movq rcx[RBP], RCX ;
|
|
+ movq rdx[RBP], RDX ;
|
|
+ movq rbp[RBP], RBP ;
|
|
+ movq rsi[RBP], RSI ;
|
|
+ movq rdi[RBP], RDI ;
|
|
+ movq r8 [RBP], R8 ;
|
|
+ movq r9 [RBP], R9 ;
|
|
+ movq r10[RBP], R10 ;
|
|
+ movq r11[RBP], R11 ;
|
|
+ movq r12[RBP], R12 ;
|
|
+ movq r13[RBP], R13 ;
|
|
+ movq r14[RBP], R14 ;
|
|
+ movq r15[RBP], R15 ;
|
|
+ movq sp[RBP], RSP ;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ static assert( false, "Architecture not supported." );
|
|
+ }
|
|
+ }
|
|
else version( D_InlineAsm_X86 )
|
|
{
|
|
- asm
|
|
- {
|
|
- pushad ;
|
|
- mov sp[EBP],ESP ;
|
|
- }
|
|
+ asm
|
|
+ {
|
|
+ pushad ;
|
|
+ mov sp[EBP],ESP ;
|
|
+ }
|
|
+ }
|
|
+ else version ( D_InlineAsm_X86_64 )
|
|
+ {
|
|
+ asm
|
|
+ {
|
|
+ push RAX ;
|
|
+ push RBX ;
|
|
+ push RCX ;
|
|
+ push RDX ;
|
|
+ push RSI ;
|
|
+ push RDI ;
|
|
+ push RBP ;
|
|
+ push R8 ;
|
|
+ push R9 ;
|
|
+ push R10 ;
|
|
+ push R11 ;
|
|
+ push R12 ;
|
|
+ push R13 ;
|
|
+ push R14 ;
|
|
+ push R15 ;
|
|
+ push EAX ; // 16 byte align the stack
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ static assert( false, "Architecture not supported." );
|
|
}
|
|
- else version ( D_InlineAsm_X86_64 )
|
|
- {
|
|
- asm
|
|
- {
|
|
- push RAX ;
|
|
- push RBX ;
|
|
- push RCX ;
|
|
- push RDX ;
|
|
- push RSI ;
|
|
- push RDI ;
|
|
- push RBP ;
|
|
- push R8 ;
|
|
- push R9 ;
|
|
- push R10 ;
|
|
- push R11 ;
|
|
- push R12 ;
|
|
- push R13 ;
|
|
- push R14 ;
|
|
- push R15 ;
|
|
- push EAX ; // 16 byte align the stack
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- static assert( false, "Architecture not supported." );
|
|
- }
|
|
|
|
result = fullcollect(sp);
|
|
|
|
- version( GNU )
|
|
- {
|
|
- // registers will be popped automatically
|
|
- }
|
|
- else version( D_InlineAsm_X86 )
|
|
- {
|
|
- asm
|
|
- {
|
|
- popad;
|
|
- }
|
|
- }
|
|
- else version ( D_InlineAsm_X86_64 )
|
|
- {
|
|
- asm
|
|
- {
|
|
- pop EAX ; // 16 byte align the stack
|
|
- pop R15 ;
|
|
- pop R14 ;
|
|
- pop R13 ;
|
|
- pop R12 ;
|
|
- pop R11 ;
|
|
- pop R10 ;
|
|
- pop R9 ;
|
|
- pop R8 ;
|
|
- pop RBP ;
|
|
- pop RDI ;
|
|
- pop RSI ;
|
|
- pop RDX ;
|
|
- pop RCX ;
|
|
- pop RBX ;
|
|
- pop RAX ;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- static assert( false, "Architecture not supported." );
|
|
- }
|
|
+ version( GNU )
|
|
+ {
|
|
+ // registers will be popped automatically
|
|
+ }
|
|
+ else version(LDC)
|
|
+ {
|
|
+ // nothing to do
|
|
+ }
|
|
+ else version( D_InlineAsm_X86 )
|
|
+ {
|
|
+ asm
|
|
+ {
|
|
+ popad;
|
|
+ }
|
|
+ }
|
|
+ else version ( D_InlineAsm_X86_64 )
|
|
+ {
|
|
+ asm
|
|
+ {
|
|
+ pop EAX ; // 16 byte align the stack
|
|
+ pop R15 ;
|
|
+ pop R14 ;
|
|
+ pop R13 ;
|
|
+ pop R12 ;
|
|
+ pop R11 ;
|
|
+ pop R10 ;
|
|
+ pop R9 ;
|
|
+ pop R8 ;
|
|
+ pop RBP ;
|
|
+ pop RDI ;
|
|
+ pop RSI ;
|
|
+ pop RDX ;
|
|
+ pop RCX ;
|
|
+ pop RBX ;
|
|
+ pop RAX ;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ static assert( false, "Architecture not supported." );
|
|
+ }
|
|
return result;
|
|
}
|
|
|
|
@@ -2357,7 +2407,7 @@
|
|
Pool* pool;
|
|
|
|
debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n");
|
|
- //printf("\tpool address range = %p .. %p\n", minAddr, maxAddr);
|
|
+ //printf("\tpool address range = %p .. %p\n", minAddr, maxAddr);
|
|
|
|
thread_suspendAll();
|
|
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/object_.d druntime/src/object_.d
|
|
--- druntime-old/src/object_.d 2010-09-03 12:28:52.000000000 +0400
|
|
+++ druntime/src/object_.d 2010-10-05 14:50:34.733150002 +0400
|
|
@@ -1073,7 +1073,7 @@
|
|
|
|
abstract class MemberInfo
|
|
{
|
|
- string name();
|
|
+ string name() { return ""; }; // LDC: FIXME: placeholder body that always returns ""; the unpatched declaration had no body
|
|
}
|
|
|
|
class MemberInfo_field : MemberInfo
|
|
@@ -1663,7 +1663,6 @@
|
|
{
|
|
int len = 0;
|
|
ModuleReference *mr;
|
|
-
|
|
for (mr = _Dmodule_ref; mr; mr = mr.next)
|
|
len++;
|
|
_moduleinfo_array = new ModuleInfo*[len];
|
|
@@ -1802,7 +1801,10 @@
|
|
{
|
|
debug(PRINTF) printf("_moduleTlsCtor()\n");
|
|
|
|
- void* p = alloca(_moduleinfo_array.length * ubyte.sizeof);
|
|
+ version( DMD )
|
|
+ void* p = alloca(_moduleinfo_array.length * ubyte.sizeof);
|
|
+ else
|
|
+ void* p = malloc(_moduleinfo_array.length * ubyte.sizeof);
|
|
auto flags = cast(ubyte[])p[0 .. _moduleinfo_array.length];
|
|
flags[] = 0;
|
|
|
|
@@ -2025,7 +2027,6 @@
|
|
_d_monitor_create(h);
|
|
m = getMonitor(h);
|
|
}
|
|
-
|
|
IMonitor i = m.impl;
|
|
|
|
if (i is null)
|
|
@@ -2124,7 +2125,7 @@
|
|
size_t _aaLen(void* p);
|
|
void* _aaGet(void** pp, TypeInfo keyti, size_t valuesize, ...);
|
|
void* _aaGetRvalue(void* p, TypeInfo keyti, size_t valuesize, ...);
|
|
- void* _aaIn(void* p, TypeInfo keyti);
|
|
+ void* _aaIn(void* p, TypeInfo keyti, ...);
|
|
void _aaDel(void* p, TypeInfo keyti, ...);
|
|
void[] _aaValues(void* p, size_t keysize, size_t valuesize);
|
|
void[] _aaKeys(void* p, size_t keysize, size_t valuesize);
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/adi.d druntime/src/rt/adi.d
|
|
--- druntime-old/src/rt/adi.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/adi.d 2010-10-07 14:32:52.911253001 +0400
|
|
@@ -35,6 +35,14 @@
|
|
extern (C) void gc_free( void* p );
|
|
}
|
|
|
|
+version (DMD)
|
|
+{
|
|
+ version (X86)
|
|
+ {
|
|
+ version = DMD_X86;
|
|
+ }
|
|
+}
|
|
+
|
|
|
|
struct Array
|
|
{
|
|
@@ -48,7 +56,7 @@
|
|
* reversed.
|
|
*/
|
|
|
|
-extern (C) long _adReverseChar(char[] a)
|
|
+extern (C) char[] _adReverseChar(char[] a)
|
|
{
|
|
if (a.length > 1)
|
|
{
|
|
@@ -108,7 +116,7 @@
|
|
hi = hi - 1 + (stridehi - stridelo);
|
|
}
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
unittest
|
|
@@ -143,7 +151,7 @@
|
|
* reversed.
|
|
*/
|
|
|
|
-extern (C) long _adReverseWchar(wchar[] a)
|
|
+extern (C) wchar[] _adReverseWchar(wchar[] a)
|
|
{
|
|
if (a.length > 1)
|
|
{
|
|
@@ -201,7 +209,7 @@
|
|
hi = hi - 1 + (stridehi - stridelo);
|
|
}
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
unittest
|
|
@@ -225,10 +233,10 @@
|
|
* Support for array.reverse property.
|
|
*/
|
|
|
|
-extern (C) long _adReverse(Array a, size_t szelem)
|
|
+extern (C) void[] _adReverse(void[] a, size_t szelem)
|
|
out (result)
|
|
{
|
|
- assert(result is *cast(long*)(&a));
|
|
+ assert(result.ptr is a.ptr);
|
|
}
|
|
body
|
|
{
|
|
@@ -243,10 +251,10 @@
|
|
tmp = buffer.ptr;
|
|
if (szelem > 16)
|
|
{
|
|
- //version (Windows)
|
|
+ version (Windows)
|
|
tmp = cast(byte*) alloca(szelem);
|
|
- //else
|
|
- //tmp = gc_malloc(szelem);
|
|
+ else
|
|
+ tmp = cast(byte*) gc_malloc(szelem);
|
|
}
|
|
|
|
for (; lo < hi; lo += szelem, hi -= szelem)
|
|
@@ -267,7 +275,7 @@
|
|
//gc_free(tmp);
|
|
}
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
unittest
|
|
@@ -311,7 +319,7 @@
|
|
* Sort array of chars.
|
|
*/
|
|
|
|
-extern (C) long _adSortChar(char[] a)
|
|
+extern (C) char[] _adSortChar(char[] a)
|
|
{
|
|
if (a.length > 1)
|
|
{
|
|
@@ -326,14 +334,14 @@
|
|
}
|
|
delete da;
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
/**********************************************
|
|
* Sort array of wchars.
|
|
*/
|
|
|
|
-extern (C) long _adSortWchar(wchar[] a)
|
|
+extern (C) wchar[] _adSortWchar(wchar[] a)
|
|
{
|
|
if (a.length > 1)
|
|
{
|
|
@@ -348,7 +356,7 @@
|
|
}
|
|
delete da;
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
/***************************************
|
|
@@ -358,7 +366,7 @@
|
|
* 0 not equal
|
|
*/
|
|
|
|
-extern (C) int _adEq(Array a1, Array a2, TypeInfo ti)
|
|
+extern (C) int _adEq(void[] a1, void[] a2, TypeInfo ti)
|
|
{
|
|
debug(adi) printf("_adEq(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
|
|
if (a1.length != a2.length)
|
|
@@ -379,7 +387,7 @@
|
|
return 1; // equal
|
|
}
|
|
|
|
-extern (C) int _adEq2(Array a1, Array a2, TypeInfo ti)
|
|
+extern (C) int _adEq2(void[] a1, void[] a2, TypeInfo ti)
|
|
{
|
|
debug(adi) printf("_adEq2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
|
|
if (a1.length != a2.length)
|
|
@@ -405,7 +413,7 @@
|
|
* Support for array compare test.
|
|
*/
|
|
|
|
-extern (C) int _adCmp(Array a1, Array a2, TypeInfo ti)
|
|
+extern (C) int _adCmp(void[] a1, void[] a2, TypeInfo ti)
|
|
{
|
|
debug(adi) printf("adCmp()\n");
|
|
auto len = a1.length;
|
|
@@ -435,7 +443,7 @@
|
|
return (a1.length > a2.length) ? 1 : -1;
|
|
}
|
|
|
|
-extern (C) int _adCmp2(Array a1, Array a2, TypeInfo ti)
|
|
+extern (C) int _adCmp2(void[] a1, void[] a2, TypeInfo ti)
|
|
{
|
|
debug(adi) printf("_adCmp2(a1.length = %d, a2.length = %d)\n", a1.length, a2.length);
|
|
return ti.compare(&a1, &a2);
|
|
@@ -461,9 +469,9 @@
|
|
* Support for array compare test.
|
|
*/
|
|
|
|
-extern (C) int _adCmpChar(Array a1, Array a2)
|
|
+extern (C) int _adCmpChar(void[] a1, void[] a2)
|
|
{
|
|
- version (X86)
|
|
+ version (DMD_X86)
|
|
{
|
|
asm
|
|
{ naked ;
|
|
@@ -569,8 +577,8 @@
|
|
|
|
ret ;
|
|
}
|
|
- }
|
|
- else
|
|
+ }
|
|
+ else
|
|
{
|
|
int len;
|
|
int c;
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayInit.d druntime/src/rt/arrayInit.d
|
|
--- druntime-old/src/rt/arrayInit.d 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/src/rt/arrayInit.d 2010-10-03 20:41:52.223624001 +0400
|
|
@@ -0,0 +1,155 @@
|
|
+private import ldc.intrinsics;
|
|
+
|
|
+extern(C):
|
|
+
|
|
+int memcmp(void*,void*,size_t); // C prototype; not called anywhere in this module
|
|
+size_t strlen(char*); // C prototype; not called anywhere in this module
|
|
+
|
|
+version(LLVM64) // NOTE(review): LLVM64 is presumably defined by LDC on 64-bit targets -- confirm
|
|
+alias llvm_memcpy_i64 llvm_memcpy; // llvm_memcpy names the i64 intrinsic variant in this configuration
|
|
+else
|
|
+alias llvm_memcpy_i32 llvm_memcpy; // otherwise llvm_memcpy names the i32 intrinsic variant
|
|
+
|
|
+// per-element array init routines
|
|
+
|
|
+void _d_array_init_i16(ushort* a, size_t n, ushort v) // stores v into each of the n ushort slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_i32(uint* a, size_t n, uint v) // stores v into each of the n uint slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_i64(ulong* a, size_t n, ulong v) // stores v into each of the n ulong slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_float(float* a, size_t n, float v) // stores v into each of the n float slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_double(double* a, size_t n, double v) // stores v into each of the n double slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_real(real* a, size_t n, real v) // stores v into each of the n real slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_cfloat(cfloat* a, size_t n, cfloat v) // stores v into each of the n cfloat slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_cdouble(cdouble* a, size_t n, cdouble v) // stores v into each of the n cdouble slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_creal(creal* a, size_t n, creal v) // stores v into each of the n creal slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_pointer(void** a, size_t n, void* v) // stores the pointer v into each of the n slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a+n; // one past the last slot to fill
|
|
+ while (p !is end) // when n == 0, p already equals end and nothing is written
|
|
+ *p++ = v;
|
|
+}
|
|
+
|
|
+void _d_array_init_mem(void* a, size_t na, void* v, size_t nv) // copies the nv-sized pattern at v into na back-to-back slots starting at a
|
|
+{
|
|
+ auto p = a; // write cursor
|
|
+ auto end = a + na*nv; // one past the last slot; equals a when na or nv is 0, so the loop is skipped
|
|
+ while (p !is end) {
|
|
+ llvm_memcpy(p,v,nv,0); // NOTE(review): the trailing 0 is presumably the alignment argument -- confirm against ldc.intrinsics
|
|
+ p += nv; // advance by exactly one pattern, so p lands on end after na copies
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+void _d_array_init(TypeInfo ti, void* a)
|
|
+{
|
|
+ auto initializer = ti.next.init();
|
|
+ auto isize = initializer.length;
|
|
+ auto q = initializer.ptr;
|
|
+
|
|
+ if (isize == 1)
|
|
+ memset(p, *cast(ubyte*)q, size);
|
|
+ else if (isize == int.sizeof)
|
|
+ {
|
|
+ int init = *cast(int*)q;
|
|
+ size /= int.sizeof;
|
|
+ for (size_t u = 0; u < size; u++)
|
|
+ {
|
|
+ (cast(int*)p)[u] = init;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ for (size_t u = 0; u < size; u += isize)
|
|
+ {
|
|
+ memcpy(p + u, q, isize);
|
|
+ }
|
|
+ }
|
|
+}*/
|
|
+
|
|
+// for array cast
|
|
+size_t _d_array_cast_len(size_t len, size_t elemsz, size_t newelemsz) // new length when len elements of size elemsz are viewed as elements of size newelemsz
|
|
+{
|
|
+ if (newelemsz == 1) { // size-1 target: the new length is simply the total size
|
|
+ return len*elemsz;
|
|
+ }
|
|
+ else if ((len*elemsz) % newelemsz) { // total size is not a whole multiple of the new element size
|
|
+ throw new Exception("Bad array cast"); // NOTE(review): newelemsz == 0 would divide by zero in the test above; callers presumably never pass 0
|
|
+ }
|
|
+ return (len*elemsz)/newelemsz; // exact division, guaranteed by the remainder check above
|
|
+}
|
|
+
|
|
+// slice copy when assertions are enabled
|
|
+void _d_array_slice_copy(void* dst, size_t dstlen, void* src, size_t srclen) // checked copy of srclen units from src to dst; the lengths are passed straight to llvm_memcpy as its size
|
|
+{
|
|
+ assert(dst); // both pointers must be non-null
|
|
+ assert(src);
|
|
+ if (dstlen != srclen) // source and destination must be the same length
|
|
+ throw new Exception("lengths don't match for array copy");
|
|
+ else if (dst+dstlen <= src || src+srclen <= dst) // the two ranges do not overlap (touching ends are allowed)
|
|
+ llvm_memcpy(dst, src, dstlen, 0); // NOTE(review): the trailing 0 is presumably the alignment argument -- confirm against ldc.intrinsics
|
|
+ else // the ranges overlap: rejected rather than copied
|
|
+ throw new Exception("overlapping array copy");
|
|
+}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayassign.d druntime/src/rt/arrayassign.d
|
|
--- druntime-old/src/rt/arrayassign.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arrayassign.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,186 +0,0 @@
|
|
-/**
|
|
- * Implementation of array assignment support routines.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2000 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright
|
|
- *
|
|
- * Copyright Digital Mars 2000 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arrayassign;
|
|
-
|
|
-private
|
|
-{
|
|
- import rt.util.string;
|
|
- import core.stdc.string;
|
|
- import core.stdc.stdlib;
|
|
- debug(PRINTF) import core.stdc.stdio;
|
|
-}
|
|
-
|
|
-/**
|
|
- * Does array assignment (not construction) from another
|
|
- * array of the same element type.
|
|
- * ti is the element type.
|
|
- * Handles overlapping copies.
|
|
- */
|
|
-extern (C) void[] _d_arrayassign(TypeInfo ti, void[] from, void[] to)
|
|
-{
|
|
- debug(PRINTF) printf("_d_arrayassign(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize());
|
|
-
|
|
- if (to.length != from.length)
|
|
- {
|
|
- char[10] tmp = void;
|
|
- string msg = "lengths don't match for array copy,"c;
|
|
- msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length);
|
|
- throw new Exception(msg);
|
|
- }
|
|
-
|
|
- auto element_size = ti.tsize();
|
|
-
|
|
- /* Need a temporary buffer tmp[] big enough to hold one element
|
|
- */
|
|
- void[16] buf = void;
|
|
- void[] tmp;
|
|
- if (element_size > buf.sizeof)
|
|
- tmp = alloca(element_size)[0 .. element_size];
|
|
- else
|
|
- tmp = buf;
|
|
-
|
|
-
|
|
- if (to.ptr <= from.ptr)
|
|
- {
|
|
- foreach (i; 0 .. to.length)
|
|
- {
|
|
- void* pto = to.ptr + i * element_size;
|
|
- void* pfrom = from.ptr + i * element_size;
|
|
- memcpy(tmp.ptr, pto, element_size);
|
|
- memcpy(pto, pfrom, element_size);
|
|
- ti.postblit(pto);
|
|
- ti.destroy(tmp.ptr);
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- for (int i = to.length; i--; )
|
|
- {
|
|
- void* pto = to.ptr + i * element_size;
|
|
- void* pfrom = from.ptr + i * element_size;
|
|
- memcpy(tmp.ptr, pto, element_size);
|
|
- memcpy(pto, pfrom, element_size);
|
|
- ti.postblit(pto);
|
|
- ti.destroy(tmp.ptr);
|
|
- }
|
|
- }
|
|
- return to;
|
|
-}
|
|
-
|
|
-/**
|
|
- * Does array initialization (not assignment) from another
|
|
- * array of the same element type.
|
|
- * ti is the element type.
|
|
- */
|
|
-extern (C) void[] _d_arrayctor(TypeInfo ti, void[] from, void[] to)
|
|
-{
|
|
- debug(PRINTF) printf("_d_arrayctor(from = %p,%d, to = %p,%d) size = %d\n", from.ptr, from.length, to.ptr, to.length, ti.tsize());
|
|
-
|
|
- if (to.length != from.length)
|
|
- {
|
|
- char[10] tmp = void;
|
|
- string msg = "lengths don't match for array initialization,"c;
|
|
- msg ~= tmp.intToString(to.length) ~ " = " ~ tmp.intToString(from.length);
|
|
- throw new Exception(msg);
|
|
- }
|
|
-
|
|
- auto element_size = ti.tsize();
|
|
-
|
|
- int i;
|
|
- try
|
|
- {
|
|
- for (i = 0; i < to.length; i++)
|
|
- {
|
|
- // Copy construction is defined as bit copy followed by postblit.
|
|
- memcpy(to.ptr + i * element_size, from.ptr + i * element_size, element_size);
|
|
- ti.postblit(to.ptr + i * element_size);
|
|
- }
|
|
- }
|
|
- catch (Object o)
|
|
- {
|
|
- /* Destroy, in reverse order, what we've constructed so far
|
|
- */
|
|
- while (i--)
|
|
- {
|
|
- ti.destroy(to.ptr + i * element_size);
|
|
- }
|
|
-
|
|
- throw o;
|
|
- }
|
|
- return to;
|
|
-}
|
|
-
|
|
-
|
|
-/**
|
|
- * Do assignment to an array.
|
|
- * p[0 .. count] = value;
|
|
- */
|
|
-extern (C) void* _d_arraysetassign(void* p, void* value, int count, TypeInfo ti)
|
|
-{
|
|
- void* pstart = p;
|
|
-
|
|
- auto element_size = ti.tsize();
|
|
-
|
|
- //Need a temporary buffer tmp[] big enough to hold one element
|
|
- void[16] buf = void;
|
|
- void[] tmp;
|
|
- if (element_size > buf.sizeof)
|
|
- {
|
|
- tmp = alloca(element_size)[0 .. element_size];
|
|
- }
|
|
- else
|
|
- tmp = buf;
|
|
-
|
|
- foreach (i; 0 .. count)
|
|
- {
|
|
- memcpy(tmp.ptr, p, element_size);
|
|
- memcpy(p, value, element_size);
|
|
- ti.postblit(p);
|
|
- ti.destroy(tmp.ptr);
|
|
- p += element_size;
|
|
- }
|
|
- return pstart;
|
|
-}
|
|
-
|
|
-/**
|
|
- * Do construction of an array.
|
|
- * ti[count] p = value;
|
|
- */
|
|
-extern (C) void* _d_arraysetctor(void* p, void* value, int count, TypeInfo ti)
|
|
-{
|
|
- void* pstart = p;
|
|
- auto element_size = ti.tsize();
|
|
-
|
|
- try
|
|
- {
|
|
- foreach (i; 0 .. count)
|
|
- {
|
|
- // Copy construction is defined as bit copy followed by postblit.
|
|
- memcpy(p, value, element_size);
|
|
- ti.postblit(p);
|
|
- p += element_size;
|
|
- }
|
|
- }
|
|
- catch (Object o)
|
|
- {
|
|
- // Destroy, in reverse order, what we've constructed so far
|
|
- while (p > pstart)
|
|
- {
|
|
- p -= element_size;
|
|
- ti.destroy(p);
|
|
- }
|
|
-
|
|
- throw o;
|
|
- }
|
|
- return pstart;
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraybyte.d druntime/src/rt/arraybyte.d
|
|
--- druntime-old/src/rt/arraybyte.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arraybyte.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,1893 +0,0 @@
|
|
-/**
|
|
- * Contains SSE2 and MMX versions of certain operations for char, byte, and
|
|
- * ubyte ('a', 'g' and 'h' suffixes).
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arraybyte;
|
|
-
|
|
-import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 4;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.amd3dnow amd3dnow;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-alias byte T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_g(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_g(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpAddSliceAssign_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1088% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startaddsse2u:
|
|
- add ESI, 64;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM2, [EAX+32];
|
|
- movdqu XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM4;
|
|
- paddb XMM2, XMM4;
|
|
- paddb XMM3, XMM4;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startaddsse2a:
|
|
- add ESI, 64;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM2, [EAX+32];
|
|
- movdqa XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM4;
|
|
- paddb XMM2, XMM4;
|
|
- paddb XMM3, XMM4;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1000% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 4;
|
|
- startaddmmx:
|
|
- add ESI, 32;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- add EAX, 32;
|
|
- paddb MM0, MM4;
|
|
- paddb MM1, MM4;
|
|
- paddb MM2, MM4;
|
|
- paddb MM3, MM4;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- /* trying to be fair and treat normal 32-bit cpu the same way as we do
|
|
- * the SIMD units, with unrolled asm. There's not enough registers,
|
|
- * really.
|
|
- */
|
|
- else
|
|
- if (a.length >= 4)
|
|
- {
|
|
-
|
|
- auto n = aptr + (a.length & ~3);
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov CL, value;
|
|
-
|
|
- align 4;
|
|
- startadd386:
|
|
- add ESI, 4;
|
|
- mov DX, [EAX];
|
|
- mov BX, [EAX+2];
|
|
- add EAX, 4;
|
|
- add BL, CL;
|
|
- add BH, CL;
|
|
- add DL, CL;
|
|
- add DH, CL;
|
|
- mov [ESI -4], DX;
|
|
- mov [ESI+2 -4], BX;
|
|
- cmp ESI, EDI;
|
|
- jb startadd386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
-
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ + value);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpAddSliceAssign_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_g(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_g(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddSliceAssign_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 5739% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- version (log) printf("\tsse2 unaligned\n");
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 8;
|
|
- startaddlsse2u:
|
|
- add ESI, 64;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM2, [EAX+32];
|
|
- movdqu XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movdqu XMM4, [ECX];
|
|
- movdqu XMM5, [ECX+16];
|
|
- movdqu XMM6, [ECX+32];
|
|
- movdqu XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM5;
|
|
- paddb XMM2, XMM6;
|
|
- paddb XMM3, XMM7;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddlsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- version (log) printf("\tsse2 aligned\n");
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 8;
|
|
- startaddlsse2a:
|
|
- add ESI, 64;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM2, [EAX+32];
|
|
- movdqa XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movdqa XMM4, [ECX];
|
|
- movdqa XMM5, [ECX+16];
|
|
- movdqa XMM6, [ECX+32];
|
|
- movdqa XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM5;
|
|
- paddb XMM2, XMM6;
|
|
- paddb XMM3, XMM7;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddlsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 4428% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
- version (log) printf("\tmmx\n");
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startaddlmmx:
|
|
- add ESI, 32;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- add EAX, 32;
|
|
- movq MM4, [ECX];
|
|
- movq MM5, [ECX+8];
|
|
- movq MM6, [ECX+16];
|
|
- movq MM7, [ECX+24];
|
|
- add ECX, 32;
|
|
- paddb MM0, MM4;
|
|
- paddb MM1, MM5;
|
|
- paddb MM2, MM6;
|
|
- paddb MM3, MM7;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddlmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- version (log) if (aptr < aend) printf("\tbase\n");
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ + *cptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceAddass_a(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_g(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_h(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_g(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_g(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1578% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startaddasssse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM2, [ESI+32];
|
|
- movdqu XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM4;
|
|
- paddb XMM2, XMM4;
|
|
- paddb XMM3, XMM4;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddasssse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startaddasssse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM2, [ESI+32];
|
|
- movdqa XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM4;
|
|
- paddb XMM2, XMM4;
|
|
- paddb XMM3, XMM4;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddasssse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1721% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
-
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 8;
|
|
- startaddassmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- add ESI, 32;
|
|
- paddb MM0, MM4;
|
|
- paddb MM1, MM4;
|
|
- paddb MM2, MM4;
|
|
- paddb MM3, MM4;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddassmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceAddass_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddass_a(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_g(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_h(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_g(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_g(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddass_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 4727% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startaddasslsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM2, [ESI+32];
|
|
- movdqu XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movdqu XMM4, [ECX];
|
|
- movdqu XMM5, [ECX+16];
|
|
- movdqu XMM6, [ECX+32];
|
|
- movdqu XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM5;
|
|
- paddb XMM2, XMM6;
|
|
- paddb XMM3, XMM7;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddasslsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startaddasslsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM2, [ESI+32];
|
|
- movdqa XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movdqa XMM4, [ECX];
|
|
- movdqa XMM5, [ECX+16];
|
|
- movdqa XMM6, [ECX+32];
|
|
- movdqa XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- paddb XMM0, XMM4;
|
|
- paddb XMM1, XMM5;
|
|
- paddb XMM2, XMM6;
|
|
- paddb XMM3, XMM7;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddasslsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 3059% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
-
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startaddasslmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- add ESI, 32;
|
|
- movq MM4, [ECX];
|
|
- movq MM5, [ECX+8];
|
|
- movq MM6, [ECX+16];
|
|
- movq MM7, [ECX+24];
|
|
- add ECX, 32;
|
|
- paddb MM0, MM4;
|
|
- paddb MM1, MM5;
|
|
- paddb MM2, MM6;
|
|
- paddb MM3, MM7;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddasslmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddass_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_g(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_g(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMinSliceAssign_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1189% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubsse2u:
|
|
- add ESI, 64;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM2, [EAX+32];
|
|
- movdqu XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM4;
|
|
- psubb XMM2, XMM4;
|
|
- psubb XMM3, XMM4;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubsse2a:
|
|
- add ESI, 64;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM2, [EAX+32];
|
|
- movdqa XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM4;
|
|
- psubb XMM2, XMM4;
|
|
- psubb XMM3, XMM4;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1079% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 4;
|
|
- startsubmmx:
|
|
- add ESI, 32;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- add EAX, 32;
|
|
- psubb MM0, MM4;
|
|
- psubb MM1, MM4;
|
|
- psubb MM2, MM4;
|
|
- psubb MM3, MM4;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- // trying to be fair and treat normal 32-bit cpu the same way as we do the SIMD units, with unrolled asm. There's not enough registers, really.
|
|
- else
|
|
- if (a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov CL, value;
|
|
-
|
|
- align 4;
|
|
- startsub386:
|
|
- add ESI, 4;
|
|
- mov DX, [EAX];
|
|
- mov BX, [EAX+2];
|
|
- add EAX, 4;
|
|
- sub BL, CL;
|
|
- sub BH, CL;
|
|
- sub DL, CL;
|
|
- sub DH, CL;
|
|
- mov [ESI -4], DX;
|
|
- mov [ESI+2 -4], BX;
|
|
- cmp ESI, EDI;
|
|
- jb startsub386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ - value);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMinSliceAssign_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] = b[] - 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(b[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = value - b[]
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_g(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_g(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arrayExpSliceMinSliceAssign_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 8748% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubrsse2u:
|
|
- add ESI, 64;
|
|
- movdqa XMM5, XMM4;
|
|
- movdqa XMM6, XMM4;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- psubb XMM5, XMM0;
|
|
- psubb XMM6, XMM1;
|
|
- movdqu [ESI -64], XMM5;
|
|
- movdqu [ESI+16-64], XMM6;
|
|
- movdqa XMM5, XMM4;
|
|
- movdqa XMM6, XMM4;
|
|
- movdqu XMM2, [EAX+32];
|
|
- movdqu XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- psubb XMM5, XMM2;
|
|
- psubb XMM6, XMM3;
|
|
- movdqu [ESI+32-64], XMM5;
|
|
- movdqu [ESI+48-64], XMM6;
|
|
- cmp ESI, EDI;
|
|
- jb startsubrsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubrsse2a:
|
|
- add ESI, 64;
|
|
- movdqa XMM5, XMM4;
|
|
- movdqa XMM6, XMM4;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- psubb XMM5, XMM0;
|
|
- psubb XMM6, XMM1;
|
|
- movdqa [ESI -64], XMM5;
|
|
- movdqa [ESI+16-64], XMM6;
|
|
- movdqa XMM5, XMM4;
|
|
- movdqa XMM6, XMM4;
|
|
- movdqa XMM2, [EAX+32];
|
|
- movdqa XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- psubb XMM5, XMM2;
|
|
- psubb XMM6, XMM3;
|
|
- movdqa [ESI+32-64], XMM5;
|
|
- movdqa [ESI+48-64], XMM6;
|
|
- cmp ESI, EDI;
|
|
- jb startsubrsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 7397% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 4;
|
|
- startsubrmmx:
|
|
- add ESI, 32;
|
|
- movq MM5, MM4;
|
|
- movq MM6, MM4;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- psubb MM5, MM0;
|
|
- psubb MM6, MM1;
|
|
- movq [ESI -32], MM5;
|
|
- movq [ESI+8 -32], MM6;
|
|
- movq MM5, MM4;
|
|
- movq MM6, MM4;
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- add EAX, 32;
|
|
- psubb MM5, MM2;
|
|
- psubb MM6, MM3;
|
|
- movq [ESI+16-32], MM5;
|
|
- movq [ESI+24-32], MM6;
|
|
- cmp ESI, EDI;
|
|
- jb startsubrmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
-
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(value - *bptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinSliceAssign_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] = 6 - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(6 - b[i]))
|
|
- {
|
|
- printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_g(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_g(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 5756% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 8;
|
|
- startsublsse2u:
|
|
- add ESI, 64;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM2, [EAX+32];
|
|
- movdqu XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movdqu XMM4, [ECX];
|
|
- movdqu XMM5, [ECX+16];
|
|
- movdqu XMM6, [ECX+32];
|
|
- movdqu XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM5;
|
|
- psubb XMM2, XMM6;
|
|
- psubb XMM3, XMM7;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsublsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 8;
|
|
- startsublsse2a:
|
|
- add ESI, 64;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM2, [EAX+32];
|
|
- movdqa XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movdqa XMM4, [ECX];
|
|
- movdqa XMM5, [ECX+16];
|
|
- movdqa XMM6, [ECX+32];
|
|
- movdqa XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM5;
|
|
- psubb XMM2, XMM6;
|
|
- psubb XMM3, XMM7;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsublsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 4428% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 8;
|
|
- startsublmmx:
|
|
- add ESI, 32;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- add EAX, 32;
|
|
- movq MM4, [ECX];
|
|
- movq MM5, [ECX+8];
|
|
- movq MM6, [ECX+16];
|
|
- movq MM7, [ECX+24];
|
|
- add ECX, 32;
|
|
- psubb MM0, MM4;
|
|
- psubb MM1, MM5;
|
|
- psubb MM2, MM6;
|
|
- psubb MM3, MM7;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsublmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ - *cptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinass_a(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_g(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_h(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_g(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_g(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1577% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubasssse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM2, [ESI+32];
|
|
- movdqu XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM4;
|
|
- psubb XMM2, XMM4;
|
|
- psubb XMM3, XMM4;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubasssse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubasssse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM2, [ESI+32];
|
|
- movdqa XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM4;
|
|
- psubb XMM2, XMM4;
|
|
- psubb XMM3, XMM4;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubasssse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1577% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
-
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- uint l = cast(ubyte) value;
|
|
- l |= (l << 8);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsubassmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- add ESI, 32;
|
|
- psubb MM0, MM4;
|
|
- psubb MM1, MM4;
|
|
- psubb MM2, MM4;
|
|
- psubb MM3, MM4;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubassmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinass_a(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_g(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_h(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_g(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_g(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMinass_g()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 4800% faster
|
|
- if (sse2() && a.length >= 64)
|
|
- {
|
|
- auto n = aptr + (a.length & ~63);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startsubasslsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM2, [ESI+32];
|
|
- movdqu XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movdqu XMM4, [ECX];
|
|
- movdqu XMM5, [ECX+16];
|
|
- movdqu XMM6, [ECX+32];
|
|
- movdqu XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM5;
|
|
- psubb XMM2, XMM6;
|
|
- psubb XMM3, XMM7;
|
|
- movdqu [ESI -64], XMM0;
|
|
- movdqu [ESI+16-64], XMM1;
|
|
- movdqu [ESI+32-64], XMM2;
|
|
- movdqu [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubasslsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startsubasslsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM2, [ESI+32];
|
|
- movdqa XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movdqa XMM4, [ECX];
|
|
- movdqa XMM5, [ECX+16];
|
|
- movdqa XMM6, [ECX+32];
|
|
- movdqa XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- psubb XMM0, XMM4;
|
|
- psubb XMM1, XMM5;
|
|
- psubb XMM2, XMM6;
|
|
- psubb XMM3, XMM7;
|
|
- movdqa [ESI -64], XMM0;
|
|
- movdqa [ESI+16-64], XMM1;
|
|
- movdqa [ESI+32-64], XMM2;
|
|
- movdqa [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubasslsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 3107% faster
|
|
- if (mmx() && a.length >= 32)
|
|
- {
|
|
-
|
|
- auto n = aptr + (a.length & ~31);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 8;
|
|
- startsubasslmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- add ESI, 32;
|
|
- movq MM4, [ECX];
|
|
- movq MM5, [ECX+8];
|
|
- movq MM6, [ECX+16];
|
|
- movq MM7, [ECX+24];
|
|
- add ECX, 32;
|
|
- psubb MM0, MM4;
|
|
- psubb MM1, MM5;
|
|
- psubb MM2, MM6;
|
|
- psubb MM3, MM7;
|
|
- movq [ESI -32], MM0;
|
|
- movq [ESI+8 -32], MM1;
|
|
- movq [ESI+16-32], MM2;
|
|
- movq [ESI+24-32], MM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsubasslmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinass_g unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycast.d druntime/src/rt/arraycast.d
|
|
--- druntime-old/src/rt/arraycast.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arraycast.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,94 +0,0 @@
|
|
-/**
|
|
- * Implementation of array cast support routines.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2004 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, Sean Kelly
|
|
- *
|
|
- * Copyright Digital Mars 2004 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arraycast;
|
|
-
|
|
-/******************************************
|
|
- * Runtime helper to convert dynamic array of one
|
|
- * type to dynamic array of another.
|
|
- * Adjusts the length of the array.
|
|
- * Throws exception if new length is not aligned.
|
|
- */
|
|
-
|
|
-extern (C)
|
|
-
|
|
-void[] _d_arraycast(size_t tsize, size_t fsize, void[] a)
|
|
-{
|
|
- auto length = a.length;
|
|
-
|
|
- auto nbytes = length * fsize;
|
|
- if (nbytes % tsize != 0)
|
|
- {
|
|
- throw new Exception("array cast misalignment");
|
|
- }
|
|
- length = nbytes / tsize;
|
|
- *cast(size_t *)&a = length; // jam new length
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- byte[int.sizeof * 3] b;
|
|
- int[] i;
|
|
- short[] s;
|
|
-
|
|
- i = cast(int[])b;
|
|
- assert(i.length == 3);
|
|
-
|
|
- s = cast(short[])b;
|
|
- assert(s.length == 6);
|
|
-
|
|
- s = cast(short[])i;
|
|
- assert(s.length == 6);
|
|
-}
|
|
-
|
|
-/******************************************
|
|
- * Runtime helper to convert dynamic array of bits
|
|
- * dynamic array of another.
|
|
- * Adjusts the length of the array.
|
|
- * Throws exception if new length is not aligned.
|
|
- */
|
|
-
|
|
-version (none)
|
|
-{
|
|
-extern (C)
|
|
-
|
|
-void[] _d_arraycast_frombit(uint tsize, void[] a)
|
|
-{
|
|
- uint length = a.length;
|
|
-
|
|
- if (length & 7)
|
|
- {
|
|
- throw new Exception("bit[] array cast misalignment");
|
|
- }
|
|
- length /= 8 * tsize;
|
|
- *cast(size_t *)&a = length; // jam new length
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- version (D_Bits)
|
|
- {
|
|
- bit[int.sizeof * 3 * 8] b;
|
|
- int[] i;
|
|
- short[] s;
|
|
-
|
|
- i = cast(int[])b;
|
|
- assert(i.length == 3);
|
|
-
|
|
- s = cast(short[])b;
|
|
- assert(s.length == 6);
|
|
- }
|
|
-}
|
|
-
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraycat.d druntime/src/rt/arraycat.d
|
|
--- druntime-old/src/rt/arraycat.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arraycat.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,42 +0,0 @@
|
|
-/**
|
|
- * Implementation of array copy support routines.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2004 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, Sean Kelly
|
|
- *
|
|
- * Copyright Digital Mars 2004 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arraycat;
|
|
-
|
|
-private
|
|
-{
|
|
- import core.stdc.string;
|
|
- debug import core.stdc.stdio;
|
|
-}
|
|
-
|
|
-extern (C):
|
|
-
|
|
-byte[] _d_arraycopy(size_t size, byte[] from, byte[] to)
|
|
-{
|
|
- debug printf("f = %p,%d, t = %p,%d, size = %d\n",
|
|
- from.ptr, from.length, to.ptr, to.length, size);
|
|
-
|
|
- if (to.length != from.length)
|
|
- {
|
|
- throw new Exception("lengths don't match for array copy");
|
|
- }
|
|
- else if (to.ptr + to.length * size <= from.ptr ||
|
|
- from.ptr + from.length * size <= to.ptr)
|
|
- {
|
|
- memcpy(to.ptr, from.ptr, to.length * size);
|
|
- }
|
|
- else
|
|
- {
|
|
- throw new Exception("overlapping array copy");
|
|
- }
|
|
- return to;
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arraydouble.d druntime/src/rt/arraydouble.d
|
|
--- druntime-old/src/rt/arraydouble.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arraydouble.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,1720 +0,0 @@
|
|
-/**
|
|
- * Contains SSE2 and MMX versions of certain operations for double.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arraydouble;
|
|
-
|
|
-private import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 5;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.amd3dnow amd3dnow;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-/* Performance figures measured by Burton Radons
|
|
- */
|
|
-
|
|
-alias double T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 333% faster
|
|
- if (sse2() && b.length >= 16)
|
|
- {
|
|
- auto n = aptr + (b.length & ~15);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr; // left operand
|
|
- mov ECX, cptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add ESI, 64;
|
|
- addpd XMM0, XMM4;
|
|
- addpd XMM1, XMM5;
|
|
- addpd XMM2, XMM6;
|
|
- addpd XMM3, XMM7;
|
|
- add ECX, 64;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ + *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 324% faster
|
|
- if (sse2() && b.length >= 8)
|
|
- {
|
|
- auto n = aptr + (b.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr; // left operand
|
|
- mov ECX, cptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add ESI, 64;
|
|
- subpd XMM0, XMM4;
|
|
- subpd XMM1, XMM5;
|
|
- subpd XMM2, XMM6;
|
|
- subpd XMM3, XMM7;
|
|
- add ECX, 64;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ - *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpAddSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 305% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- addpd XMM0, XMM4;
|
|
- addpd XMM1, XMM4;
|
|
- addpd XMM2, XMM4;
|
|
- addpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ + value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpAddSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceAddass_d(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 114% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
- if (aptr < n)
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloopa:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- addpd XMM0, XMM4;
|
|
- addpd XMM1, XMM4;
|
|
- addpd XMM2, XMM4;
|
|
- addpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopa;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceAddass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddass_d(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddass_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 183% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ECX, bptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- addpd XMM0, XMM4;
|
|
- addpd XMM1, XMM5;
|
|
- addpd XMM2, XMM6;
|
|
- addpd XMM3, XMM7;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMinSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 305% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- subpd XMM0, XMM4;
|
|
- subpd XMM1, XMM4;
|
|
- subpd XMM2, XMM4;
|
|
- subpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ - value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMinSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = value - b[]
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arrayExpSliceMinSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 66% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movapd XMM5, XMM4;
|
|
- movapd XMM6, XMM4;
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- subpd XMM5, XMM0;
|
|
- subpd XMM6, XMM1;
|
|
- movupd [ESI+ 0-64], XMM5;
|
|
- movupd [ESI+16-64], XMM6;
|
|
- movapd XMM5, XMM4;
|
|
- movapd XMM6, XMM4;
|
|
- subpd XMM5, XMM2;
|
|
- subpd XMM6, XMM3;
|
|
- movupd [ESI+32-64], XMM5;
|
|
- movupd [ESI+48-64], XMM6;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = value - *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = 6 - a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(6 - a[i]))
|
|
- {
|
|
- printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinass_d(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 115% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
- if (aptr < n)
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloopa:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- subpd XMM0, XMM4;
|
|
- subpd XMM1, XMM4;
|
|
- subpd XMM2, XMM4;
|
|
- subpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopa;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinass_d(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMinass_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 183% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ECX, bptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- subpd XMM0, XMM4;
|
|
- subpd XMM1, XMM5;
|
|
- subpd XMM2, XMM6;
|
|
- subpd XMM3, XMM7;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMulSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 304% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM4;
|
|
- mulpd XMM2, XMM4;
|
|
- mulpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ * value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 329% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr; // left operand
|
|
- mov ECX, cptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add ESI, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add EAX, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM5;
|
|
- mulpd XMM2, XMM6;
|
|
- mulpd XMM3, XMM7;
|
|
- add ECX, 64;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ * *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMulass_d(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 109% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
- if (aptr < n)
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, value;
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloopa:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM4;
|
|
- mulpd XMM2, XMM4;
|
|
- mulpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopa;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulass_d(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulass_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 205% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ECX, bptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movupd XMM4, [ECX];
|
|
- movupd XMM5, [ECX+16];
|
|
- movupd XMM6, [ECX+32];
|
|
- movupd XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM5;
|
|
- mulpd XMM2, XMM6;
|
|
- mulpd XMM3, XMM7;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] / value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpDivSliceAssign_d()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- /* Multiplying by the reciprocal is faster, but does
|
|
- * not produce as accurate an answer.
|
|
- */
|
|
- T recip = cast(T)1 / value;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 299% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, recip;
|
|
- //movsd XMM4, value
|
|
- //rcpsd XMM4, XMM4
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movupd XMM0, [EAX];
|
|
- movupd XMM1, [EAX+16];
|
|
- movupd XMM2, [EAX+32];
|
|
- movupd XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM4;
|
|
- mulpd XMM2, XMM4;
|
|
- mulpd XMM3, XMM4;
|
|
- //divpd XMM0, XMM4;
|
|
- //divpd XMM1, XMM4;
|
|
- //divpd XMM2, XMM4;
|
|
- //divpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- {
|
|
- *aptr++ = *bptr++ / value;
|
|
- //*aptr++ = *bptr++ * recip;
|
|
- }
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpDivSliceAssign_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] / 8;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
|
|
- if (c[i] != cast(T)(a[i] / 8))
|
|
- {
|
|
- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] /= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceDivass_d(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- /* Multiplying by the reciprocal is faster, but does
|
|
- * not produce as accurate an answer.
|
|
- */
|
|
- T recip = cast(T)1 / value;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 version is 65% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movsd XMM4, recip;
|
|
- //movsd XMM4, value
|
|
- //rcpsd XMM4, XMM4
|
|
- shufpd XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloopa:
|
|
- movupd XMM0, [ESI];
|
|
- movupd XMM1, [ESI+16];
|
|
- movupd XMM2, [ESI+32];
|
|
- movupd XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- mulpd XMM0, XMM4;
|
|
- mulpd XMM1, XMM4;
|
|
- mulpd XMM2, XMM4;
|
|
- mulpd XMM3, XMM4;
|
|
- //divpd XMM0, XMM4;
|
|
- //divpd XMM1, XMM4;
|
|
- //divpd XMM2, XMM4;
|
|
- //divpd XMM3, XMM4;
|
|
- movupd [ESI+ 0-64], XMM0;
|
|
- movupd [ESI+16-64], XMM1;
|
|
- movupd [ESI+32-64], XMM2;
|
|
- movupd [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopa;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= recip;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceDivass_d unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] /= 8;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] / 8))
|
|
- {
|
|
- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAddass_d(a, -value, b);
|
|
-}
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++ * value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAddass_d unittest\n");
|
|
-
|
|
- cpuid = 1;
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 1; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] += a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
|
|
- if (c[i] != cast(T)(b[i] + a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayfloat.d druntime/src/rt/arrayfloat.d
|
|
--- druntime-old/src/rt/arrayfloat.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arrayfloat.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,1435 +0,0 @@
|
|
-/**
|
|
- * Contains SSE2 and MMX versions of certain operations for float.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arrayfloat;
|
|
-
|
|
-private import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 5;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.amd3dnow amd3dnow;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-alias float T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-/* ======================================================================== */
|
|
-
|
|
-/* template for the case
|
|
- * a[] = b[] ? c[]
|
|
- * with some binary operator ?
|
|
- */
|
|
-private template CodeGenSliceSliceOp(string opD, string opSSE, string op3DNow)
|
|
-{
|
|
- const CodeGenSliceSliceOp = `
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE version is 834% faster
|
|
- if (sse() && b.length >= 16)
|
|
- {
|
|
- auto n = aptr + (b.length & ~15);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr; // left operand
|
|
- mov ECX, cptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movups XMM0, [EAX];
|
|
- movups XMM1, [EAX+16];
|
|
- movups XMM2, [EAX+32];
|
|
- movups XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- movups XMM4, [ECX];
|
|
- movups XMM5, [ECX+16];
|
|
- movups XMM6, [ECX+32];
|
|
- movups XMM7, [ECX+48];
|
|
- add ESI, 64;
|
|
- ` ~ opSSE ~ ` XMM0, XMM4;
|
|
- ` ~ opSSE ~ ` XMM1, XMM5;
|
|
- ` ~ opSSE ~ ` XMM2, XMM6;
|
|
- ` ~ opSSE ~ ` XMM3, XMM7;
|
|
- add ECX, 64;
|
|
- movups [ESI+ 0-64], XMM0;
|
|
- movups [ESI+16-64], XMM1;
|
|
- movups [ESI+32-64], XMM2;
|
|
- movups [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- // 3DNow! version is only 13% faster
|
|
- if (amd3dnow() && b.length >= 8)
|
|
- {
|
|
- auto n = aptr + (b.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
- mov EAX, bptr; // left operand
|
|
- mov ECX, cptr; // right operand
|
|
-
|
|
- align 4;
|
|
- start3dnow:
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- ` ~ op3DNow ~ ` MM0, [ECX];
|
|
- ` ~ op3DNow ~ ` MM1, [ECX+8];
|
|
- ` ~ op3DNow ~ ` MM2, [ECX+16];
|
|
- ` ~ op3DNow ~ ` MM3, [ECX+24];
|
|
- movq [ESI], MM0;
|
|
- movq [ESI+8], MM1;
|
|
- movq [ESI+16], MM2;
|
|
- movq [ESI+24], MM3;
|
|
- add ECX, 32;
|
|
- add ESI, 32;
|
|
- add EAX, 32;
|
|
- cmp ESI, EDI;
|
|
- jb start3dnow;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++;
|
|
-
|
|
- return a;`;
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd"));
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub"));
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/* template for the case
|
|
- * a[] ?= value
|
|
- * with some binary operator ?
|
|
- */
|
|
-private template CodeGenExpSliceOpAssign(string opD, string opSSE, string op3DNow)
|
|
-{
|
|
- const CodeGenExpSliceOpAssign = `
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- if (sse() && a.length >= 16)
|
|
- {
|
|
- auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a
|
|
- auto aaend = cast(T*)((cast(uint)aend) & ~15); // end of paragraph-aligned slice of a
|
|
-
|
|
- int numAligned = cast(int)(aaend - aabeg); // how many floats are in the aligned slice?
|
|
-
|
|
- // are there at least 16 floats in the paragraph-aligned slice?
|
|
- // otherwise we can't do anything with SSE.
|
|
- if (numAligned >= 16)
|
|
- {
|
|
- aaend = aabeg + (numAligned & ~15); // make sure the slice is actually a multiple of 16 floats long
|
|
-
|
|
- // process values up to aligned slice one by one
|
|
- while (aptr < aabeg)
|
|
- *aptr++ ` ~ opD ~ ` value;
|
|
-
|
|
- // process aligned slice with fast SSE operations
|
|
- asm
|
|
- {
|
|
- mov ESI, aabeg;
|
|
- mov EDI, aaend;
|
|
- movss XMM4, value;
|
|
- shufps XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloopa:
|
|
- movaps XMM0, [ESI];
|
|
- movaps XMM1, [ESI+16];
|
|
- movaps XMM2, [ESI+32];
|
|
- movaps XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- ` ~ opSSE ~ ` XMM0, XMM4;
|
|
- ` ~ opSSE ~ ` XMM1, XMM4;
|
|
- ` ~ opSSE ~ ` XMM2, XMM4;
|
|
- ` ~ opSSE ~ ` XMM3, XMM4;
|
|
- movaps [ESI+ 0-64], XMM0;
|
|
- movaps [ESI+16-64], XMM1;
|
|
- movaps [ESI+32-64], XMM2;
|
|
- movaps [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopa;
|
|
- }
|
|
- aptr = aaend;
|
|
- }
|
|
- }
|
|
- else
|
|
- // 3DNow! version is 63% faster
|
|
- if (amd3dnow() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- ulong w = *cast(uint *) &value;
|
|
- ulong v = w | (w << 32L);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, dword ptr [aptr];
|
|
- mov EDI, dword ptr [n];
|
|
- movq MM4, qword ptr [v];
|
|
-
|
|
- align 8;
|
|
- start:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- ` ~ op3DNow ~ ` MM0, MM4;
|
|
- ` ~ op3DNow ~ ` MM1, MM4;
|
|
- ` ~ op3DNow ~ ` MM2, MM4;
|
|
- ` ~ op3DNow ~ ` MM3, MM4;
|
|
- movq [ESI], MM0;
|
|
- movq [ESI+8], MM1;
|
|
- movq [ESI+16], MM2;
|
|
- movq [ESI+24], MM3;
|
|
- add ESI, 32;
|
|
- cmp ESI, EDI;
|
|
- jb start;
|
|
-
|
|
- emms;
|
|
- mov dword ptr [aptr], ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ ` ~ opD ~ ` value;
|
|
-
|
|
- return a;`;
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceAddass_f(T[] a, T value)
|
|
-{
|
|
- mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceAddass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinass_f(T[] a, T value)
|
|
-{
|
|
- mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceminass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMulass_f(T[] a, T value)
|
|
-{
|
|
- mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] /= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceDivass_f(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMulass_f(a, 1f / value);
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceDivass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] /= 8;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] / 8))
|
|
- {
|
|
- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-/* ======================================================================== */
|
|
-
|
|
-/* template for the case
|
|
- * a[] = b[] ? value
|
|
- * with some binary operator ?
|
|
- */
|
|
-private template CodeGenSliceExpOp(string opD, string opSSE, string op3DNow)
|
|
-{
|
|
- const CodeGenSliceExpOp = `
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE version is 665% faster
|
|
- if (sse() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movss XMM4, value;
|
|
- shufps XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movups XMM0, [EAX];
|
|
- movups XMM1, [EAX+16];
|
|
- movups XMM2, [EAX+32];
|
|
- movups XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- ` ~ opSSE ~ ` XMM0, XMM4;
|
|
- ` ~ opSSE ~ ` XMM1, XMM4;
|
|
- ` ~ opSSE ~ ` XMM2, XMM4;
|
|
- ` ~ opSSE ~ ` XMM3, XMM4;
|
|
- movups [ESI+ 0-64], XMM0;
|
|
- movups [ESI+16-64], XMM1;
|
|
- movups [ESI+32-64], XMM2;
|
|
- movups [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- // 3DNow! version is 69% faster
|
|
- if (amd3dnow() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- ulong w = *cast(uint *) &value;
|
|
- ulong v = w | (w << 32L);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM4, qword ptr [v];
|
|
-
|
|
- align 8;
|
|
- start3dnow:
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- ` ~ op3DNow ~ ` MM0, MM4;
|
|
- ` ~ op3DNow ~ ` MM1, MM4;
|
|
- ` ~ op3DNow ~ ` MM2, MM4;
|
|
- ` ~ op3DNow ~ ` MM3, MM4;
|
|
- movq [ESI], MM0;
|
|
- movq [ESI+8], MM1;
|
|
- movq [ESI+16], MM2;
|
|
- movq [ESI+24], MM3;
|
|
- add ESI, 32;
|
|
- add EAX, 32;
|
|
- cmp ESI, EDI;
|
|
- jb start3dnow;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ ` ~ opD ~ ` value;
|
|
-
|
|
- return a;`;
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceExpOp!("+", "addps", "pfadd"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpAddSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceExpOp!("-", "subps", "pfsub"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMinSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] / value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAssign_f(a, 1f/value, b);
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpDivSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] / 8;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] / 8))
|
|
- {
|
|
- printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-/* ======================================================================== */
|
|
-
|
|
-private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow)
|
|
-{
|
|
- const CodeGenSliceOpAssign = `
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE version is 468% faster
|
|
- if (sse() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov ECX, bptr; // right operand
|
|
- mov ESI, aptr; // destination operand
|
|
- mov EDI, n; // end comparison
|
|
-
|
|
- align 8;
|
|
- startsseloopb:
|
|
- movups XMM0, [ESI];
|
|
- movups XMM1, [ESI+16];
|
|
- movups XMM2, [ESI+32];
|
|
- movups XMM3, [ESI+48];
|
|
- add ESI, 64;
|
|
- movups XMM4, [ECX];
|
|
- movups XMM5, [ECX+16];
|
|
- movups XMM6, [ECX+32];
|
|
- movups XMM7, [ECX+48];
|
|
- add ECX, 64;
|
|
- ` ~ opSSE ~ ` XMM0, XMM4;
|
|
- ` ~ opSSE ~ ` XMM1, XMM5;
|
|
- ` ~ opSSE ~ ` XMM2, XMM6;
|
|
- ` ~ opSSE ~ ` XMM3, XMM7;
|
|
- movups [ESI+ 0-64], XMM0;
|
|
- movups [ESI+16-64], XMM1;
|
|
- movups [ESI+32-64], XMM2;
|
|
- movups [ESI+48-64], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloopb;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- // 3DNow! version is 57% faster
|
|
- if (amd3dnow() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, dword ptr [aptr]; // destination operand
|
|
- mov EDI, dword ptr [n]; // end comparison
|
|
- mov ECX, dword ptr [bptr]; // right operand
|
|
-
|
|
- align 4;
|
|
- start3dnow:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM2, [ESI+16];
|
|
- movq MM3, [ESI+24];
|
|
- ` ~ op3DNow ~ ` MM0, [ECX];
|
|
- ` ~ op3DNow ~ ` MM1, [ECX+8];
|
|
- ` ~ op3DNow ~ ` MM2, [ECX+16];
|
|
- ` ~ op3DNow ~ ` MM3, [ECX+24];
|
|
- movq [ESI], MM0;
|
|
- movq [ESI+8], MM1;
|
|
- movq [ESI+16], MM2;
|
|
- movq [ESI+24], MM3;
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- cmp ESI, EDI;
|
|
- jb start3dnow;
|
|
-
|
|
- emms;
|
|
- mov dword ptr [aptr], ESI;
|
|
- mov dword ptr [bptr], ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ ` ~ opD ~ ` *bptr++;
|
|
-
|
|
- return a;`;
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddass_f(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] += b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinass_f(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulass_f(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul"));
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- c[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = value - b[]
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arrayExpSliceMinSliceAssign_f()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE version is 690% faster
|
|
- if (sse() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- // Unaligned case
|
|
- asm
|
|
- {
|
|
- mov EAX, bptr;
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movss XMM4, value;
|
|
- shufps XMM4, XMM4, 0;
|
|
-
|
|
- align 8;
|
|
- startsseloop:
|
|
- add ESI, 64;
|
|
- movaps XMM5, XMM4;
|
|
- movaps XMM6, XMM4;
|
|
- movups XMM0, [EAX];
|
|
- movups XMM1, [EAX+16];
|
|
- movups XMM2, [EAX+32];
|
|
- movups XMM3, [EAX+48];
|
|
- add EAX, 64;
|
|
- subps XMM5, XMM0;
|
|
- subps XMM6, XMM1;
|
|
- movups [ESI+ 0-64], XMM5;
|
|
- movups [ESI+16-64], XMM6;
|
|
- movaps XMM5, XMM4;
|
|
- movaps XMM6, XMM4;
|
|
- subps XMM5, XMM2;
|
|
- subps XMM6, XMM3;
|
|
- movups [ESI+32-64], XMM5;
|
|
- movups [ESI+48-64], XMM6;
|
|
- cmp ESI, EDI;
|
|
- jb startsseloop;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- // 3DNow! version is 67% faster
|
|
- if (amd3dnow() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- ulong w = *cast(uint *) &value;
|
|
- ulong v = w | (w << 32L);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM4, qword ptr [v];
|
|
-
|
|
- align 8;
|
|
- start3dnow:
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM2, [EAX+16];
|
|
- movq MM3, [EAX+24];
|
|
- pfsubr MM0, MM4;
|
|
- pfsubr MM1, MM4;
|
|
- pfsubr MM2, MM4;
|
|
- pfsubr MM3, MM4;
|
|
- movq [ESI], MM0;
|
|
- movq [ESI+8], MM1;
|
|
- movq [ESI+16], MM2;
|
|
- movq [ESI+24], MM3;
|
|
- add ESI, 32;
|
|
- add EAX, 32;
|
|
- cmp ESI, EDI;
|
|
- jb start3dnow;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = value - *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinSliceAssign_f unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = 6 - a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(6 - a[i]))
|
|
- {
|
|
- printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAddass_f(a, -value, b);
|
|
-}
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++ * value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAddass_f unittest\n");
|
|
-
|
|
- cpuid = 1;
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 1; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] += a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
|
|
- if (c[i] != cast(T)(b[i] + a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayint.d druntime/src/rt/arrayint.d
|
|
--- druntime-old/src/rt/arrayint.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arrayint.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,2430 +0,0 @@
|
|
-/**
|
|
- * Contains MMX versions of certain operations for dchar, int, and uint ('w',
|
|
- * 'i' and 'k' suffixes).
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arrayint;
|
|
-
|
|
-private import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 4;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.amd3dnow amd3dnow;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-alias int T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpAddSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 380% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 298% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- paddd MM0, MM2;
|
|
- paddd MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- if (a.length >= 2)
|
|
- {
|
|
- auto n = aptr + (a.length & ~1);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov EDX, value;
|
|
-
|
|
- align 4;
|
|
- start386:
|
|
- add ESI, 8;
|
|
- mov EBX, [EAX];
|
|
- mov ECX, [EAX+4];
|
|
- add EAX, 8;
|
|
- add EBX, EDX;
|
|
- add ECX, EDX;
|
|
- mov [ESI -8], EBX;
|
|
- mov [ESI+4-8], ECX;
|
|
- cmp ESI, EDI;
|
|
- jb start386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ + value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpAddSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1710% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 995% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM3, [ECX+8];
|
|
- add EAX, 16;
|
|
- add ECX, 16;
|
|
- paddd MM0, MM2;
|
|
- paddd MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
-normal:
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ + *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceAddass_w(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_k(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_i(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 83% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 81% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- paddd MM0, MM2;
|
|
- paddd MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- if (a.length >= 2)
|
|
- {
|
|
- auto n = aptr + (a.length & ~1);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EDX, value;
|
|
-
|
|
- align 4;
|
|
- start386:
|
|
- mov EBX, [ESI];
|
|
- mov ECX, [ESI+4];
|
|
- add ESI, 8;
|
|
- add EBX, EDX;
|
|
- add ECX, EDX;
|
|
- mov [ESI -8], EBX;
|
|
- mov [ESI+4-8], ECX;
|
|
- cmp ESI, EDI;
|
|
- jb start386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceAddass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- a[] += 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(c[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddass_w(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_k(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_i(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddass_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 695% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- paddd XMM0, XMM2;
|
|
- paddd XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 471% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM3, [ECX+8];
|
|
- add ESI, 16;
|
|
- add ECX, 16;
|
|
- paddd MM0, MM2;
|
|
- paddd MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
-normal:
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] += a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(b[i] + a[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMinSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 400% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 315% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- psubd MM0, MM2;
|
|
- psubd MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- if (a.length >= 2)
|
|
- {
|
|
- auto n = aptr + (a.length & ~1);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov EDX, value;
|
|
-
|
|
- align 4;
|
|
- start386:
|
|
- add ESI, 8;
|
|
- mov EBX, [EAX];
|
|
- mov ECX, [EAX+4];
|
|
- add EAX, 8;
|
|
- sub EBX, EDX;
|
|
- sub ECX, EDX;
|
|
- mov [ESI -8], EBX;
|
|
- mov [ESI+4-8], ECX;
|
|
- cmp ESI, EDI;
|
|
- jb start386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ - value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMinSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = value - b[]
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_i(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_i(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arrayExpSliceMinSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1812% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM2, [EAX];
|
|
- movdqu XMM3, [EAX+16];
|
|
- movdqa XMM0, XMM4;
|
|
- movdqa XMM1, XMM4;
|
|
- add EAX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM4, l;
|
|
- pshufd XMM4, XMM4, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM2, [EAX];
|
|
- movdqa XMM3, [EAX+16];
|
|
- movdqa XMM0, XMM4;
|
|
- movdqa XMM1, XMM4;
|
|
- add EAX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1077% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM4, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM2, [EAX];
|
|
- movq MM3, [EAX+8];
|
|
- movq MM0, MM4;
|
|
- movq MM1, MM4;
|
|
- add EAX, 16;
|
|
- psubd MM0, MM2;
|
|
- psubd MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = value - *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = 6 - a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(6 - a[i]))
|
|
- {
|
|
- printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1721% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1002% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM3, [ECX+8];
|
|
- add EAX, 16;
|
|
- add ECX, 16;
|
|
- psubd MM0, MM2;
|
|
- psubd MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ - *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinass_w(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_k(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_i(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 81% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 81% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- psubd MM0, MM2;
|
|
- psubd MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- if (a.length >= 2)
|
|
- {
|
|
- auto n = aptr + (a.length & ~1);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EDX, value;
|
|
-
|
|
- align 4;
|
|
- start386:
|
|
- mov EBX, [ESI];
|
|
- mov ECX, [ESI+4];
|
|
- add ESI, 8;
|
|
- sub EBX, EDX;
|
|
- sub ECX, EDX;
|
|
- mov [ESI -8], EBX;
|
|
- mov [ESI+4-8], ECX;
|
|
- cmp ESI, EDI;
|
|
- jb start386;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- a[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(c[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinass_w(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_k(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_i(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMinass_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 731% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- psubd XMM0, XMM2;
|
|
- psubd XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 441% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM3, [ECX+8];
|
|
- add ESI, 16;
|
|
- add ECX, 16;
|
|
- psubd MM0, MM2;
|
|
- psubd MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] -= a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(b[i] - a[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAssign_i(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMulSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (none) // multiplying a pair is not supported by MMX
|
|
- {
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1380% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- // MMX version is 1380% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- pmuludq MM0, MM2; // only multiplies low 32 bits
|
|
- pmuludq MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ * value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulSliceAssign_i(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulSliceAssign_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (none)
|
|
- {
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 1407% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1029% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM3, [ECX+8];
|
|
- add EAX, 16;
|
|
- add ECX, 16;
|
|
- pmuludq MM0, MM2;
|
|
- pmuludq MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = *bptr++ * *cptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulSliceAssign_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMulass_w(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMulass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMulass_k(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMulass_i(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMulass_i(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (none)
|
|
- {
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 400% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = value;
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 402% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movq MM2, l;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- pmuludq MM0, MM2;
|
|
- pmuludq MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = a[];
|
|
- a[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(b[i] * 6))
|
|
- {
|
|
- printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulass_w(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulass_k(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulass_i(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulass_i(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulass_i()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (none)
|
|
- {
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 873% faster
|
|
- if (sse2() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- pmuludq XMM0, XMM2;
|
|
- pmuludq XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-/+ BUG: comment out this section until we figure out what is going
|
|
- wrong with the invalid pshufd instructions.
|
|
-
|
|
- else
|
|
- // MMX version is 573% faster
|
|
- if (mmx() && a.length >= 4)
|
|
- {
|
|
- auto n = aptr + (a.length & ~3);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM3, [ECX+8];
|
|
- pxor MM4, MM4;
|
|
- pxor MM5, MM5;
|
|
- punpckldq MM4, MM0;
|
|
- punpckldq MM5, MM2;
|
|
- add ESI, 16;
|
|
- add ECX, 16;
|
|
- pmuludq MM4, MM5;
|
|
- pshufd MM4, MM4, 8; // ?
|
|
- movq [ESI -16], MM4;
|
|
- pxor MM4, MM4;
|
|
- pxor MM5, MM5;
|
|
- punpckldq MM4, MM1;
|
|
- punpckldq MM5, MM3;
|
|
- pmuludq MM4, MM5;
|
|
- pshufd MM4, MM4, 8; // ?
|
|
- movq [ESI+8-16], MM4;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
-+/
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulass_i unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = a[];
|
|
- a[] *= c[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(b[i] * c[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayreal.d druntime/src/rt/arrayreal.d
|
|
--- druntime-old/src/rt/arrayreal.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arrayreal.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,241 +0,0 @@
|
|
-/**
|
|
- * Contains SSE2 and MMX versions of certain operations for real.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arrayreal;
|
|
-
|
|
-import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 1;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.amd3dnow amd3dnow;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-alias real T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_r(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- for (int i = 0; i < a.length; i++)
|
|
- a[i] = b[i] + c[i];
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_r unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %Lg != %Lg + %Lg\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_r(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- for (int i = 0; i < a.length; i++)
|
|
- a[i] = b[i] - c[i];
|
|
- return a;
|
|
-}
|
|
-
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_r unittest\n");
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %Lg != %Lg - %Lg\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceMinass_r(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAddass_r(a, -value, b);
|
|
-}
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAddass_r(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- // Handle remainder
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++ * value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAddass_r unittest\n");
|
|
-
|
|
- cpuid = 1;
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 1; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] += a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- //printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]);
|
|
- if (c[i] != cast(T)(b[i] + a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %Lg ?= %Lg + %Lg * 6\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/arrayshort.d druntime/src/rt/arrayshort.d
|
|
--- druntime-old/src/rt/arrayshort.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/arrayshort.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,2303 +0,0 @@
|
|
-/**
|
|
- * Contains SSE2 and MMX versions of certain operations for wchar, short,
|
|
- * and ushort ('u', 's' and 't' suffixes).
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2008 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright, based on code originally written by Burton Radons
|
|
- *
|
|
- * Copyright Digital Mars 2008 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.arrayshort;
|
|
-
|
|
-private import core.cpuid;
|
|
-
|
|
-version (unittest)
|
|
-{
|
|
- private import core.stdc.stdio : printf;
|
|
- /* This is so unit tests will test every CPU variant
|
|
- */
|
|
- int cpuid;
|
|
- const int CPUID_MAX = 4;
|
|
- bool mmx() { return cpuid == 1 && core.cpuid.mmx(); }
|
|
- bool sse() { return cpuid == 2 && core.cpuid.sse(); }
|
|
- bool sse2() { return cpuid == 3 && core.cpuid.sse2(); }
|
|
- bool amd3dnow() { return cpuid == 4 && core.cpuid.amd3dnow(); }
|
|
-}
|
|
-else
|
|
-{
|
|
- alias core.cpuid.mmx mmx;
|
|
- alias core.cpuid.sse sse;
|
|
- alias core.cpuid.sse2 sse2;
|
|
- alias core.cpuid.sse2 sse2;
|
|
-}
|
|
-
|
|
-//version = log;
|
|
-
|
|
-bool disjoint(T)(T[] a, T[] b)
|
|
-{
|
|
- return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
|
|
-}
|
|
-
|
|
-alias short T;
|
|
-
|
|
-extern (C):
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpAddSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpAddSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 3343% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 3343% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- paddw MM0, MM2;
|
|
- paddw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ + value);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpAddSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] + c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 3777% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 2068% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- movq MM2, [ECX];
|
|
- movq MM3, [ECX+8];
|
|
- add ECX, 16;
|
|
- paddw MM0, MM2;
|
|
- paddw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ + *cptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] + b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] + b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceAddass_u(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_t(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceAddass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceAddass_s(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 832% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 826% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- paddw MM0, MM2;
|
|
- paddw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceAddass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- a[] += 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(c[i] + 6))
|
|
- {
|
|
- printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] += b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceAddass_u(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_t(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceAddass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceAddass_s(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceAddass_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 2085% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- paddw XMM0, XMM2;
|
|
- paddw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1022% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- start:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- movq MM2, [ECX];
|
|
- movq MM3, [ECX+8];
|
|
- add ECX, 16;
|
|
- paddw MM0, MM2;
|
|
- paddw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb start;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ += *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceAddass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] += a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(b[i] + a[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMinSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMinSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 3695% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 3049% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- psubw MM0, MM2;
|
|
- psubw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ - value);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMinSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = value - b[]
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_s(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
|
|
-{
|
|
- return _arrayExpSliceMinSliceAssign_s(a, b, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arrayExpSliceMinSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 4995% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
- movd XMM3, l;
|
|
- pshufd XMM3, XMM3, 0;
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubw XMM2, XMM0;
|
|
- psubw XMM3, XMM1;
|
|
- movdqu [ESI -32], XMM2;
|
|
- movdqu [ESI+16-32], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
- movd XMM3, l;
|
|
- pshufd XMM3, XMM3, 0;
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- psubw XMM2, XMM0;
|
|
- psubw XMM3, XMM1;
|
|
- movdqa [ESI -32], XMM2;
|
|
- movdqa [ESI+16-32], XMM3;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 4562% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM4, l;
|
|
- pshufw MM4, MM4, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM2, [EAX];
|
|
- movq MM3, [EAX+8];
|
|
- movq MM0, MM4;
|
|
- movq MM1, MM4;
|
|
- add EAX, 16;
|
|
- psubw MM0, MM2;
|
|
- psubw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(value - *bptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = 6 - a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(6 - a[i]))
|
|
- {
|
|
- printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] - c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 4129% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 2018% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- movq MM2, [ECX];
|
|
- movq MM3, [ECX+8];
|
|
- add ECX, 16;
|
|
- psubw MM0, MM2;
|
|
- psubw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ - *cptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] - b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] - b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMinass_u(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_t(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMinass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMinass_s(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 835% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= (l << 16);
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startaddsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startaddsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 835% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- psubw MM0, MM2;
|
|
- psubw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMinass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- a[] = c[];
|
|
- a[] -= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(c[i] - 6))
|
|
- {
|
|
- printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] -= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMinass_u(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_t(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMinass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMinass_s(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMinass_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 2121% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm // unaligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm // aligned case
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ECX, 32;
|
|
- psubw XMM0, XMM2;
|
|
- psubw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1116% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- start:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- movq MM2, [ECX];
|
|
- movq MM3, [ECX+8];
|
|
- add ECX, 16;
|
|
- psubw MM0, MM2;
|
|
- psubw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb start;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ -= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMinass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = c[];
|
|
- c[] -= a[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(b[i] - a[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * value
|
|
- */
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
|
|
-{
|
|
- return _arraySliceExpMulSliceAssign_s(a, value, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length);
|
|
- assert(disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceExpMulSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 3733% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= l << 16;
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- add EAX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 3733% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM1, [EAX+8];
|
|
- add EAX, 16;
|
|
- pmullw MM0, MM2;
|
|
- pmullw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ * value);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceExpMulSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * 6))
|
|
- {
|
|
- printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] = b[] * c[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulSliceAssign_s(a, c, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
|
|
-in
|
|
-{
|
|
- assert(a.length == b.length && b.length == c.length);
|
|
- assert(disjoint(a, b));
|
|
- assert(disjoint(a, c));
|
|
- assert(disjoint(b, c));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulSliceAssign_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
- auto cptr = c.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 2515% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- add ESI, 32;
|
|
- movdqu XMM0, [EAX];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [EAX+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- add ESI, 32;
|
|
- movdqa XMM0, [EAX];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [EAX+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add EAX, 32;
|
|
- add ECX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 2515% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov EAX, bptr;
|
|
- mov ECX, cptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- add ESI, 16;
|
|
- movq MM0, [EAX];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [EAX+8];
|
|
- movq MM3, [ECX+8];
|
|
- add EAX, 16;
|
|
- add ECX, 16;
|
|
- pmullw MM0, MM2;
|
|
- pmullw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, EAX;
|
|
- mov cptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ = cast(T)(*bptr++ * *cptr++);
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulSliceAssign_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- c[] = a[] * b[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (c[i] != cast(T)(a[i] * b[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= value
|
|
- */
|
|
-
|
|
-T[] _arrayExpSliceMulass_u(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMulass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMulass_t(T[] a, T value)
|
|
-{
|
|
- return _arrayExpSliceMulass_s(a, value);
|
|
-}
|
|
-
|
|
-T[] _arrayExpSliceMulass_s(T[] a, T value)
|
|
-{
|
|
- //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 2044% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
- l |= l << 16;
|
|
-
|
|
- if (((cast(uint) aptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM2;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd XMM2, l;
|
|
- pshufd XMM2, XMM2, 0;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM1, [ESI+16];
|
|
- add ESI, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM2;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 2056% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- uint l = cast(ushort) value;
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- movd MM2, l;
|
|
- pshufw MM2, MM2, 0;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM1, [ESI+8];
|
|
- add ESI, 16;
|
|
- pmullw MM0, MM2;
|
|
- pmullw MM1, MM2;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= value;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arrayExpSliceMulass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = a[];
|
|
- a[] *= 6;
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(b[i] * 6))
|
|
- {
|
|
- printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-/* ======================================================================== */
|
|
-
|
|
-/***********************
|
|
- * Computes:
|
|
- * a[] *= b[]
|
|
- */
|
|
-
|
|
-T[] _arraySliceSliceMulass_u(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulass_t(T[] a, T[] b)
|
|
-{
|
|
- return _arraySliceSliceMulass_s(a, b);
|
|
-}
|
|
-
|
|
-T[] _arraySliceSliceMulass_s(T[] a, T[] b)
|
|
-in
|
|
-{
|
|
- assert (a.length == b.length);
|
|
- assert (disjoint(a, b));
|
|
-}
|
|
-body
|
|
-{
|
|
- //printf("_arraySliceSliceMulass_s()\n");
|
|
- auto aptr = a.ptr;
|
|
- auto aend = aptr + a.length;
|
|
- auto bptr = b.ptr;
|
|
-
|
|
- version (D_InlineAsm_X86)
|
|
- {
|
|
- // SSE2 aligned version is 2519% faster
|
|
- if (sse2() && a.length >= 16)
|
|
- {
|
|
- auto n = aptr + (a.length & ~15);
|
|
-
|
|
- if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2u:
|
|
- movdqu XMM0, [ESI];
|
|
- movdqu XMM2, [ECX];
|
|
- movdqu XMM1, [ESI+16];
|
|
- movdqu XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM3;
|
|
- movdqu [ESI -32], XMM0;
|
|
- movdqu [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2u;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startsse2a:
|
|
- movdqa XMM0, [ESI];
|
|
- movdqa XMM2, [ECX];
|
|
- movdqa XMM1, [ESI+16];
|
|
- movdqa XMM3, [ECX+16];
|
|
- add ESI, 32;
|
|
- add ECX, 32;
|
|
- pmullw XMM0, XMM2;
|
|
- pmullw XMM1, XMM3;
|
|
- movdqa [ESI -32], XMM0;
|
|
- movdqa [ESI+16-32], XMM1;
|
|
- cmp ESI, EDI;
|
|
- jb startsse2a;
|
|
-
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
- else
|
|
- // MMX version is 1712% faster
|
|
- if (mmx() && a.length >= 8)
|
|
- {
|
|
- auto n = aptr + (a.length & ~7);
|
|
-
|
|
- asm
|
|
- {
|
|
- mov ESI, aptr;
|
|
- mov EDI, n;
|
|
- mov ECX, bptr;
|
|
-
|
|
- align 4;
|
|
- startmmx:
|
|
- movq MM0, [ESI];
|
|
- movq MM2, [ECX];
|
|
- movq MM1, [ESI+8];
|
|
- movq MM3, [ECX+8];
|
|
- add ESI, 16;
|
|
- add ECX, 16;
|
|
- pmullw MM0, MM2;
|
|
- pmullw MM1, MM3;
|
|
- movq [ESI -16], MM0;
|
|
- movq [ESI+8-16], MM1;
|
|
- cmp ESI, EDI;
|
|
- jb startmmx;
|
|
-
|
|
- emms;
|
|
- mov aptr, ESI;
|
|
- mov bptr, ECX;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- while (aptr < aend)
|
|
- *aptr++ *= *bptr++;
|
|
-
|
|
- return a;
|
|
-}
|
|
-
|
|
-unittest
|
|
-{
|
|
- printf("_arraySliceSliceMulass_s unittest\n");
|
|
-
|
|
- for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
|
|
- {
|
|
- version (log) printf(" cpuid %d\n", cpuid);
|
|
-
|
|
- for (int j = 0; j < 2; j++)
|
|
- {
|
|
- const int dim = 67;
|
|
- T[] a = new T[dim + j]; // aligned on 16 byte boundary
|
|
- a = a[j .. dim + j]; // misalign for second iteration
|
|
- T[] b = new T[dim + j];
|
|
- b = b[j .. dim + j];
|
|
- T[] c = new T[dim + j];
|
|
- c = c[j .. dim + j];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- { a[i] = cast(T)i;
|
|
- b[i] = cast(T)(i + 7);
|
|
- c[i] = cast(T)(i * 2);
|
|
- }
|
|
-
|
|
- b[] = a[];
|
|
- a[] *= c[];
|
|
-
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- if (a[i] != cast(T)(b[i] * c[i]))
|
|
- {
|
|
- printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
|
|
- assert(0);
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh.c druntime/src/rt/deh.c
|
|
--- druntime-old/src/rt/deh.c 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/deh.c 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,734 +0,0 @@
|
|
-/**
|
|
- * Implementation of exception handling support routines for Windows.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 1999 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright
|
|
- *
|
|
- * Copyright Digital Mars 1999 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-#include <stdio.h>
|
|
-#include <string.h>
|
|
-#include <assert.h>
|
|
-#include <stdlib.h>
|
|
-
|
|
-/* ======================== Win32 =============================== */
|
|
-
|
|
-#if _WIN32
|
|
-
|
|
-#include <excpt.h>
|
|
-#include <windows.h>
|
|
-
|
|
-//#include "\sc\src\include\ehsup.h"
|
|
-
|
|
-/*** From Digital Mars C runtime library ***/
|
|
-EXCEPTION_DISPOSITION __cdecl _local_except_handler (EXCEPTION_RECORD *ExceptionRecord,
|
|
- void* EstablisherFrame,
|
|
- void *ContextRecord,
|
|
- void *DispatcherContext
|
|
- );
|
|
-void __cdecl _global_unwind(void *frame,EXCEPTION_RECORD *eRecord);
|
|
-#define EXCEPTION_UNWIND 6 // Flag to indicate if the system is unwinding
|
|
-
|
|
-extern DWORD _except_list;
|
|
-/*** ***/
|
|
-
|
|
-#include "mars.h"
|
|
-
|
|
-extern ClassInfo D6object9Throwable7__ClassZ;
|
|
-#define _Class_9Throwable D6object9Throwable7__ClassZ;
|
|
-
|
|
-extern ClassInfo D6object5Error7__ClassZ;
|
|
-#define _Class_5Error D6object5Error7__ClassZ
|
|
-
|
|
-typedef int (__pascal *fp_t)(); // function pointer in ambient memory model
|
|
-
|
|
-void _d_setunhandled(Object*);
|
|
-
|
|
-// The layout of DEstablisherFrame is the same for C++
|
|
-
|
|
-struct DEstablisherFrame
|
|
-{
|
|
- void *prev; // pointer to previous exception list
|
|
- void *handler; // pointer to routine for exception handler
|
|
- DWORD table_index; // current index into handler_info[]
|
|
- DWORD ebp; // this is EBP of routine
|
|
-};
|
|
-
|
|
-struct DHandlerInfo
|
|
-{
|
|
- int prev_index; // previous table index
|
|
- unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch)
|
|
- void *finally_code; // pointer to finally code to execute
|
|
- // (!=0 if try-finally)
|
|
-};
|
|
-
|
|
-// Address of DHandlerTable is passed in EAX to _d_framehandler()
|
|
-
|
|
-struct DHandlerTable
|
|
-{
|
|
- void *fptr; // pointer to start of function
|
|
- unsigned espoffset; // offset of ESP from EBP
|
|
- unsigned retoffset; // offset from start of function to return code
|
|
- struct DHandlerInfo handler_info[1];
|
|
-};
|
|
-
|
|
-struct DCatchBlock
|
|
-{
|
|
- ClassInfo *type; // catch type
|
|
- unsigned bpoffset; // EBP offset of catch var
|
|
- void *code; // catch handler code
|
|
-};
|
|
-
|
|
-// Create one of these for each try-catch
|
|
-struct DCatchInfo
|
|
-{
|
|
- unsigned ncatches; // number of catch blocks
|
|
- struct DCatchBlock catch_block[1]; // data for each catch block
|
|
-};
|
|
-
|
|
-// Macro to make our own exception code
|
|
-#define MAKE_EXCEPTION_CODE(severity, facility, exception) \
|
|
- (((severity) << 30) | (1 << 29) | (0 << 28) | ((facility) << 16) | (exception))
|
|
-
|
|
-#define STATUS_DIGITAL_MARS_D_EXCEPTION MAKE_EXCEPTION_CODE(3,'D',1)
|
|
-
|
|
-Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record);
|
|
-void __cdecl _d_local_unwind(struct DHandlerTable *handler_table, struct DEstablisherFrame *frame, int stop_index);
|
|
-
|
|
-
|
|
-/***********************************
|
|
- * The frame handler, this is called for each frame that has been registered
|
|
- * in the OS except_list.
|
|
- * Input:
|
|
- * EAX the handler table for the frame
|
|
- */
|
|
-
|
|
-EXCEPTION_DISPOSITION _d_framehandler(
|
|
- EXCEPTION_RECORD *exception_record,
|
|
- struct DEstablisherFrame *frame,
|
|
- CONTEXT *context,
|
|
- void *dispatcher_context)
|
|
-{
|
|
- struct DHandlerTable *handler_table;
|
|
-
|
|
- __asm { mov handler_table,EAX }
|
|
-
|
|
- if (exception_record->ExceptionFlags & EXCEPTION_UNWIND)
|
|
- {
|
|
- // Call all the finally blocks in this frame
|
|
- _d_local_unwind(handler_table, frame, -1);
|
|
- }
|
|
- else
|
|
- {
|
|
- // Jump to catch block if matching one is found
|
|
-
|
|
- int ndx,prev_ndx,i;
|
|
- struct DHandlerInfo *phi;
|
|
- struct DCatchInfo *pci;
|
|
- struct DCatchBlock *pcb;
|
|
- unsigned ncatches; // number of catches in the current handler
|
|
- Object *pti;
|
|
- ClassInfo *ci;
|
|
-
|
|
- ci = NULL; // only compute it if we need it
|
|
-
|
|
- // walk through handler table, checking each handler
|
|
- // with an index smaller than the current table_index
|
|
- for (ndx = frame->table_index; ndx != -1; ndx = prev_ndx)
|
|
- {
|
|
- phi = &handler_table->handler_info[ndx];
|
|
- prev_ndx = phi->prev_index;
|
|
- if (phi->cioffset)
|
|
- {
|
|
- // this is a catch handler (no finally)
|
|
- pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset);
|
|
- ncatches = pci->ncatches;
|
|
- for (i = 0; i < ncatches; i++)
|
|
- {
|
|
- pcb = &pci->catch_block[i];
|
|
-
|
|
- if (!ci)
|
|
- {
|
|
- // This code must match the translation code
|
|
- if (exception_record->ExceptionCode == STATUS_DIGITAL_MARS_D_EXCEPTION)
|
|
- {
|
|
- //printf("ei[0] = %p\n", exception_record->ExceptionInformation[0]);
|
|
- ci = **(ClassInfo ***)(exception_record->ExceptionInformation[0]);
|
|
- }
|
|
- else
|
|
- ci = &_Class_9Throwable;
|
|
- }
|
|
-
|
|
- if (_d_isbaseof(ci, pcb->type))
|
|
- {
|
|
- // Matched the catch type, so we've found the handler.
|
|
- int regebp;
|
|
-
|
|
- pti = _d_translate_se_to_d_exception(exception_record);
|
|
-
|
|
- // Initialize catch variable
|
|
- regebp = (int)&frame->ebp; // EBP for this frame
|
|
- *(void **)(regebp + (pcb->bpoffset)) = pti;
|
|
-
|
|
- _d_setunhandled(pti);
|
|
-
|
|
- // Have system call all finally blocks in intervening frames
|
|
- _global_unwind(frame, exception_record);
|
|
-
|
|
- // Call all the finally blocks skipped in this frame
|
|
- _d_local_unwind(handler_table, frame, ndx);
|
|
-
|
|
- _d_setunhandled(NULL);
|
|
-
|
|
- frame->table_index = prev_ndx; // we are out of this handler
|
|
-
|
|
- // Jump to catch block. Does not return.
|
|
- {
|
|
- unsigned catch_esp;
|
|
- fp_t catch_addr;
|
|
-
|
|
- catch_addr = (fp_t)(pcb->code);
|
|
- catch_esp = regebp - handler_table->espoffset - sizeof(fp_t);
|
|
- _asm
|
|
- {
|
|
- mov EAX,catch_esp
|
|
- mov ECX,catch_addr
|
|
- mov [EAX],ECX
|
|
- mov EBP,regebp
|
|
- mov ESP,EAX // reset stack
|
|
- ret // jump to catch block
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- return ExceptionContinueSearch;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Exception filter for use in __try..__except block
|
|
- * surrounding call to Dmain()
|
|
- */
|
|
-
|
|
-int _d_exception_filter(struct _EXCEPTION_POINTERS *eptrs,
|
|
- int retval,
|
|
- Object **exception_object)
|
|
-{
|
|
- *exception_object = _d_translate_se_to_d_exception(eptrs->ExceptionRecord);
|
|
- return retval;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Throw a D object.
|
|
- */
|
|
-
|
|
-void __stdcall _d_throw(Object *h)
|
|
-{
|
|
- //printf("_d_throw(h = %p, &h = %p)\n", h, &h);
|
|
- //printf("\tvptr = %p\n", *(void **)h);
|
|
- RaiseException(STATUS_DIGITAL_MARS_D_EXCEPTION,
|
|
- EXCEPTION_NONCONTINUABLE,
|
|
- 1, (DWORD *)&h);
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Create an exception object
|
|
- */
|
|
-
|
|
-Object *_d_create_exception_object(ClassInfo *ci, char *msg)
|
|
-{
|
|
- Throwable *exc;
|
|
-
|
|
- exc = (Throwable *)_d_newclass(ci);
|
|
- // BUG: what if _d_newclass() throws an out of memory exception?
|
|
-
|
|
- if (msg)
|
|
- {
|
|
- exc->msglen = strlen(msg);
|
|
- exc->msg = msg;
|
|
- }
|
|
- return (Object *)exc;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Converts a Windows Structured Exception code to a D Exception Object.
|
|
- */
|
|
-
|
|
-Object *_d_translate_se_to_d_exception(EXCEPTION_RECORD *exception_record)
|
|
-{
|
|
- Object *pti;
|
|
-
|
|
- switch (exception_record->ExceptionCode) {
|
|
- case STATUS_DIGITAL_MARS_D_EXCEPTION:
|
|
- // Generated D exception
|
|
- pti = (Object *)(exception_record->ExceptionInformation[0]);
|
|
- break;
|
|
-
|
|
- case STATUS_INTEGER_DIVIDE_BY_ZERO:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Integer Divide by Zero");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_DIVIDE_BY_ZERO:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Float Divide by Zero");
|
|
- break;
|
|
-
|
|
- case STATUS_ACCESS_VIOLATION:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Access Violation");
|
|
- break;
|
|
-
|
|
- case STATUS_STACK_OVERFLOW:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Stack Overflow");
|
|
- break;
|
|
-
|
|
- case STATUS_DATATYPE_MISALIGNMENT:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Datatype Misalignment");
|
|
- break;
|
|
-
|
|
- case STATUS_ARRAY_BOUNDS_EXCEEDED:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Array Bounds Exceeded");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_INVALID_OPERATION:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Invalid Floating Point Operation");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_DENORMAL_OPERAND:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Denormal Operand");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_INEXACT_RESULT:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Inexact Result");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_OVERFLOW:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Overflow");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_UNDERFLOW:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Underflow");
|
|
- break;
|
|
-
|
|
- case STATUS_FLOAT_STACK_CHECK:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Floating Point Stack Check");
|
|
- break;
|
|
-
|
|
- case STATUS_PRIVILEGED_INSTRUCTION:
|
|
- if (*((unsigned char *)(exception_record->ExceptionAddress))==0xF4) { // HLT
|
|
- pti = _d_create_exception_object(&_Class_5Error, "assert(0) or HLT instruction");
|
|
- } else {
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Privileged Instruction");
|
|
- }
|
|
- break;
|
|
-
|
|
- case STATUS_ILLEGAL_INSTRUCTION:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Illegal Instruction");
|
|
- break;
|
|
-
|
|
- case STATUS_BREAKPOINT:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Breakpoint");
|
|
- break;
|
|
-
|
|
- case STATUS_IN_PAGE_ERROR:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Win32 In Page Exception");
|
|
- break;
|
|
-/*
|
|
- case STATUS_INTEGER_OVERFLOW: // not supported on any x86 processor
|
|
- case STATUS_INVALID_DISPOSITION:
|
|
- case STATUS_NONCONTINUABLE_EXCEPTION:
|
|
- case STATUS_SINGLE_STEP:
|
|
- case DBG_CONTROL_C: // only when a debugger is attached
|
|
- // In DMC, but not in Microsoft docs
|
|
- case STATUS_GUARD_PAGE_VIOLATION:
|
|
- case STATUS_INVALID_HANDLE:
|
|
-*/
|
|
- // convert all other exception codes into a Win32Exception
|
|
- default:
|
|
- pti = _d_create_exception_object(&_Class_5Error, "Win32 Exception");
|
|
- break;
|
|
- }
|
|
-
|
|
- return pti;
|
|
-}
|
|
-
|
|
-/**************************************
|
|
- * Call finally blocks in the current stack frame until stop_index.
|
|
- * This is roughly equivalent to _local_unwind() for C in \src\win32\ehsup.c
|
|
- */
|
|
-
|
|
-void __cdecl _d_local_unwind(struct DHandlerTable *handler_table,
|
|
- struct DEstablisherFrame *frame, int stop_index)
|
|
-{
|
|
- struct DHandlerInfo *phi;
|
|
- struct DCatchInfo *pci;
|
|
- int i;
|
|
-
|
|
- // Set up a special exception handler to catch double-fault exceptions.
|
|
- __asm
|
|
- {
|
|
- push dword ptr -1
|
|
- push dword ptr 0
|
|
- push offset _local_except_handler // defined in src\win32\ehsup.c
|
|
- push dword ptr fs:_except_list
|
|
- mov FS:_except_list,ESP
|
|
- }
|
|
-
|
|
- for (i = frame->table_index; i != -1 && i != stop_index; i = phi->prev_index)
|
|
- {
|
|
- phi = &handler_table->handler_info[i];
|
|
- if (phi->finally_code)
|
|
- {
|
|
- // Note that it is unnecessary to adjust the ESP, as the finally block
|
|
- // accesses all items on the stack as relative to EBP.
|
|
-
|
|
- DWORD *catch_ebp = &frame->ebp;
|
|
- void *blockaddr = phi->finally_code;
|
|
-
|
|
- _asm
|
|
- {
|
|
- push EBX
|
|
- mov EBX,blockaddr
|
|
- push EBP
|
|
- mov EBP,catch_ebp
|
|
- call EBX
|
|
- pop EBP
|
|
- pop EBX
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- _asm
|
|
- {
|
|
- pop FS:_except_list
|
|
- add ESP,12
|
|
- }
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * external version of the unwinder
|
|
- */
|
|
-
|
|
-__declspec(naked) void __cdecl _d_local_unwind2()
|
|
-{
|
|
- __asm
|
|
- {
|
|
- jmp _d_local_unwind
|
|
- }
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * The frame handler, this is called for each frame that has been registered
|
|
- * in the OS except_list.
|
|
- * Input:
|
|
- * EAX the handler table for the frame
|
|
- */
|
|
-
|
|
-EXCEPTION_DISPOSITION _d_monitor_handler(
|
|
- EXCEPTION_RECORD *exception_record,
|
|
- struct DEstablisherFrame *frame,
|
|
- CONTEXT *context,
|
|
- void *dispatcher_context)
|
|
-{
|
|
- if (exception_record->ExceptionFlags & EXCEPTION_UNWIND)
|
|
- {
|
|
- _d_monitorexit((Object *)frame->table_index);
|
|
- }
|
|
- else
|
|
- {
|
|
- }
|
|
- return ExceptionContinueSearch;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- */
|
|
-
|
|
-void _d_monitor_prolog(void *x, void *y, Object *h)
|
|
-{
|
|
- __asm
|
|
- {
|
|
- push EAX
|
|
- }
|
|
- //printf("_d_monitor_prolog(x=%p, y=%p, h=%p)\n", x, y, h);
|
|
- _d_monitorenter(h);
|
|
- __asm
|
|
- {
|
|
- pop EAX
|
|
- }
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- */
|
|
-
|
|
-void _d_monitor_epilog(void *x, void *y, Object *h)
|
|
-{
|
|
- //printf("_d_monitor_epilog(x=%p, y=%p, h=%p)\n", x, y, h);
|
|
- __asm
|
|
- {
|
|
- push EAX
|
|
- push EDX
|
|
- }
|
|
- _d_monitorexit(h);
|
|
- __asm
|
|
- {
|
|
- pop EDX
|
|
- pop EAX
|
|
- }
|
|
-}
|
|
-
|
|
-#endif
|
|
-
|
|
-/* ======================== linux =============================== */
|
|
-
|
|
-#if linux
|
|
-
|
|
-#include "mars.h"
|
|
-
|
|
-extern ClassInfo D6object9Throwable7__ClassZ;
|
|
-#define _Class_9Throwable D6object9Throwable7__ClassZ;
|
|
-
|
|
-extern ClassInfo D6object5Error7__ClassZ;
|
|
-#define _Class_5Error D6object5Error7__ClassZ
|
|
-
|
|
-typedef int (*fp_t)(); // function pointer in ambient memory model
|
|
-
|
|
-struct DHandlerInfo
|
|
-{
|
|
- unsigned offset; // offset from function address to start of guarded section
|
|
- int prev_index; // previous table index
|
|
- unsigned cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch)
|
|
- void *finally_code; // pointer to finally code to execute
|
|
- // (!=0 if try-finally)
|
|
-};
|
|
-
|
|
-// Address of DHandlerTable, searched for by eh_finddata()
|
|
-
|
|
-struct DHandlerTable
|
|
-{
|
|
- void *fptr; // pointer to start of function
|
|
- unsigned espoffset; // offset of ESP from EBP
|
|
- unsigned retoffset; // offset from start of function to return code
|
|
- unsigned nhandlers; // dimension of handler_info[]
|
|
- struct DHandlerInfo handler_info[1];
|
|
-};
|
|
-
|
|
-struct DCatchBlock
|
|
-{
|
|
- ClassInfo *type; // catch type
|
|
- unsigned bpoffset; // EBP offset of catch var
|
|
- void *code; // catch handler code
|
|
-};
|
|
-
|
|
-// Create one of these for each try-catch
|
|
-struct DCatchInfo
|
|
-{
|
|
- unsigned ncatches; // number of catch blocks
|
|
- struct DCatchBlock catch_block[1]; // data for each catch block
|
|
-};
|
|
-
|
|
-// One of these is generated for each function with try-catch or try-finally
|
|
-
|
|
-struct FuncTable
|
|
-{
|
|
- void *fptr; // pointer to start of function
|
|
- struct DHandlerTable *handlertable; // eh data for this function
|
|
- unsigned size; // size of function in bytes
|
|
-};
|
|
-
|
|
-extern struct FuncTable *table_start;
|
|
-extern struct FuncTable *table_end;
|
|
-
|
|
-void terminate()
|
|
-{
|
|
-// _asm
|
|
-// {
|
|
-// hlt
|
|
-// }
|
|
-}
|
|
-
|
|
-/*******************************************
|
|
- * Given address that is inside a function,
|
|
- * figure out which function it is in.
|
|
- * Return DHandlerTable if there is one, NULL if not.
|
|
- */
|
|
-
|
|
-struct DHandlerTable *__eh_finddata(void *address)
|
|
-{
|
|
- struct FuncTable *ft;
|
|
-
|
|
- for (ft = (struct FuncTable *)table_start;
|
|
- ft < (struct FuncTable *)table_end;
|
|
- ft++)
|
|
- {
|
|
- if (ft->fptr <= address &&
|
|
- address < (void *)((char *)ft->fptr + ft->size))
|
|
- {
|
|
- return ft->handlertable;
|
|
- }
|
|
- }
|
|
- return NULL;
|
|
-}
|
|
-
|
|
-
|
|
-/******************************
|
|
- * Given EBP, find return address to caller, and caller's EBP.
|
|
- * Input:
|
|
- * regbp Value of EBP for current function
|
|
- * *pretaddr Return address
|
|
- * Output:
|
|
- * *pretaddr return address to caller
|
|
- * Returns:
|
|
- * caller's EBP
|
|
- */
|
|
-
|
|
-unsigned __eh_find_caller(unsigned regbp, unsigned *pretaddr)
|
|
-{
|
|
- unsigned bp = *(unsigned *)regbp;
|
|
-
|
|
- if (bp) // if not end of call chain
|
|
- {
|
|
- // Perform sanity checks on new EBP.
|
|
- // If it is screwed up, terminate() hopefully before we do more damage.
|
|
- if (bp <= regbp)
|
|
- // stack should grow to smaller values
|
|
- terminate();
|
|
-
|
|
- *pretaddr = *(unsigned *)(regbp + sizeof(int));
|
|
- }
|
|
- return bp;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Throw a D object.
|
|
- */
|
|
-
|
|
-void __stdcall _d_throw(Object *h)
|
|
-{
|
|
- unsigned regebp;
|
|
-
|
|
- //printf("_d_throw(h = %p, &h = %p)\n", h, &h);
|
|
- //printf("\tvptr = %p\n", *(void **)h);
|
|
-
|
|
- regebp = _EBP;
|
|
-
|
|
- while (1) // for each function on the stack
|
|
- {
|
|
- struct DHandlerTable *handler_table;
|
|
- struct FuncTable *pfunc;
|
|
- struct DHandlerInfo *phi;
|
|
- unsigned retaddr;
|
|
- unsigned funcoffset;
|
|
- unsigned spoff;
|
|
- unsigned retoffset;
|
|
- int index;
|
|
- int dim;
|
|
- int ndx;
|
|
- int prev_ndx;
|
|
-
|
|
- regebp = __eh_find_caller(regebp,&retaddr);
|
|
- if (!regebp)
|
|
- // if end of call chain
|
|
- break;
|
|
-
|
|
- handler_table = __eh_finddata((void *)retaddr); // find static data associated with function
|
|
- if (!handler_table) // if no static data
|
|
- {
|
|
- continue;
|
|
- }
|
|
- funcoffset = (unsigned)handler_table->fptr;
|
|
- spoff = handler_table->espoffset;
|
|
- retoffset = handler_table->retoffset;
|
|
-
|
|
-#ifdef DEBUG
|
|
- printf("retaddr = x%x\n",(unsigned)retaddr);
|
|
- printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n",
|
|
- regebp,funcoffset,spoff,retoffset);
|
|
-#endif
|
|
-
|
|
- // Find start index for retaddr in static data
|
|
- dim = handler_table->nhandlers;
|
|
- index = -1;
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- phi = &handler_table->handler_info[i];
|
|
-
|
|
- if ((unsigned)retaddr >= funcoffset + phi->offset)
|
|
- index = i;
|
|
- }
|
|
-
|
|
- // walk through handler table, checking each handler
|
|
- // with an index smaller than the current table_index
|
|
- for (ndx = index; ndx != -1; ndx = prev_ndx)
|
|
- {
|
|
- phi = &handler_table->handler_info[ndx];
|
|
- prev_ndx = phi->prev_index;
|
|
- if (phi->cioffset)
|
|
- {
|
|
- // this is a catch handler (no finally)
|
|
- struct DCatchInfo *pci;
|
|
- int ncatches;
|
|
- int i;
|
|
-
|
|
- pci = (struct DCatchInfo *)((char *)handler_table + phi->cioffset);
|
|
- ncatches = pci->ncatches;
|
|
- for (i = 0; i < ncatches; i++)
|
|
- {
|
|
- struct DCatchBlock *pcb;
|
|
- ClassInfo *ci = **(ClassInfo ***)h;
|
|
-
|
|
- pcb = &pci->catch_block[i];
|
|
-
|
|
- if (_d_isbaseof(ci, pcb->type))
|
|
- { // Matched the catch type, so we've found the handler.
|
|
-
|
|
- // Initialize catch variable
|
|
- *(void **)(regebp + (pcb->bpoffset)) = h;
|
|
-
|
|
- // Jump to catch block. Does not return.
|
|
- {
|
|
- unsigned catch_esp;
|
|
- fp_t catch_addr;
|
|
-
|
|
- catch_addr = (fp_t)(pcb->code);
|
|
- catch_esp = regebp - handler_table->espoffset - sizeof(fp_t);
|
|
- _asm
|
|
- {
|
|
- mov EAX,catch_esp
|
|
- mov ECX,catch_addr
|
|
- mov [EAX],ECX
|
|
- mov EBP,regebp
|
|
- mov ESP,EAX // reset stack
|
|
- ret // jump to catch block
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- else if (phi->finally_code)
|
|
- { // Call finally block
|
|
- // Note that it is unnecessary to adjust the ESP, as the finally block
|
|
- // accesses all items on the stack as relative to EBP.
|
|
-
|
|
- void *blockaddr = phi->finally_code;
|
|
-
|
|
- _asm
|
|
- {
|
|
- push EBX
|
|
- mov EBX,blockaddr
|
|
- push EBP
|
|
- mov EBP,regebp
|
|
- call EBX
|
|
- pop EBP
|
|
- pop EBX
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-
|
|
-#endif
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/deh2.d druntime/src/rt/deh2.d
|
|
--- druntime-old/src/rt/deh2.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/deh2.d 1970-01-01 03:00:00.000000000 +0300
|
|
@@ -1,322 +0,0 @@
|
|
-/**
|
|
- * Implementation of exception handling support routines for Posix.
|
|
- *
|
|
- * Copyright: Copyright Digital Mars 2000 - 2009.
|
|
- * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
|
|
- * Authors: Walter Bright
|
|
- *
|
|
- * Copyright Digital Mars 2000 - 2009.
|
|
- * Distributed under the Boost Software License, Version 1.0.
|
|
- * (See accompanying file LICENSE_1_0.txt or copy at
|
|
- * http://www.boost.org/LICENSE_1_0.txt)
|
|
- */
|
|
-module rt.deh2;
|
|
-
|
|
-//debug=1;
|
|
-
|
|
-extern (C)
|
|
-{
|
|
- extern __gshared
|
|
- {
|
|
- void* _deh_beg;
|
|
- void* _deh_end;
|
|
- }
|
|
-
|
|
- int _d_isbaseof(ClassInfo oc, ClassInfo c);
|
|
-
|
|
- void _d_setunhandled(Object* o);
|
|
-}
|
|
-
|
|
-alias int (*fp_t)(); // function pointer in ambient memory model
|
|
-
|
|
-struct DHandlerInfo
|
|
-{
|
|
- uint offset; // offset from function address to start of guarded section
|
|
- uint endoffset; // offset of end of guarded section
|
|
- int prev_index; // previous table index
|
|
- uint cioffset; // offset to DCatchInfo data from start of table (!=0 if try-catch)
|
|
- void *finally_code; // pointer to finally code to execute
|
|
- // (!=0 if try-finally)
|
|
-}
|
|
-
|
|
-// Address of DHandlerTable, searched for by eh_finddata()
|
|
-
|
|
-struct DHandlerTable
|
|
-{
|
|
- void *fptr; // pointer to start of function
|
|
- uint espoffset; // offset of ESP from EBP
|
|
- uint retoffset; // offset from start of function to return code
|
|
- uint nhandlers; // dimension of handler_info[]
|
|
- DHandlerInfo handler_info[1];
|
|
-}
|
|
-
|
|
-struct DCatchBlock
|
|
-{
|
|
- ClassInfo type; // catch type
|
|
- uint bpoffset; // EBP offset of catch var
|
|
- void *code; // catch handler code
|
|
-}
|
|
-
|
|
-// Create one of these for each try-catch
|
|
-struct DCatchInfo
|
|
-{
|
|
- uint ncatches; // number of catch blocks
|
|
- DCatchBlock catch_block[1]; // data for each catch block
|
|
-}
|
|
-
|
|
-// One of these is generated for each function with try-catch or try-finally
|
|
-
|
|
-struct FuncTable
|
|
-{
|
|
- void *fptr; // pointer to start of function
|
|
- DHandlerTable *handlertable; // eh data for this function
|
|
- uint fsize; // size of function in bytes
|
|
-}
|
|
-
|
|
-void terminate()
|
|
-{
|
|
- asm
|
|
- {
|
|
- hlt ;
|
|
- }
|
|
-}
|
|
-
|
|
-/*******************************************
|
|
- * Given address that is inside a function,
|
|
- * figure out which function it is in.
|
|
- * Return DHandlerTable if there is one, NULL if not.
|
|
- */
|
|
-
|
|
-DHandlerTable *__eh_finddata(void *address)
|
|
-{
|
|
- FuncTable *ft;
|
|
-
|
|
-// debug printf("__eh_finddata(address = x%x)\n", address);
|
|
-// debug printf("_deh_beg = x%x, _deh_end = x%x\n", &_deh_beg, &_deh_end);
|
|
- for (ft = cast(FuncTable *)&_deh_beg;
|
|
- ft < cast(FuncTable *)&_deh_end;
|
|
- ft++)
|
|
- {
|
|
-// debug printf("\tfptr = x%x, fsize = x%03x, handlertable = x%x\n",
|
|
-// ft.fptr, ft.fsize, ft.handlertable);
|
|
-
|
|
- if (ft.fptr <= address &&
|
|
- address < cast(void *)(cast(char *)ft.fptr + ft.fsize))
|
|
- {
|
|
-// debug printf("\tfound handler table\n");
|
|
- return ft.handlertable;
|
|
- }
|
|
- }
|
|
-// debug printf("\tnot found\n");
|
|
- return null;
|
|
-}
|
|
-
|
|
-
|
|
-/******************************
|
|
- * Given EBP, find return address to caller, and caller's EBP.
|
|
- * Input:
|
|
- * regbp Value of EBP for current function
|
|
- * *pretaddr Return address
|
|
- * Output:
|
|
- * *pretaddr return address to caller
|
|
- * Returns:
|
|
- * caller's EBP
|
|
- */
|
|
-
|
|
-uint __eh_find_caller(uint regbp, uint *pretaddr)
|
|
-{
|
|
- uint bp = *cast(uint *)regbp;
|
|
-
|
|
- if (bp) // if not end of call chain
|
|
- {
|
|
- // Perform sanity checks on new EBP.
|
|
- // If it is screwed up, terminate() hopefully before we do more damage.
|
|
- if (bp <= regbp)
|
|
- // stack should grow to smaller values
|
|
- terminate();
|
|
-
|
|
- *pretaddr = *cast(uint *)(regbp + int.sizeof);
|
|
- }
|
|
- return bp;
|
|
-}
|
|
-
|
|
-/***********************************
|
|
- * Throw a D object.
|
|
- */
|
|
-
|
|
-extern (Windows) void _d_throw(Object *h)
|
|
-{
|
|
- uint regebp;
|
|
-
|
|
- debug
|
|
- {
|
|
- printf("_d_throw(h = %p, &h = %p)\n", h, &h);
|
|
- printf("\tvptr = %p\n", *cast(void **)h);
|
|
- }
|
|
-
|
|
- asm
|
|
- {
|
|
- mov regebp,EBP ;
|
|
- }
|
|
-
|
|
- _d_setunhandled(h);
|
|
-
|
|
-//static uint abc;
|
|
-//if (++abc == 2) *(char *)0=0;
|
|
-
|
|
-//int count = 0;
|
|
- while (1) // for each function on the stack
|
|
- {
|
|
- DHandlerTable *handler_table;
|
|
- FuncTable *pfunc;
|
|
- DHandlerInfo *phi;
|
|
- uint retaddr;
|
|
- uint funcoffset;
|
|
- uint spoff;
|
|
- uint retoffset;
|
|
- int index;
|
|
- int dim;
|
|
- int ndx;
|
|
- int prev_ndx;
|
|
-
|
|
- regebp = __eh_find_caller(regebp,&retaddr);
|
|
- if (!regebp)
|
|
- { // if end of call chain
|
|
- debug printf("end of call chain\n");
|
|
- break;
|
|
- }
|
|
-
|
|
- debug printf("found caller, EBP = x%x, retaddr = x%x\n", regebp, retaddr);
|
|
-//if (++count == 12) *(char*)0=0;
|
|
- handler_table = __eh_finddata(cast(void *)retaddr); // find static data associated with function
|
|
- if (!handler_table) // if no static data
|
|
- {
|
|
- debug printf("no handler table\n");
|
|
- continue;
|
|
- }
|
|
- funcoffset = cast(uint)handler_table.fptr;
|
|
- spoff = handler_table.espoffset;
|
|
- retoffset = handler_table.retoffset;
|
|
-
|
|
- debug
|
|
- {
|
|
- printf("retaddr = x%x\n",cast(uint)retaddr);
|
|
- printf("regebp=x%04x, funcoffset=x%04x, spoff=x%x, retoffset=x%x\n",
|
|
- regebp,funcoffset,spoff,retoffset);
|
|
- }
|
|
-
|
|
- // Find start index for retaddr in static data
|
|
- dim = handler_table.nhandlers;
|
|
-
|
|
- debug
|
|
- {
|
|
- printf("handler_info[]:\n");
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- phi = &handler_table.handler_info[i];
|
|
- printf("\t[%d]: offset = x%04x, endoffset = x%04x, prev_index = %d, cioffset = x%04x, finally_code = %x\n",
|
|
- i, phi.offset, phi.endoffset, phi.prev_index, phi.cioffset, phi.finally_code);
|
|
- }
|
|
- }
|
|
-
|
|
- index = -1;
|
|
- for (int i = 0; i < dim; i++)
|
|
- {
|
|
- phi = &handler_table.handler_info[i];
|
|
-
|
|
- debug printf("i = %d, phi.offset = %04x\n", i, funcoffset + phi.offset);
|
|
- if (cast(uint)retaddr > funcoffset + phi.offset &&
|
|
- cast(uint)retaddr <= funcoffset + phi.endoffset)
|
|
- index = i;
|
|
- }
|
|
- debug printf("index = %d\n", index);
|
|
-
|
|
- // walk through handler table, checking each handler
|
|
- // with an index smaller than the current table_index
|
|
- for (ndx = index; ndx != -1; ndx = prev_ndx)
|
|
- {
|
|
- phi = &handler_table.handler_info[ndx];
|
|
- prev_ndx = phi.prev_index;
|
|
- if (phi.cioffset)
|
|
- {
|
|
- // this is a catch handler (no finally)
|
|
- DCatchInfo *pci;
|
|
- int ncatches;
|
|
- int i;
|
|
-
|
|
- pci = cast(DCatchInfo *)(cast(char *)handler_table + phi.cioffset);
|
|
- ncatches = pci.ncatches;
|
|
- for (i = 0; i < ncatches; i++)
|
|
- {
|
|
- DCatchBlock *pcb;
|
|
- ClassInfo ci = **cast(ClassInfo **)h;
|
|
-
|
|
- pcb = &pci.catch_block[i];
|
|
-
|
|
- if (_d_isbaseof(ci, pcb.type))
|
|
- { // Matched the catch type, so we've found the handler.
|
|
-
|
|
- _d_setunhandled(null);
|
|
-
|
|
- // Initialize catch variable
|
|
- *cast(void **)(regebp + (pcb.bpoffset)) = h;
|
|
-
|
|
- // Jump to catch block. Does not return.
|
|
- {
|
|
- uint catch_esp;
|
|
- fp_t catch_addr;
|
|
-
|
|
- catch_addr = cast(fp_t)(pcb.code);
|
|
- catch_esp = regebp - handler_table.espoffset - fp_t.sizeof;
|
|
- asm
|
|
- {
|
|
- mov EAX,catch_esp ;
|
|
- mov ECX,catch_addr ;
|
|
- mov [EAX],ECX ;
|
|
- mov EBP,regebp ;
|
|
- mov ESP,EAX ; // reset stack
|
|
- ret ; // jump to catch block
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- else if (phi.finally_code)
|
|
- { // Call finally block
|
|
- // Note that it is unnecessary to adjust the ESP, as the finally block
|
|
- // accesses all items on the stack as relative to EBP.
|
|
-
|
|
- void *blockaddr = phi.finally_code;
|
|
-
|
|
- version (OSX)
|
|
- {
|
|
- asm
|
|
- {
|
|
- sub ESP,4 ; // align stack to 16
|
|
- push EBX ;
|
|
- mov EBX,blockaddr ;
|
|
- push EBP ;
|
|
- mov EBP,regebp ;
|
|
- call EBX ;
|
|
- pop EBP ;
|
|
- pop EBX ;
|
|
- add ESP,4 ;
|
|
- }
|
|
- }
|
|
- else
|
|
- {
|
|
- asm
|
|
- {
|
|
- push EBX ;
|
|
- mov EBX,blockaddr ;
|
|
- push EBP ;
|
|
- mov EBP,regebp ;
|
|
- call EBX ;
|
|
- pop EBP ;
|
|
- pop EBX ;
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/eh.d druntime/src/rt/eh.d
|
|
--- druntime-old/src/rt/eh.d 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/src/rt/eh.d 2010-10-03 18:29:58.099624002 +0400
|
|
@@ -0,0 +1,428 @@
|
|
+/**
|
|
+ * This module contains functions and structures required for
|
|
+ * exception handling.
|
|
+ */
|
|
+module eh;
|
|
+
|
|
+private import core.stdc.stdio;
|
|
+private import core.stdc.stdlib;
|
|
+private import rt.util.console;
|
|
+private import ldc.cstdarg;
|
|
+
|
|
+// debug = EH_personality;
|
|
+// debug = EH_personality_verbose;
|
|
+
|
|
+// current EH implementation works on x86
|
|
+// if it has a working unwind runtime
|
|
+version(X86) {
|
|
+ version(linux) version=X86_UNWIND;
|
|
+ version(darwin) version=X86_UNWIND;
|
|
+ version(solaris) version=X86_UNWIND;
|
|
+}
|
|
+version(X86_64) {
|
|
+ version(linux) version=X86_UNWIND;
|
|
+ version(darwin) version=X86_UNWIND;
|
|
+ version(solaris) version=X86_UNWIND;
|
|
+}
|
|
+
|
|
+//version = HP_LIBUNWIND;
|
|
+
|
|
+// D runtime functions
|
|
+extern(C) {
|
|
+ int _d_isbaseof(ClassInfo oc, ClassInfo c);
|
|
+}
|
|
+
|
|
+// libunwind headers
|
|
+extern(C)
|
|
+{
|
|
+ enum _Unwind_Reason_Code : int
|
|
+ {
|
|
+ NO_REASON = 0,
|
|
+ FOREIGN_EXCEPTION_CAUGHT = 1,
|
|
+ FATAL_PHASE2_ERROR = 2,
|
|
+ FATAL_PHASE1_ERROR = 3,
|
|
+ NORMAL_STOP = 4,
|
|
+ END_OF_STACK = 5,
|
|
+ HANDLER_FOUND = 6,
|
|
+ INSTALL_CONTEXT = 7,
|
|
+ CONTINUE_UNWIND = 8
|
|
+ }
|
|
+
|
|
+ enum _Unwind_Action : int
|
|
+ {
|
|
+ SEARCH_PHASE = 1,
|
|
+ CLEANUP_PHASE = 2,
|
|
+ HANDLER_FRAME = 4,
|
|
+ FORCE_UNWIND = 8
|
|
+ }
|
|
+
|
|
+ alias void* _Unwind_Context_Ptr;
|
|
+
|
|
+ alias void function(_Unwind_Reason_Code, _Unwind_Exception*) _Unwind_Exception_Cleanup_Fn;
|
|
+
|
|
+ struct _Unwind_Exception
|
|
+ {
|
|
+ ulong exception_class;
|
|
+ _Unwind_Exception_Cleanup_Fn exception_cleanup;
|
|
+ ptrdiff_t private_1;
|
|
+ ptrdiff_t private_2;
|
|
+ }
|
|
+
|
|
+// interface to HP's libunwind from http://www.nongnu.org/libunwind/
|
|
+version(HP_LIBUNWIND)
|
|
+{
|
|
+ void __libunwind_Unwind_Resume(_Unwind_Exception *);
|
|
+ _Unwind_Reason_Code __libunwind_Unwind_RaiseException(_Unwind_Exception *);
|
|
+ ptrdiff_t __libunwind_Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr
|
|
+ context);
|
|
+ ptrdiff_t __libunwind_Unwind_GetIP(_Unwind_Context_Ptr context);
|
|
+ ptrdiff_t __libunwind_Unwind_SetIP(_Unwind_Context_Ptr context,
|
|
+ ptrdiff_t new_value);
|
|
+ ptrdiff_t __libunwind_Unwind_SetGR(_Unwind_Context_Ptr context, int index,
|
|
+ ptrdiff_t new_value);
|
|
+ ptrdiff_t __libunwind_Unwind_GetRegionStart(_Unwind_Context_Ptr context);
|
|
+
|
|
+ alias __libunwind_Unwind_Resume _Unwind_Resume;
|
|
+ alias __libunwind_Unwind_RaiseException _Unwind_RaiseException;
|
|
+ alias __libunwind_Unwind_GetLanguageSpecificData
|
|
+ _Unwind_GetLanguageSpecificData;
|
|
+ alias __libunwind_Unwind_GetIP _Unwind_GetIP;
|
|
+ alias __libunwind_Unwind_SetIP _Unwind_SetIP;
|
|
+ alias __libunwind_Unwind_SetGR _Unwind_SetGR;
|
|
+ alias __libunwind_Unwind_GetRegionStart _Unwind_GetRegionStart;
|
|
+}
|
|
+else version(X86_UNWIND)
|
|
+{
|
|
+ void _Unwind_Resume(_Unwind_Exception*);
|
|
+ _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*);
|
|
+ ptrdiff_t _Unwind_GetLanguageSpecificData(_Unwind_Context_Ptr context);
|
|
+ ptrdiff_t _Unwind_GetIP(_Unwind_Context_Ptr context);
|
|
+ ptrdiff_t _Unwind_SetIP(_Unwind_Context_Ptr context, ptrdiff_t new_value);
|
|
+ ptrdiff_t _Unwind_SetGR(_Unwind_Context_Ptr context, int index,
|
|
+ ptrdiff_t new_value);
|
|
+ ptrdiff_t _Unwind_GetRegionStart(_Unwind_Context_Ptr context);
|
|
+}
|
|
+else
|
|
+{
|
|
+ // runtime calls these directly
|
|
+ void _Unwind_Resume(_Unwind_Exception*)
|
|
+ {
|
|
+ console("_Unwind_Resume is not implemented on this platform.\n");
|
|
+ }
|
|
+ _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception*)
|
|
+ {
|
|
+ console("_Unwind_RaiseException is not implemented on this platform.\n");
|
|
+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
|
|
+ }
|
|
+}
|
|
+
|
|
+}
|
|
+
|
|
+// error and exit
|
|
+extern(C) private void fatalerror(in char* format, ...) // printf-style reporter for unrecoverable EH errors; never returns
|
|
+{
|
|
+ va_list args;
|
|
+ va_start(args, format); // capture the variadic arguments that follow 'format'
|
|
+ printf("Fatal error in EH code: "); // fixed prefix: callers should pass only the specific message
|
|
+ vprintf(format, args); // caller-supplied message, formatted with the captured arguments
|
|
+ printf("\n");
|
|
+ abort(); // terminate the process; note that va_end(args) is not called before this point
|
|
+}
|
|
+
|
|
+
|
|
+// helpers for reading certain DWARF data
|
|
+private ubyte* get_uleb128(ubyte* addr, ref size_t res) // decodes one unsigned LEB128 value at addr into res; returns the address just past it
|
|
+{
|
|
+ res = 0;
|
|
+ size_t bitsize = 0; // number of payload bits accumulated so far (always a multiple of 7)
|
|
+
|
|
+ // read as long as high bit is set (continuation bytes carry 7 payload bits each)
|
|
+ while(*addr & 0x80) {
|
|
+ res |= cast(size_t)(*addr & 0x7f) << bitsize; // widen before shifting: an int shift would break for bitsize >= 32
|
|
+ bitsize += 7;
|
|
+ addr += 1;
|
|
+ if(bitsize >= size_t.sizeof*8)
|
|
+ fatalerror("tried to read uleb128 that exceeded size of size_t");
|
|
+ }
|
|
+ // read last byte; reject it if its payload would not fit in the bits that remain in size_t
|
|
+ if(bitsize != 0 && *addr >= cast(size_t)1 << (size_t.sizeof*8 - bitsize))
|
|
+ fatalerror("tried to read uleb128 that exceeded size of size_t"); // fatalerror already prints the "Fatal error in EH code: " prefix
|
|
+ res |= cast(size_t)(*addr) << bitsize;
|
|
+
|
|
+ return addr + 1;
|
|
+}
|
|
+
|
|
+private ubyte* get_sleb128(ubyte* addr, ref ptrdiff_t res) // decodes one signed LEB128 value at addr into res; returns the address just past it
|
|
+{
|
|
+ res = 0;
|
|
+ size_t bitsize = 0; // number of payload bits accumulated so far (always a multiple of 7)
|
|
+
|
|
+ // read as long as high bit is set (continuation bytes carry 7 payload bits each)
|
|
+ while(*addr & 0x80) {
|
|
+ res |= cast(ptrdiff_t)(*addr & 0x7f) << bitsize; // widen before shifting: an int shift would break for bitsize >= 32
|
|
+ bitsize += 7;
|
|
+ addr += 1;
|
|
+ if(bitsize >= size_t.sizeof*8)
|
|
+ fatalerror("tried to read sleb128 that exceeded size of size_t");
|
|
+ }
|
|
+ // read last byte; reject it if its payload would not fit in the bits that remain
|
|
+ if(bitsize != 0 && *addr >= cast(size_t)1 << (size_t.sizeof*8 - bitsize))
|
|
+ fatalerror("tried to read sleb128 that exceeded size of size_t");
|
|
+ res |= cast(ptrdiff_t)(*addr) << bitsize;
|
|
+
|
|
+ // take care of sign: bit 0x40 of the final byte is the sign bit; set every bit above the payload
|
|
+ if(bitsize + 7 < size_t.sizeof*8 && ((*addr) & 0x40)) // guard keeps the shift count below the word width
|
|
+ res |= cast(ptrdiff_t)(-1) ^ ((cast(ptrdiff_t)1 << (bitsize+7)) - 1);
|
|
+
|
|
+ return addr + 1;
|
|
+}
|
|
+
|
|
+
|
|
+// exception struct used by the runtime.
|
|
+// _d_throw allocates a new instance and passes the address of its
|
|
+// _Unwind_Exception member to the unwind call. The personality
|
|
+// routine is then able to get the whole struct by looking at the data
|
|
+// surrounding the unwind info.
|
|
+struct _d_exception
|
|
+{
|
|
+ Object exception_object;
|
|
+ _Unwind_Exception unwind_info;
|
|
+}
|
|
+
|
|
+// the 8-byte string identifying the type of exception
|
|
+// the first 4 are for vendor, the second 4 for language
|
|
+//TODO: This may be the wrong way around
|
|
+char[8] _d_exception_class = "LLDCD1\0\0";
|
|
+
|
|
+
|
|
+//
|
|
+// x86 unwind specific implementation of personality function
|
|
+// and helpers
|
|
+//
|
|
+version(X86_UNWIND)
|
|
+{
|
|
+
|
|
+// the personality routine gets called by the unwind handler and is responsible for
|
|
+// reading the EH tables and deciding what to do
|
|
+extern(C) _Unwind_Reason_Code _d_eh_personality(int ver, _Unwind_Action actions, ulong exception_class, _Unwind_Exception* exception_info, _Unwind_Context_Ptr context)
|
|
+{
|
|
+ debug(EH_personality_verbose) printf("entering personality function. context: %p\n", context);
|
|
+ // check ver: the C++ Itanium ABI only allows ver == 1
|
|
+ if(ver != 1)
|
|
+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
|
|
+
|
|
+ // check exceptionClass
|
|
+ //TODO: Treat foreign exceptions with more respect
|
|
+ if((cast(char*)&exception_class)[0..8] != _d_exception_class)
|
|
+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
|
|
+
|
|
+ // find call site table, action table and classinfo table
|
|
+ // Note: callsite and action tables do not contain static-length
|
|
+ // data and will be parsed as needed
|
|
+ // Note: classinfo_table points past the end of the table
|
|
+ ubyte* callsite_table;
|
|
+ ubyte* action_table;
|
|
+ ClassInfo* classinfo_table;
|
|
+ _d_getLanguageSpecificTables(context, callsite_table, action_table, classinfo_table);
|
|
+ if (callsite_table is null)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+
|
|
+ /*
|
|
+ find landing pad and action table index belonging to ip by walking
|
|
+ the callsite_table
|
|
+ */
|
|
+ ubyte* callsite_walker = callsite_table;
|
|
+
|
|
+ // get the instruction pointer
|
|
+ // will be used to find the right entry in the callsite_table
|
|
+ // -1 because it will point past the last instruction
|
|
+ ptrdiff_t ip = _Unwind_GetIP(context) - 1;
|
|
+
|
|
+ // address block_start is relative to
|
|
+ ptrdiff_t region_start = _Unwind_GetRegionStart(context);
|
|
+
|
|
+ // table entries
|
|
+ uint block_start_offset, block_size;
|
|
+ ptrdiff_t landing_pad;
|
|
+ size_t action_offset;
|
|
+
|
|
+ while(true) {
|
|
+ // if we've gone through the list and found nothing...
|
|
+ if(callsite_walker >= action_table)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+
|
|
+ block_start_offset = *cast(uint*)callsite_walker;
|
|
+ block_size = *(cast(uint*)callsite_walker + 1);
|
|
+ landing_pad = *(cast(uint*)callsite_walker + 2);
|
|
+ if(landing_pad)
|
|
+ landing_pad += region_start;
|
|
+ callsite_walker = get_uleb128(callsite_walker + 3*uint.sizeof, action_offset);
|
|
+
|
|
+ debug(EH_personality_verbose) printf("ip=%llx %d %d %llx\n", ip, block_start_offset, block_size, landing_pad);
|
|
+
|
|
+ // since the list is sorted, as soon as we're past the ip
|
|
+ // there's no handler to be found
|
|
+ if(ip < region_start + block_start_offset)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+
|
|
+ // if we've found our block, exit
|
|
+ if(ip < region_start + block_start_offset + block_size)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ debug(EH_personality) printf("Found correct landing pad and actionOffset %d\n", action_offset);
|
|
+
|
|
+ // now we need the exception's classinfo to find a handler
|
|
+ // the exception_info is actually a member of a larger _d_exception struct
|
|
+ // the runtime allocated. get that now
|
|
+ _d_exception* exception_struct = cast(_d_exception*)(cast(ubyte*)exception_info - _d_exception.unwind_info.offsetof);
|
|
+
|
|
+ // if there's no action offset and no landing pad, continue unwinding
|
|
+ if(!action_offset && !landing_pad)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+
|
|
+ // if there's no action offset but a landing pad, this is a cleanup handler
|
|
+ else if(!action_offset && landing_pad)
|
|
+ return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context);
|
|
+
|
|
+ /*
|
|
+ walk action table chain, comparing classinfos using _d_isbaseof
|
|
+ */
|
|
+ ubyte* action_walker = action_table + action_offset - 1;
|
|
+
|
|
+ ptrdiff_t ti_offset, next_action_offset;
|
|
+ while(true) {
|
|
+ action_walker = get_sleb128(action_walker, ti_offset);
|
|
+ // it is intentional that we not modify action_walker here
|
|
+ // next_action_offset is from current action_walker position
|
|
+ get_sleb128(action_walker, next_action_offset);
|
|
+
|
|
+ // negative are 'filters' which we don't use
|
|
+ if(!(ti_offset >= 0))
|
|
+ fatalerror("Filter actions are unsupported");
|
|
+
|
|
+ // zero means cleanup, which we require to be the last action
|
|
+ if(ti_offset == 0) {
|
|
+ if(!(next_action_offset == 0))
|
|
+ fatalerror("Cleanup action must be last in chain");
|
|
+ return _d_eh_install_finally_context(actions, landing_pad, exception_struct, context);
|
|
+ }
|
|
+
|
|
+ // get classinfo for action and check if the one in the
|
|
+ // exception structure is a base
|
|
+ ClassInfo catch_ci = *(classinfo_table - ti_offset);
|
|
+ debug(EH_personality) printf("Comparing catch %s to exception %s\n", catch_ci.name.ptr, exception_struct.exception_object.classinfo.name.ptr);
|
|
+ if(_d_isbaseof(exception_struct.exception_object.classinfo, catch_ci))
|
|
+ return _d_eh_install_catch_context(actions, ti_offset, landing_pad, exception_struct, context);
|
|
+
|
|
+ // we've walked through all actions and found nothing...
|
|
+ if(next_action_offset == 0)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+ else
|
|
+ action_walker += next_action_offset;
|
|
+ }
|
|
+
|
|
+ fatalerror("reached unreachable");
|
|
+ return _Unwind_Reason_Code.FATAL_PHASE1_ERROR;
|
|
+}
|
|
+
|
|
+// These are the register numbers for SetGR that
|
|
+// llvm's eh.exception and eh.selector intrinsics
|
|
+// will pick up.
|
|
+// Hints for these can be found by looking at the
|
|
+// EH_RETURN_DATA_REGNO macro in GCC, careful testing
|
|
+// is required though.
|
|
+version (X86_64)
|
|
+{
|
|
+ private int eh_exception_regno = 0; // register index given to _Unwind_SetGR for the exception pointer
|
|
+ private int eh_selector_regno = 1; // register index given to _Unwind_SetGR for the selector (switch) value
|
|
+} else { // every target other than X86_64 shares the values below; NOTE(review): confirm per architecture
|
|
+ private int eh_exception_regno = 0; // register index given to _Unwind_SetGR for the exception pointer
|
|
+ private int eh_selector_regno = 2; // register index given to _Unwind_SetGR for the selector (switch) value
|
|
+}
|
|
+
|
|
+private _Unwind_Reason_Code _d_eh_install_catch_context(_Unwind_Action actions, ptrdiff_t switchval, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context)
|
|
+{ // Called once a matching catch clause is known: reports it in the search phase, installs it in the cleanup phase.
|
|
+ debug(EH_personality) printf("Found catch clause!\n");
|
|
+
|
|
+ if(actions & _Unwind_Action.SEARCH_PHASE) // search phase: only announce that this frame has a handler
|
|
+ return _Unwind_Reason_Code.HANDLER_FOUND;
|
|
+
|
|
+ else if(actions & _Unwind_Action.CLEANUP_PHASE) // cleanup phase: redirect execution to the landing pad
|
|
+ {
|
|
+ debug(EH_personality) printf("Setting switch value to: %d!\n", cast(int)switchval); // %d consumes an int, switchval is ptrdiff_t
|
|
+ _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)cast(void*)(exception_struct.exception_object)); // the landing pad receives the thrown Object itself
|
|
+ _Unwind_SetGR(context, eh_selector_regno, cast(ptrdiff_t)switchval); // selector value the landing pad switches on
|
|
+ _Unwind_SetIP(context, landing_pad);
|
|
+ return _Unwind_Reason_Code.INSTALL_CONTEXT;
|
|
+ }
|
|
+
|
|
+ fatalerror("reached unreachable"); // neither phase flag was set in actions
|
|
+ return _Unwind_Reason_Code.FATAL_PHASE2_ERROR;
|
|
+}
|
|
+
|
|
+private _Unwind_Reason_Code _d_eh_install_finally_context(_Unwind_Action actions, ptrdiff_t landing_pad, _d_exception* exception_struct, _Unwind_Context_Ptr context)
|
|
+{ // Installs a cleanup (finally) landing pad; does nothing but continue unwinding during the search phase.
|
|
+ // a cleanup is not a handler: if we're merely in search phase, continue
|
|
+ if(actions & _Unwind_Action.SEARCH_PHASE)
|
|
+ return _Unwind_Reason_Code.CONTINUE_UNWIND;
|
|
+
|
|
+ debug(EH_personality) printf("Calling cleanup routine...\n");
|
|
+
|
|
+ _Unwind_SetGR(context, eh_exception_regno, cast(ptrdiff_t)exception_struct); // passes the whole _d_exception, not the Object; presumably so the cleanup can hand it to _d_eh_resume_unwind - confirm
|
|
+ _Unwind_SetGR(context, eh_selector_regno, 0); // selector 0: no catch clause selected
|
|
+ _Unwind_SetIP(context, landing_pad);
|
|
+ return _Unwind_Reason_Code.INSTALL_CONTEXT;
|
|
+}
|
|
+
|
|
+private void _d_getLanguageSpecificTables(_Unwind_Context_Ptr context, ref ubyte* callsite, ref ubyte* action, ref ClassInfo* ci)
|
|
+{ // Locates the call-site table, action table and classinfo table of the current frame; all three are set to null when the frame has no language-specific data.
|
|
+ ubyte* data = cast(ubyte*)_Unwind_GetLanguageSpecificData(context);
|
|
+ if (data is null)
|
|
+ {
|
|
+ //printf("language specific data was null\n");
|
|
+ callsite = null;
|
|
+ action = null;
|
|
+ ci = null;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ //TODO: Do proper DWARF reading here; for now the three header bytes checked below must match exactly
|
|
+ if(*data++ != 0xff) // first header byte; NOTE(review): presumably the landing-pad-base encoding (0xff = omitted) - confirm
|
|
+ fatalerror("DWARF header has unexpected format 1");
|
|
+
|
|
+ if(*data++ != 0x00) // second header byte; NOTE(review): presumably the type-table encoding (0x00 = absolute pointer) - confirm
|
|
+ fatalerror("DWARF header has unexpected format 2");
|
|
+ size_t cioffset;
|
|
+ data = get_uleb128(data, cioffset);
|
|
+ ci = cast(ClassInfo*)(data + cioffset); // offset is applied to the pointer returned by get_uleb128 (assumed to point just past the encoded value)
|
|
+
|
|
+ if(*data++ != 0x03) // third header byte; NOTE(review): presumably the call-site encoding (0x03 = 4-byte unsigned) - confirm
|
|
+ fatalerror("DWARF header has unexpected format 3");
|
|
+ size_t callsitelength;
|
|
+ data = get_uleb128(data, callsitelength);
|
|
+ action = data + callsitelength; // action table begins callsitelength bytes after the call-site table start
|
|
+
|
|
+ callsite = data; // call-site table begins right after its length field
|
|
+}
|
|
+
|
|
+} // end of x86 Linux specific implementation
|
|
+
|
|
+
|
|
+extern(C) void _d_throw_exception(Object e)
|
|
+{ // Runtime entry point for throwing e; every path that returns from the unwinder (or a null e) ends in abort().
|
|
+ if (e !is null) // a null exception object goes straight to abort()
|
|
+ {
|
|
+ _d_exception* exc_struct = new _d_exception;
|
|
+ exc_struct.unwind_info.exception_class = *cast(ulong*)_d_exception_class.ptr; // first 8 bytes of _d_exception_class reinterpreted as a ulong tag
|
|
+ exc_struct.exception_object = e;
|
|
+ _Unwind_Reason_Code ret = _Unwind_RaiseException(&exc_struct.unwind_info);
|
|
+ console("_Unwind_RaiseException failed with reason code: ")(ret)("\n"); // only reached if the unwinder returned instead of transferring control
|
|
+ }
|
|
+ abort();
|
|
+}
|
|
+
|
|
+extern(C) void _d_eh_resume_unwind(_d_exception* exception_struct)
|
|
+{ // Continues unwinding for exception_struct by forwarding its embedded unwind_info to _Unwind_Resume.
|
|
+ _Unwind_Resume(&exception_struct.unwind_info);
|
|
+}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/lifetime.d druntime/src/rt/lifetime.d
|
|
--- druntime-old/src/rt/lifetime.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/lifetime.d 2010-10-08 14:55:56.581547002 +0400
|
|
@@ -81,6 +81,28 @@
|
|
MAXSMALLSIZE = 256-SMALLPAD,
|
|
MAXMEDSIZE = (PAGESIZE / 2) - MEDPAD
|
|
}
|
|
+
|
|
+ version( LDC )
|
|
+ {
|
|
+ size_t length_adjust(size_t sizeelem, size_t newlength)
|
|
+ { // Returns sizeelem * newlength, calling onOutOfMemoryError() when the product does not fit in size_t.
|
|
+ size_t newsize = void;
|
|
+ static if (size_t.sizeof < ulong.sizeof)
|
|
+ { // size_t narrower than ulong: multiply in ulong and range-check the result
|
|
+ ulong s = cast(ulong)sizeelem * cast(ulong)newlength;
|
|
+ if (s > size_t.max)
|
|
+ onOutOfMemoryError();
|
|
+ newsize = cast(size_t)s;
|
|
+ }
|
|
+ else
|
|
+ { // size_t at least as wide as ulong: multiply, then detect wrap-around by dividing back
|
|
+ newsize = sizeelem * newlength;
|
|
+ if (newlength != 0 && newsize / newlength != sizeelem) // a zero length cannot overflow and must not be used as a divisor
|
|
+ onOutOfMemoryError();
|
|
+ }
|
|
+ return newsize;
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
|
|
@@ -92,6 +114,13 @@
|
|
return gc_malloc(sz);
|
|
}
|
|
|
|
+/**
|
|
+ * Allocates GC memory (ti.tsize() bytes) for a single POD value of the type described by ti.
|
|
+ */
|
|
+extern (C) void* _d_allocmemoryT(TypeInfo ti)
|
|
+{
|
|
+ return gc_malloc(ti.tsize(), !(ti.flags() & 1) ? BlkAttr.NO_SCAN : 0); // NO_SCAN when bit 0 of ti.flags() is clear; presumably that bit means "may contain pointers" - confirm
|
|
+}
|
|
|
|
/**
|
|
*
|
|
@@ -670,7 +699,7 @@
|
|
* ti is the type of the resulting array, or pointer to element.
|
|
* (For when the array is initialized to 0)
|
|
*/
|
|
-extern (C) ulong _d_newarrayT(TypeInfo ti, size_t length)
|
|
+extern (C) void[] _d_newarrayT(TypeInfo ti, size_t length)
|
|
{
|
|
ulong result;
|
|
auto size = ti.next.tsize(); // array element size
|
|
@@ -702,7 +731,7 @@
|
|
__setArrayAllocLength(info, size, isshared);
|
|
result = cast(ulong)length + (cast(ulong)cast(size_t)arrstart << 32);
|
|
}
|
|
- return result;
|
|
+ return *cast(void[]*)&result;
|
|
|
|
Loverflow:
|
|
onOutOfMemoryError();
|
|
@@ -711,7 +740,7 @@
|
|
/**
|
|
* For when the array has a non-zero initializer.
|
|
*/
|
|
-extern (C) ulong _d_newarrayiT(TypeInfo ti, size_t length)
|
|
+extern (C) void[] _d_newarrayiT(TypeInfo ti, size_t length)
|
|
{
|
|
ulong result;
|
|
auto size = ti.next.tsize(); // array element size
|
|
@@ -764,7 +793,7 @@
|
|
__setArrayAllocLength(info, size, isshared);
|
|
result = cast(ulong)length + (cast(ulong)cast(uint)arrstart << 32);
|
|
}
|
|
- return result;
|
|
+ return *cast(void[]*)&result;
|
|
|
|
Loverflow:
|
|
onOutOfMemoryError();
|
|
@@ -773,7 +802,7 @@
|
|
/**
|
|
*
|
|
*/
|
|
-extern (C) ulong _d_newarraymT(TypeInfo ti, int ndims, ...)
|
|
+extern (C) void[] _d_newarraymT(TypeInfo ti, int ndims, ...)
|
|
{
|
|
ulong result;
|
|
|
|
@@ -823,14 +852,14 @@
|
|
}
|
|
va_end(q);
|
|
}
|
|
- return result;
|
|
+ return *cast(void[]*)&result;
|
|
}
|
|
|
|
|
|
/**
|
|
*
|
|
*/
|
|
-extern (C) ulong _d_newarraymiT(TypeInfo ti, int ndims, ...)
|
|
+extern (C) void[] _d_newarraymiT(TypeInfo ti, int ndims, ...)
|
|
{
|
|
ulong result;
|
|
|
|
@@ -881,10 +910,9 @@
|
|
}
|
|
va_end(q);
|
|
}
|
|
- return result;
|
|
+ return *cast(void[]*)&result;
|
|
}
|
|
|
|
-
|
|
/**
|
|
*
|
|
*/
|
|
@@ -1046,7 +1074,7 @@
|
|
/**
|
|
* Resize dynamic arrays with 0 initializers.
|
|
*/
|
|
-extern (C) byte[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p)
|
|
+extern (C) void[] _d_arraysetlengthT(TypeInfo ti, size_t newlength, Array *p)
|
|
in
|
|
{
|
|
assert(ti);
|
|
@@ -1206,7 +1234,7 @@
|
|
* initsize size of initializer
|
|
* ... initializer
|
|
*/
|
|
-extern (C) byte[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p)
|
|
+extern (C) void[] _d_arraysetlengthiT(TypeInfo ti, size_t newlength, Array *p)
|
|
in
|
|
{
|
|
assert(!p.length || p.data);
|
|
@@ -1376,12 +1404,11 @@
|
|
onOutOfMemoryError();
|
|
}
|
|
|
|
-
|
|
/**
|
|
* Append y[] to array pointed to by px
|
|
* size is size of each array element.
|
|
*/
|
|
-extern (C) long _d_arrayappendT(TypeInfo ti, Array *px, byte[] y)
|
|
+extern (C) void[] _d_arrayappendT(TypeInfo ti, Array *px, byte[] y)
|
|
{
|
|
// only optimize array append where ti is not a shared type
|
|
auto sizeelem = ti.next.tsize(); // array element size
|
|
@@ -1468,10 +1495,9 @@
|
|
L1:
|
|
px.length = newlength;
|
|
memcpy(px.data + length * sizeelem, y.ptr, y.length * sizeelem);
|
|
- return *cast(long*)px;
|
|
+ return *cast(void[]*)px;
|
|
}
|
|
|
|
-
|
|
/**
|
|
*
|
|
*/
|
|
@@ -1552,21 +1578,36 @@
|
|
return newcap;
|
|
}
|
|
|
|
+version (LDC)
|
|
+{
|
|
+
|
|
+/**
|
|
+ * Appends a single element to an array.
|
|
+ */
|
|
+extern (C) void[] _d_arrayappendcT(TypeInfo ti, byte[] *x, byte *argp)
|
|
+{ // LDC variant: the element arrives through the pointer argp rather than through C varargs
|
|
+ return _d_arrayappendT(ti, cast(Array*)x, argp[0..1]); // slice of length 1, i.e. exactly one element for _d_arrayappendT
|
|
+}
|
|
+
|
|
+}
|
|
+else
|
|
+{
|
|
|
|
/**
|
|
*
|
|
*/
|
|
-extern (C) long _d_arrayappendcT(TypeInfo ti, Array *x, ...)
|
|
+extern (C) void[] _d_arrayappendcT(TypeInfo ti, Array *x, ...)
|
|
{
|
|
byte *argp = cast(byte*)(&ti + 2);
|
|
return _d_arrayappendT(ti, x, argp[0..1]);
|
|
}
|
|
|
|
+}
|
|
|
|
/**
|
|
* Append dchar to char[]
|
|
*/
|
|
-extern (C) long _d_arrayappendcd(ref char[] x, dchar c)
|
|
+extern (C) void[] _d_arrayappendcd(ref char[] x, dchar c)
|
|
{
|
|
// c could encode into from 1 to 4 characters
|
|
char[4] buf = void;
|
|
@@ -1612,7 +1653,7 @@
|
|
/**
|
|
* Append dchar to wchar[]
|
|
*/
|
|
-extern (C) long _d_arrayappendwd(ref wchar[] x, dchar c)
|
|
+extern (C) void[] _d_arrayappendwd(ref wchar[] x, dchar c)
|
|
{
|
|
// c could encode into from 1 to 2 w characters
|
|
wchar[2] buf = void;
|
|
@@ -1641,7 +1682,6 @@
|
|
return _d_arrayappendT(typeid(shared wchar[]), cast(Array *)&x, appendthis);
|
|
}
|
|
|
|
-
|
|
/**
|
|
*
|
|
*/
|
|
@@ -1794,11 +1834,10 @@
|
|
void* ptr;
|
|
}
|
|
|
|
-
|
|
/**
|
|
*
|
|
*/
|
|
-extern (C) long _adDupT(TypeInfo ti, Array2 a)
|
|
+extern (C) void[] _adDupT(TypeInfo ti, void[] a)
|
|
out (result)
|
|
{
|
|
auto sizeelem = ti.next.tsize(); // array element size
|
|
@@ -1819,7 +1858,7 @@
|
|
r.length = a.length;
|
|
memcpy(r.ptr, a.ptr, size);
|
|
}
|
|
- return *cast(long*)(&r);
|
|
+ return *cast(void[]*)(&r);
|
|
}
|
|
|
|
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort.d druntime/src/rt/qsort.d
|
|
--- druntime-old/src/rt/qsort.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/qsort.d 2010-10-07 13:59:06.815253002 +0400
|
|
@@ -44,7 +44,7 @@
|
|
structures. The default value is optimized for a high cost for compares. */
|
|
|
|
|
|
-extern (C) long _adSort(Array a, TypeInfo ti)
|
|
+extern (C) void[] _adSort(void[] a, TypeInfo ti)
|
|
{
|
|
byte* base;
|
|
byte*[40] stack; // stack
|
|
@@ -124,7 +124,7 @@
|
|
limit = sp[1];
|
|
}
|
|
else // else stack empty, all done
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
assert(0);
|
|
}
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/qsort2.d druntime/src/rt/qsort2.d
|
|
--- druntime-old/src/rt/qsort2.d 2010-08-05 05:39:06.000000000 +0400
|
|
+++ druntime/src/rt/qsort2.d 2010-10-07 14:01:41.359253001 +0400
|
|
@@ -31,14 +31,14 @@
|
|
return tiglobal.compare(p1, p2);
|
|
}
|
|
|
|
-extern (C) long _adSort(Array a, TypeInfo ti)
|
|
+extern (C) void[] _adSort(void[] a, TypeInfo ti)
|
|
{
|
|
synchronized
|
|
{
|
|
tiglobal = ti;
|
|
qsort(a.ptr, a.length, cast(size_t)ti.tsize(), &cmp);
|
|
}
|
|
- return *cast(long*)(&a);
|
|
+ return a;
|
|
}
|
|
|
|
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/rt/trace.d druntime/src/rt/trace.d
|
|
--- druntime-old/src/rt/trace.d 2010-08-07 09:46:06.000000000 +0400
|
|
+++ druntime/src/rt/trace.d 2010-10-01 21:01:58.444892002 +0400
|
|
@@ -855,7 +855,7 @@
|
|
version (OSX)
|
|
{ // 16 byte align stack
|
|
asm
|
|
- { naked ;
|
|
+ {
|
|
pushad ;
|
|
sub ESP,12 ;
|
|
}
|
|
@@ -870,7 +870,7 @@
|
|
else
|
|
{
|
|
asm
|
|
- { naked ;
|
|
+ {
|
|
pushad ;
|
|
}
|
|
trace_epi();
|
|
diff -U 3 -H -d -r -N -x '*.mak' -x tk -x backend -x debug -x release -x '*_pch.h' -x Makefile -x '*.rej' -x '*~' -x '*.log' -x .svn -x '*pro.user' -x .directory -x cmake_install -x CMakeFiles -x .preprocessed.tmp -x 'Makefile.*' -x '*.orig' -- druntime-old/src/std/intrinsic.d druntime/src/std/intrinsic.d
|
|
--- druntime-old/src/std/intrinsic.d 1970-01-01 03:00:00.000000000 +0300
|
|
+++ druntime/src/std/intrinsic.d 2010-10-03 20:07:21.183624002 +0400
|
|
@@ -0,0 +1,212 @@
|
|
+/*
|
|
+ * D phobos intrinsics for LDC
|
|
+ *
|
|
+ * From GDC ... public domain!
|
|
+ */
|
|
+module std.intrinsic;
|
|
+
|
|
+// Check for the right compiler
|
|
+version(LDC)
|
|
+{
|
|
+ // OK
|
|
+}
|
|
+else
|
|
+{
|
|
+ static assert(false, "This module is only valid for LDC");
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Scans the bits in v starting with bit 0, looking
|
|
+ * for the first set bit.
|
|
+ * Returns:
|
|
+ * The bit number of the first bit set.
|
|
+ * The return value is undefined if v is zero.
|
|
+ */
|
|
+nothrow int bsf(uint v)
|
|
+{
|
|
+ uint m = 1; // single-bit probe mask, starts at bit 0
|
|
+ uint i;
|
|
+ for (i = 0; i < 32; i++,m<<=1) { // walk upward from the least significant bit
|
|
+ if (v&m)
|
|
+ return i;
|
|
+ }
|
|
+ return i; // v == 0: result is supposed to be undefined; this loop leaves i == 32
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Scans the bits in v from the most significant bit
|
|
+ * to the least significant bit, looking
|
|
+ * for the first set bit.
|
|
+ * Returns:
|
|
+ * The bit number of the first bit set.
|
|
+ * The return value is undefined if v is zero.
|
|
+ * Example:
|
|
+ * ---
|
|
+ * import std.intrinsic;
|
|
+ *
|
|
+ * int main()
|
|
+ * {
|
|
+ * uint v;
|
|
+ * int x;
|
|
+ *
|
|
+ * v = 0x21;
|
|
+ * x = bsf(v);
|
|
+ * printf("bsf(x%x) = %d\n", v, x);
|
|
+ * x = bsr(v);
|
|
+ * printf("bsr(x%x) = %d\n", v, x);
|
|
+ * return 0;
|
|
+ * }
|
|
+ * ---
|
|
+ * Output:
|
|
+ * bsf(x21) = 0<br>
|
|
+ * bsr(x21) = 5
|
|
+ */
|
|
+nothrow int bsr(uint v)
|
|
+{
|
|
+ uint m = 0x80000000; // single-bit probe mask, starts at bit 31
|
|
+ uint i;
|
|
+ for (i = 32; i ; i--,m>>>=1) { // walk downward from the most significant bit; i is the probed bit number plus one
|
|
+ if (v&m)
|
|
+ return i-1;
|
|
+ }
|
|
+ return i; // v == 0: result is supposed to be undefined; this loop leaves i == 0
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Tests the bit.
|
|
+ */
|
|
+nothrow int bt(uint *p, uint bitnum)
|
|
+{ // word index is bitnum / 32, bit within the word is bitnum & 31; yields -1 when that bit is set, 0 otherwise
|
|
+ return (p[bitnum / (uint.sizeof*8)] & (1<<(bitnum & ((uint.sizeof*8)-1)))) ? -1 : 0 ;
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Tests and complements the bit.
|
|
+ */
|
|
+nothrow int btc(uint *p, uint bitnum)
|
|
+{
|
|
+ uint * q = p + (bitnum / (uint.sizeof*8)); // word that holds the addressed bit
|
|
+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); // the bit's position inside that word
|
|
+ int result = *q & mask; // remember the old state before modifying
|
|
+ *q ^= mask; // flip the bit
|
|
+ return result ? -1 : 0; // -1 if the bit was set before the flip, 0 otherwise
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Tests and resets (sets to 0) the bit.
|
|
+ */
|
|
+nothrow int btr(uint *p, uint bitnum)
|
|
+{
|
|
+ uint * q = p + (bitnum / (uint.sizeof*8)); // word that holds the addressed bit
|
|
+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); // the bit's position inside that word
|
|
+ int result = *q & mask; // remember the old state before modifying
|
|
+ *q &= ~mask; // clear the bit
|
|
+ return result ? -1 : 0; // -1 if the bit was set before clearing, 0 otherwise
|
|
+}
|
|
+
|
|
+
|
|
+/**
|
|
+ * Tests and sets the bit.
|
|
+ * Params:
|
|
+ * p = a non-NULL pointer to an array of uints.
|
|
+ * bitnum = a bit number, starting with bit 0 of p[0],
|
|
+ * and progressing. It addresses bits like the expression:
|
|
+---
|
|
+p[bitnum / (uint.sizeof*8)] & (1 << (bitnum & ((uint.sizeof*8) - 1)))
|
|
+---
|
|
+ * Returns:
|
|
+ * A non-zero value if the bit was set, and a zero
|
|
+ * if it was clear.
|
|
+ *
|
|
+ * Example:
|
|
+ * ---
|
|
+import std.intrinsic;
|
|
+
|
|
+int main()
|
|
+{
|
|
+ uint array[2];
|
|
+
|
|
+ array[0] = 2;
|
|
+ array[1] = 0x100;
|
|
+
|
|
+ printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
|
|
+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
+
|
|
+ printf("btc(array, 35) = %d\n", <b>btc</b>(array, 35));
|
|
+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
+
|
|
+ printf("bts(array, 35) = %d\n", <b>bts</b>(array, 35));
|
|
+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
+
|
|
+ printf("btr(array, 35) = %d\n", <b>btr</b>(array, 35));
|
|
+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
+
|
|
+ printf("bt(array, 1) = %d\n", <b>bt</b>(array, 1));
|
|
+ printf("array = [0]:x%x, [1]:x%x\n", array[0], array[1]);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+ * ---
|
|
+ * Output:
|
|
+<pre>
|
|
+btc(array, 35) = 0
|
|
+array = [0]:x2, [1]:x108
|
|
+btc(array, 35) = -1
|
|
+array = [0]:x2, [1]:x100
|
|
+bts(array, 35) = 0
|
|
+array = [0]:x2, [1]:x108
|
|
+btr(array, 35) = -1
|
|
+array = [0]:x2, [1]:x100
|
|
+bt(array, 1) = -1
|
|
+array = [0]:x2, [1]:x100
|
|
+</pre>
|
|
+ */
|
|
+nothrow int bts(uint *p, uint bitnum)
|
|
+{
|
|
+ uint * q = p + (bitnum / (uint.sizeof*8)); // word that holds the addressed bit
|
|
+ uint mask = 1 << (bitnum & ((uint.sizeof*8) - 1)); // the bit's position inside that word
|
|
+ int result = *q & mask; // remember the old state before modifying
|
|
+ *q |= mask; // set the bit
|
|
+ return result ? -1 : 0; // -1 if the bit was already set, 0 otherwise
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Swaps bytes in a 4 byte uint end-to-end, i.e. byte 0 becomes
|
|
+ * byte 3, byte 1 becomes byte 2, byte 2 becomes byte 1, byte 3
|
|
+ * becomes byte 0.
|
|
+ */
|
|
+pragma(intrinsic, "llvm.bswap.i32") // declaration only: the pragma binds it to the named LLVM intrinsic, so there is no D body
|
|
+ uint bswap(uint val);
|
|
+
|
|
+/**
|
|
+ * Intended to read the I/O port at address p. Not yet implemented for LDC: always throws Exception.
|
|
+ */
|
|
+ubyte inp(uint p) { throw new Exception("inp intrinsic not yet implemented"); }
|
|
+
|
|
+/**
|
|
+ * ditto
|
|
+ */
|
|
+ushort inpw(uint p) { throw new Exception("inpw intrinsic not yet implemented"); }
|
|
+
|
|
+/**
|
|
+ * ditto
|
|
+ */
|
|
+uint inpl(uint p) { throw new Exception("inpl intrinsic not yet implemented"); }
|
|
+
|
|
+/**
|
|
+ * Intended to write v to the I/O port at address p. Not yet implemented for LDC: always throws Exception.
|
|
+ */
|
|
+ubyte outp(uint p, ubyte v) { throw new Exception("outp intrinsic not yet implemented"); }
|
|
+
|
|
+/**
|
|
+ * ditto
|
|
+ */
|
|
+ushort outpw(uint p, ushort v) { throw new Exception("outpw intrinsic not yet implemented"); }
|
|
+
|
|
+/**
|
|
+ * ditto
|
|
+ */
|
|
+uint outpl(uint p, uint v) { throw new Exception("outpl intrinsic not yet implemented"); }
|