Hi,
In the sample code below there are two for loops, one iterating over std::valarray objects and one over a simple user-written class, each performing Y[] = X[] * c. The valarray version takes about half the time of the user-struct version, and I'm trying to work out why:
1 2 3
|
V:\>test
Valarray explicit took 0.100871531 seconds
SizedArray explicit took 0.159492015 seconds
|
The assembler output from the compiler for the two loop bodies is identical excepting register selection:
1 2 3 4 5 6 7
|
$LL6@main:
movsdx xmm0, QWORD PTR [r8+rcx]
add rcx, 8
sub rdx, 1
mulsd xmm0, xmm6
movsdx QWORD PTR [rcx-8], xmm0
jne SHORT $LL6@main
|
1 2 3 4 5 6 7
|
$LL3@main:
movsdx xmm0, QWORD PTR [r12+rbx]
add rbx, 8
sub rsi, 1
mulsd xmm0, xmm6
movsdx QWORD PTR [rbx-8], xmm0
jne SHORT $LL3@main
|
I'm running the loops over 20,000,000 elements, so everything outside of the above code is moot.
I thought alignment might be a problem as the user struct used malloc, so I switched that to _aligned_malloc and applied __declspec( align( 32 ) ) to the struct definition, but the performance difference remains.
Switching the ordering of the two loops doesn't change anything; so I don't think this is down to cold versus warm cache.
Basically, I've run out of ideas for why two apparently identical loops should run consistently at such different speeds; and I'm hoping someone here will have some ideas.
Note: The built-in timing code is being cross checked by also instrumenting the code externally using VerySleepyCS and the results are always consistent:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
|
#pragma warning(disable: 4530 )
#include <conio.h>
#include <cstdlib>
#include <cstdio>
#include <valarray>
#include <vector>
#include <windows.h>
#include <intrin.h>
using namespace std;
/// Minimal fixed-size array of T backed by a 32-byte-aligned heap buffer.
///
/// Intended for trivially constructible element types (the file instantiates
/// it with double): elements are value-initialised by plain assignment, no
/// constructors or destructors of T are run.
///
/// The object owns its buffer: it is released in the destructor, and copying
/// is disabled because a member-wise copy would make two objects free the
/// same buffer.
template<typename T>
struct __declspec( align( 16 ) ) SizedArray {
    T *p;          ///< Buffer obtained from _aligned_malloc; NULL if the allocation failed.
    size_t elems;  ///< Number of usable elements (0 if the allocation failed).

    /// Allocates room for `size` elements and sets each one to T().
    SizedArray( size_t size ) {
        p = (T*)_aligned_malloc( sizeof( T ) * size, 32 );
        // Never report a size the buffer does not actually have.
        elems = p ? size : 0;
        // Value-initialise every element, matching what valarray<T>( n ) does.
        // This also means the constructor, not the first loop that happens to
        // write to the array, is the first code to touch the buffer.
        // NOTE(review): first-touch cost is a likely contributor to timing
        // differences against valarray -- confirm by measurement.
        for( size_t i = 0; i < elems; ++i ) p[ i ] = T();
    }

    /// Releases the buffer (a NULL pointer is accepted by _aligned_free).
    ~SizedArray() { _aligned_free( p ); }

    /// Unchecked mutable element access.
    T &operator[]( size_t i ) { return p[ i ]; }

    /// Unchecked read access; returns the element by value.
    const T operator[]( size_t i ) const { return p[ i ]; }

    /// Number of elements in the array.
    size_t size() const { return elems; }

private:
    // Declared but not defined: copying would lead to a double free.
    SizedArray( const SizedArray & );
    SizedArray &operator=( const SizedArray & );
};
// Number of elements in every test array (20 million).
const size_t VSIZE = 20000000I64;
int main() {
valarray<double> X( VSIZE ), Y( VSIZE ), Z( VSIZE );
SizedArray<double> Xs( VSIZE ), Ys( VSIZE );
double const c = 3.141592653;
__int64 start, stop;
for( int i=0; i < VSIZE; ++i ) X[ i ] = Xs[ i ] = rand();
getch();
SetThreadAffinityMask( GetCurrentThread(), 1 ); Sleep( 0 );
SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
start = __rdtsc(); _ReadWriteBarrier();
0.14s for( size_t i=0; i < VSIZE; ++i ) Y[ i ] = X[ i ] * c;
_ReadWriteBarrier(); stop = __rdtsc();
printf( "Valarray explicit took %.9f seconds\n", (double)( stop - start ) / 2.4e9 );
start = __rdtsc(); _ReadWriteBarrier();
0.27s for( size_t i=0; i < VSIZE; ++i ) Ys[ i ] = Xs[ i ] * c;
_ReadWriteBarrier(); stop = __rdtsc();
printf( "SizedArray explicit took %.9f seconds\n", (double)( stop - start ) / 2.4e9 );
return 0;
}
|