1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
void mmul_sse_nsize(const float * a, const float * b, float * r, int size)
{
__m128 a_line, b_line, r_line;
int bound = size*size;
int rows = size;
for (int i = 0; i < bound; i += 4)
{
// unroll the first step of the loop to avoid having to initialize r_line to zero
a_line = _mm_load_ps(a); // a_line = vec4(column(a, 0))
b_line = _mm_set1_ps(b[i]); // b_line = vec4(b[i][0])
r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
for (int j = 1; j< rows ; j++)
{
a_line = _mm_load_ps(&a[j * 4]); // a_line = vec4(column(a, j))
b_line = _mm_set1_ps(b[i + j]); // b_line = vec4(b[i][j])
// r_line += a_line * b_line
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&r[i], r_line); // r[i] = r_line
}
}
int main(int argc, _TCHAR* argv[])
{
__int64 ctr1 = 0, ctr2 = 0, freq = 0;
__int64 ctrptr1 = 0, ctrptr2 = 0, freqptr = 0;
int N_size = 40;
float **b = new float*[N_size], **a = new float*[N_size], **r = new float*[N_size];
const float X = 100.0;
ofstream myfile("plots_vec.m" , ios::out |ios::app);
//__m128 vec1 = { 1.0, 1.0, 1.0, 1.0 }; //_mm_set1_ps( b);
//__m128 vec2 = { 2.0, 2.0, 2.0, 2.0 }; // _mm_set1_ps( a);
//myfile << "# SSE Dot Product\n";
myfile << "\nN= " << N_size<<"\n";
myfile << "x=[";
/* initialize matrix a and b to avoid segfault*/
for (int i = 0; i < N_size; ++i) {
a[i] = new float[N_size];
b[i] = new float[N_size];
r[i] = new float[N_size];
}
/* init matrix a*/
for (int i = 0; i < N_size; i++)
{
//a[i] = new float [N_size];
for (int j = 0; j < N_size; j++)
a[i][j] = (static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / X)));
//a[i] = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / X));
}
/* init matrix b */
for (int i = 0; i < N_size; i++)
{
//a[i] = new float [N_size];
for (int j = 0; j < N_size; j++)
b[i][j] = (static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / X)));
//a[i] = static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / X));
}
// Start timing the code.
if (QueryPerformanceCounter((LARGE_INTEGER *)&ctrptr1) != 0)
{
// Code segment is being timed.
//mmul_sse( *a , *b, *r );
// mmul_sse_nsize(*a, *b, *r, N_size);
// SSEMultAlligned(*a, *b, *r, N_size );
matrixMultIntrin(*a, *b, N_size, *r);
...
|