Why is Windows C++ multi-threading IOPS much faster than IOMeter?

Greetings,
Can anybody help me a little out of my difficulty?

I have an SSD and I am trying to use it to simulate my program's I/O performance; however, the IOPS calculated by my program is much, much higher than what IOMeter reports.

My SSD is a PLEXTOR PX-128M3S; according to IOMeter, its maximum 512B random read IOPS is around 94k (queue depth 32).
However, my program (32 Windows threads) can reach around 500k 512B IOPS, around 5 times what IOMeter reports!!! I did data validation but didn't find any errors in the data fetching. Is it because my data fetching is in order?

I have pasted my code below (it mainly fetches 512B from the file and releases it; I did use 4 bytes (an int) to validate the program logic and didn't find a problem). Can anybody help me figure out where I am going wrong?

Thanks so much in advance!!

Nai Yan.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#include <stdio.h>
#include <Windows.h>

/* 
**  Purpose: Verify file random read IOPS in comparison with IOMeter    
**  Author:  Nai Yan
**  Date:    Feb. 9th, 2012
**/

// ---- Shared benchmark state ----
long completeIOs = 0;                         // completed reads; workers bump it with InterlockedIncrement
long completeBytes = 0;                       // byte counter; not updated anywhere in the code shown here
int  threadCount = 32;                        // number of worker threads FileRead starts

// Item count handed to InitPool, which writes this many ints
// (2^30 ints -> a 4G file when int is 4 bytes).
unsigned long long length = 1024ULL * 1024ULL * 1024ULL;

int interval = 1024;                          // multiplied by the loop index to get each read offset
int resultArrayLen = 320000;                  // size of result[]; divided by threadCount to get reads per thread
int *result = new int[resultArrayLen];        // buffer returned by FileRead

// ---- Forward declarations ----
double GetSecs(void);                                        // current high-resolution time, in seconds
int InitPool(long long length, char *path, int sample);     // create the test file; returns 1 on success, a non-1 value otherwise
int * FileRead(char * path);                                 // run the multi-threaded read pass
unsigned int DataVerification(int *result, int sampleItem);  // count fetched ints that differ from sampleItem

/*
**  Entry point: creates the test file, runs the multi-threaded read pass
**  and prints the measured read rate (completed reads / elapsed seconds).
**
**  Returns 0 after a completed run, 1 when the test file could not be
**  initialized.
*/
int main()
{
	int sampleItem = 0x1;

	// A writable array instead of a pointer to a string literal: InitPool and
	// FileRead take a non-const char*, and binding a literal to char* is not
	// valid in standard C++.
	char fPath[] = "G:\\workspace\\4G.bin";

	if (InitPool(length,fPath,sampleItem)!= 1)
	{
		printf("File write err... \n");
		return 1;                       // no valid test file, so nothing to measure
	}

	//start do random I/Os from initialized file
	double start = GetSecs();

	int * fetchResult = FileRead(fPath);

	double end = GetSecs();

	printf("File read IOPS is %.4f per second.. \n",completeIOs/(end - start));

	//start data validation, for 4 bytes fetch only
	//(enable together with the 4-byte read variant in the worker thread)

//	unsigned int invalidIO = DataVerification(fetchResult,sampleItem);

//	if (invalidIO !=0)
//	{
//		printf("Total invalid data fetch IOs are %d", invalidIO);
//	}

	(void)fetchResult;                  // only consumed by the optional validation above

	return 0;
}



/*
**  Create (or truncate) the file at `path` and fill it with `length` copies
**  of the int `sample`.
**
**  length : number of ints to write (not bytes); values <= 0 produce an
**           empty file
**  path   : file to create
**  sample : value written for every item
**
**  Returns 1 on success and 0 when a write or the final close fails.
**  Terminates the process with exit(-1) when the file cannot be opened.
*/
int InitPool(long long length, char* path, int sample)
{
	enum { CHUNK_ITEMS = 4096 };
	int chunk[CHUNK_ITEMS];

	printf("Start initializing test data ... \n");

	FILE * fp = fopen(path,"wb");

	if (fp == NULL)
	{
		printf("file open err... \n");
		exit (-1);
	}

	// Every item is identical, so fill one chunk up front and write it
	// repeatedly instead of issuing one fwrite call per int.
	for (int k = 0; k < CHUNK_ITEMS; k++)
	{
		chunk[k] = sample;
	}

	// Count down in long long: an int counter would overflow before reaching
	// a length above INT_MAX.
	long long remaining = length;

	while (remaining > 0)
	{
		size_t batch = (remaining < CHUNK_ITEMS) ? (size_t)remaining : (size_t)CHUNK_ITEMS;

		if (fwrite(chunk,sizeof(int),batch,fp) != batch)
		{
			fclose(fp);
			return 0;
		}

		remaining -= (long long)batch;
	}

	// fclose flushes the remaining buffered data, so its result matters too.
	if (fclose(fp) != 0)
	{
		return 0;
	}

	printf("Data initialization is complete...\n");

	return 1;
}

double GetSecs(void)

{
    LARGE_INTEGER frequency;
    LARGE_INTEGER start;

    if(! QueryPerformanceFrequency(&frequency)) 
        printf("QueryPerformanceFrequency Failed\n");

    if(! QueryPerformanceCounter(&start))
        printf("QueryPerformanceCounter Failed\n");
	
	return ((double)start.QuadPart/(double)frequency.QuadPart);
    
}

// Argument bundle passed to a worker thread: which file to open and the
// starting index assigned to that worker.
class input
{
public:
	char *path;       // file the worker opens
	int starting;     // starting index assigned to the worker

	// Initializers are listed in member declaration order.
	input (int firstIndex, char * fileName)
		: path(fileName),
		  starting(firstIndex)
	{
	}
};

//Workers
/*
**  Worker thread body. Opens the file named by the `input` object passed in
**  lpThreadParameter and performs resultArrayLen/threadCount reads of 512
**  bytes, the i-th one at byte offset i * interval, incrementing completeIOs
**  after each successful read.
**
**  Returns 0 when all reads succeed. Prints a message and terminates the
**  process with exit(-1) when the file cannot be opened or a read fails.
**
**  NOTE(review): the offsets do not depend on the thread's `starting` value,
**  so every worker reads the same ascending sequence through buffered stdio.
**  These reads are likely served from stdio/OS caches rather than the
**  device - confirm before comparing against IOMeter numbers.
*/
DWORD WINAPI FileReadThreadEntry(LPVOID lpThreadParameter)
{
	input * in = (input*) lpThreadParameter;

	FILE * fp = fopen(in->path,"rb");

	if (fp == NULL)
	{
		printf("File open err... \n");
		exit(-1);
	}

	// One buffer for the whole loop: avoids a heap allocation per read and
	// the new[] / scalar delete mismatch.
	unsigned char block[512];

	int reads = resultArrayLen/threadCount;

	for (int i=0; i<reads; i++)
	{
		//For 512 bytes fetch each time.
		//For the 4-byte validation variant, read sizeof(int) bytes into
		//result[in->starting + i] instead of block.
		if (fseek(fp,(long)i * interval,SEEK_SET) != 0 ||
		    fread(block,sizeof block,1,fp) != 1)
		{
			printf("file read err...\n");
			exit(-1);
		}

		InterlockedIncrement(&completeIOs);
	}

	fclose(fp);

	return 0;
}

int * FileRead(char * p)
{
	printf("Starting reading file ... \n");
	
		
	HANDLE mWorkThread[256];                      //max 256 threads
	completeIOs = 0;
		
	int slice = int (resultArrayLen/threadCount);

	for(int i = 0; i < threadCount; i++)
	{
		mWorkThread[i] = CreateThread(
					NULL,
					0,
					FileReadThreadEntry,
					(LPVOID)(new input(i*slice,p)),
					0, 
					NULL);
	}

   WaitForMultipleObjects(threadCount, mWorkThread, TRUE, INFINITE);
   
   printf("File read complete... \n");

   return result;

}

/*
**  Count the entries of `result` that differ from `sampleItem`.
**
**  result     : array of resultArrayLen fetched ints
**  sampleItem : value every entry is expected to hold
**
**  Returns the number of mismatching entries. All resultArrayLen entries are
**  checked; dividing the bound by `interval` would inspect only the first
**  few hundred results.
*/
unsigned int DataVerification(int* result, int sampleItem)
{
	unsigned int invalid = 0;

	for (int i=0; i< resultArrayLen; i++)
	{
		if (result[i]!=sampleItem)
		{
			invalid ++;
		}
	}

	return invalid;
}
I didn't take a look at your code, but maybe the result is better due to some active caches.
The tools that measure the speed either can disable the caches or perform actions in a way that renders caching ineffective.

I'm no expert in this field but I would read from a great number of files (>100).
It seems you only read from one file. This is where caches excel I suppose ;-).
Topic archived. No new replies allowed.