#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
/* Evaluate an OpenCL call once and abort with a diagnostic message if it
 * did not return CL_SUCCESS.  The do/while(0) wrapper makes the macro
 * behave as a single statement (safe inside unbraced if/else). */
#define CL_CHECK(_expr) \
do { cl_int _err = (_expr); \
     if (_err == CL_SUCCESS) break; \
     /* #_expr stringizes the failing call for the error report */ \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
     abort(); \
} while (0)
/* OpenCL C source for a naive square-matrix multiply: each work-item
 * computes one element C[row][col] of C = A * B, where all matrices are
 * width x width in row-major order.
 *
 * Fix: the inner product must read column 'col' of B, i.e.
 * inputB[i*width + col].  The previous inputB[col*width + i] walked ROW
 * 'col' of B and therefore computed A * transpose(B); the host's symmetric
 * test matrices (B[i][j] = i + j) masked the bug. */
const char *programSource[] = {
    "__kernel void simpleMultiply(__global float* outputC,          \n",
    "                             __global float* inputA,           \n",
    "                             __global float* inputB,           \n",
    "                             int width) {                      \n",
    "    int row = get_global_id(1);                                \n",
    "    int col = get_global_id(0);                                \n",
    "    float sum = 0.0f;                                          \n",
    "    for (int i = 0; i < width; i++)                            \n",
    "        sum += inputA[row*width + i] * inputB[i*width + col];  \n",
    "    outputC[row*width + col] = sum;                            \n",
    "}                                                              \n"
};
/* Multiply two tailleMatrices x tailleMatrices matrices on the first OpenCL
 * device of the first platform, print the kernel execution time (event
 * profiling) and the resulting matrix.  Returns 0 on success, 1 on a host
 * setup failure; OpenCL API failures abort via CL_CHECK. */
int main(void) {
    /* --- Host matrices: A[i][j] = B[i][j] = i + j -------------------- */
    const int tailleMatrices = 4;
    const int HA = tailleMatrices;          /* rows of A          */
    const int WA = tailleMatrices;          /* cols of A          */
    const int HB = tailleMatrices;          /* rows of B          */
    const int WB = tailleMatrices;          /* cols of B          */
    /* size_t, not int: byte counts passed to clCreateBuffer & friends */
    const size_t datasize = (size_t)tailleMatrices * tailleMatrices * sizeof(float);

    float *MatriceA      = (float *)malloc(datasize);
    float *MatriceB      = (float *)malloc(datasize);
    float *MatriceResult = (float *)malloc(datasize);
    if (!MatriceA || !MatriceB || !MatriceResult) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < tailleMatrices; i++) {
        for (int j = 0; j < tailleMatrices; j++) {
            MatriceA[i * tailleMatrices + j]      = (float)(i + j);
            MatriceB[i * tailleMatrices + j]      = (float)(i + j);
            MatriceResult[i * tailleMatrices + j] = 0.0f;
        }
    }

    /* Profiling timestamps are nanoseconds in cl_ulong (integers, not
     * floats — the old 0.0f initializers were misleading). */
    cl_ulong time_start = 0, time_end = 0;
    cl_event event;
    cl_int status;

    /* --- STEP 1: platform ------------------------------------------- */
    cl_uint numPlatforms = 0;
    CL_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
    if (numPlatforms == 0) {
        fprintf(stderr, "No OpenCL platform found\n");
        return 1;
    }
    cl_platform_id *platforms =
        (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
    if (!platforms) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    CL_CHECK(clGetPlatformIDs(numPlatforms, platforms, NULL));

    /* --- STEP 2: devices -------------------------------------------- */
    cl_uint numDevices = 0;
    CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices));
    cl_device_id *devices =
        (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
    if (numDevices == 0 || !devices) {
        fprintf(stderr, "No OpenCL device found\n");
        return 1;
    }
    CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL));

    /* --- STEP 3: context -------------------------------------------- */
    cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
    CL_CHECK(status);

    /* --- STEP 4: command queue (profiling enabled for timing) -------- */
    cl_command_queue cmdQueue =
        clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &status);
    CL_CHECK(status);

    /* --- STEP 5: device buffers -------------------------------------- */
    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    (size_t)WA * HA * sizeof(float), NULL, &status);
    CL_CHECK(status);
    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    (size_t)WB * HB * sizeof(float), NULL, &status);
    CL_CHECK(status);
    cl_mem bufferResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status);
    CL_CHECK(status);

    /* --- STEP 6: blocking writes of the inputs ----------------------- */
    CL_CHECK(clEnqueueWriteBuffer(cmdQueue, bufferA, CL_TRUE, 0,
                                  (size_t)WA * HA * sizeof(float), MatriceA, 0, NULL, NULL));
    CL_CHECK(clEnqueueWriteBuffer(cmdQueue, bufferB, CL_TRUE, 0,
                                  (size_t)WB * HB * sizeof(float), MatriceB, 0, NULL, NULL));

    /* --- STEP 7: build the program ------------------------------------ */
    cl_program program = clCreateProgramWithSource(
        context, sizeof(programSource) / sizeof(*programSource),
        programSource, NULL, &status);
    CL_CHECK(status);
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
    /* Dump each device's build log (errors or warnings) before bailing. */
    for (cl_uint i = 0; i < numDevices; i++) {
        size_t len = 0;
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
        if (len > 2) {
            char *log = (char *)malloc(len);
            if (log) {
                clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, len, log, NULL);
                fprintf(stderr, "[*] Build log, device %u: %s\n", i, log);
                free(log);   /* was leaked in the original */
            }
        }
    }
    CL_CHECK(status);   /* abort if the build failed */

    /* --- STEP 8: kernel ---------------------------------------------- */
    cl_kernel kernel = clCreateKernel(program, "simpleMultiply", &status);
    CL_CHECK(status);

    /* --- STEP 9: kernel arguments ------------------------------------ */
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferResult));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferA));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferB));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &tailleMatrices));

    /* --- STEP 10/11: launch a 2D NDRange, one work-item per element -- */
    size_t globalWorkSize[2] = { (size_t)WB, (size_t)HA };  /* (cols, rows) */
    CL_CHECK(clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL,
                                    globalWorkSize, NULL, 0, NULL, &event));

    /* Profiling info is only defined once the event has COMPLETED;
     * querying right after enqueue (as before) is invalid. */
    CL_CHECK(clWaitForEvents(1, &event));
    CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                                     sizeof(time_start), &time_start, NULL));
    CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                                     sizeof(time_end), &time_end, NULL));
    printf("\nExecution time in milliseconds = %0.3f ms\n",
           (double)(time_end - time_start) / 1000000.0);

    /* --- STEP 12: read back and print the result --------------------- */
    CL_CHECK(clEnqueueReadBuffer(cmdQueue, bufferResult, CL_TRUE, 0,
                                 datasize, MatriceResult, 0, NULL, NULL));
    for (int i = 0; i < tailleMatrices; i++) {
        for (int j = 0; j < tailleMatrices; j++) {
            printf("%f -", MatriceResult[i * tailleMatrices + j]);
        }
        printf("\n");
    }

    /* --- STEP 13: release every OpenCL resource (the original leaked
     * the event, bufferResult and MatriceResult) ---------------------- */
    clReleaseEvent(event);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(bufferA);
    clReleaseMemObject(bufferB);
    clReleaseMemObject(bufferResult);
    clReleaseCommandQueue(cmdQueue);
    clReleaseContext(context);

    /* Host resources. */
    free(MatriceA);
    free(MatriceB);
    free(MatriceResult);
    free(platforms);
    free(devices);
    return 0;
}