Différence de perfs énorme entre Linux et Windows : où est le problème ?

**nikopol74** · 26/04/2012, 18h53

Bonjour, après avoir réalisé l'algorithme de Floyd (plus courts chemins) avec mémoire partagée, j'ai constaté une énorme différence de performances entre Windows et Linux.

Voilà une capture d'écran qui illustre le phénomène :
http://www.zimagez.com/zimage/cudaperf.php

Voilà le code source (si quelqu'un veut essayer) :

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
 
 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <stdio.h>
 
using namespace std;

// Edge weight used by remplirTableau for "no direct edge" between two vertices.
int infini = 9999;

// Compute time, in seconds, stored by the last run of each variant:
//   shared - floydParShared (tiled kernels using shared memory)
//   global - floydPar       (one kernel per pivot, global memory only)
//   seq    - floydSequentiel (single-threaded CPU reference)
double shared = 0.0;
double global = 0.0;
double seq = 0.0;
 
// Kernels of the tiled (shared-memory) Floyd-Warshall variant; they are
// defined after the host driver that launches them.
__global__ void floydSharedPhase_1(int* sommet,int* adjacence,int taille, int k);
__global__ void floydSharedPhase_2(int* sommet,int* adjacence,int taille,int k);
__global__ void floydSharedPhase_3(int* sommet,int* adjacence,int taille,int k);
// Host driver of the tiled variant. The prototype now matches the definition
// (five parameters); the previous four-parameter prototype declared an
// overload that was never defined. affichageRes defaults to 0 (no debug dump)
// so four-argument calls remain valid.
void floydParShared(int *sommet, int *adjacence,int taille, int nthreads,int affichageRes = 0);
 
 
// Prints the dim x dim matrix stored row by row in tab: every value is
// preceded by four spaces and every row ends with a newline.
void affichage(int* tab, int dim){
	for(int ligne = 0; ligne < dim; ++ligne){
		const int* debutLigne = tab + ligne*dim;
		for(int col = 0; col < dim; ++col)
			std::cout << "    " << debutLigne[col];
		std::cout << std::endl;
	}
}
// Text progress bar step.
//   palier      - amount added to the threshold each time a tick is printed
//   courant     - current progress value
//   max         - value above which the closing "|" is printed
//   valCourante - in/out: threshold that courant must exceed to print a tick
// Prints "-" and raises the threshold by palier when courant has passed it,
// then prints "|" if the raised threshold exceeds max. Prints nothing otherwise.
void barreChargement(long double palier,long double courant,long double max,long double* valCourante){
	if(!(courant > *valCourante))
		return;

	*valCourante += palier;
	std::cout << "-";
	if(*valCourante > max)
		std::cout << "|";
}
// Builds a random weighted graph and resets the predecessor matrix.
//   tab  - out: tX*tX weights, cell (x, y) stored at tab[x + tX*y]
//   adja - out: tX*tX predecessors, all set to -1
//   tX   - number of vertices (matrix side)
// Diagonal cells get 0. Every other cell gets `infini` with probability 1/5,
// otherwise 10 + (rand() % infini) / 10. The sequence of rand() calls is the
// same as before, so a given seed still produces the same graph.
// Afterwards the first 36 cells are overwritten with a fixed 6x6 example
// (it only forms a real 6x6 graph when tX == 6). That overwrite used to be
// unconditional and wrote out of bounds for tX < 6; it is now skipped when
// the matrix has fewer than 36 cells.
void remplirTableau(int* tab, int* adja,int tX){
	long double progression = 0;
	long double valCourant = 0;
	long double max = tX*tX;
	long double palier = (tX*tX)/10;   // integer division on purpose (unchanged tick spacing)
	cout << "Initialisation        |";

	for(int y = 0; y < tX; y++){
		for(int x = 0; x < tX; x++){
			const int cellule = x + tX*y;
			adja[cellule] = -1;
			if(x == y){
				tab[cellule] = 0;
			}else if(rand() % 5 == 0){
				tab[cellule] = infini;
			}else{
				// '%' and '/' have equal precedence and associate left to right.
				tab[cellule] = 10 + ((rand() % infini) / 10);
			}
			progression++;
			barreChargement(palier,progression,max,&valCourant);
		}
	}

	const int nbCasesExemple = 36;
	if((long)tX*tX >= nbCasesExemple){
		const int I = infini;
		const int exemple[nbCasesExemple] = {
			 0, I, I, I, 3, I,
			 9, 0, 3, I, I, I,
			 5, I, 0, 2, I, I,
			12, I, 4, 0, 6, 1,
			 I, I, 1, I, 0, I,
			 I, I, I, I, 2, 0
		};
		for(int i = 0; i < nbCasesExemple; i++)
			tab[i] = exemple[i];
	}
	cout << endl;

}
 
// ------------------- Floyd-Warshall on one CPU thread ------------------------------------------
// In-place all-pairs shortest paths on the tX x tX matrix `sommet`
// (cell (x, y) at sommet[x + y*tX]). Whenever going through pivot k is
// strictly shorter, the distance is updated and k is stored in `adjacence`.
// Prints a progress bar and the elapsed clock() time, and stores that time
// (seconds) in the global `seq`. The progress-bar calls are inside the timed
// region.
void floydSequentiel(int* sommet, int*adjacence,int tX){
	const long double n = tX;
	const long double total = n*n*n;      // number of inner-loop iterations
	const long double palier = total/10;
	long double progression = 0;
	long double valCourant = 0;

	std::cout << ">Floyd Sequentiel     |";
	const clock_t debut = clock();
	for(int k = 0; k < tX; ++k){
		for(int y = 0; y < tX; ++y){
			for(int x = 0; x < tX; ++x){
				const int cible = x + y*tX;
				const int viaK = sommet[x + k*tX] + sommet[k + y*tX];
				if(viaK < sommet[cible]){
					sommet[cible] = viaK;
					adjacence[cible] = k;
				}
				progression++;
				barreChargement(palier,progression,total,&valCourant);
			}
		}
	}
	const clock_t fin = clock();

	const double calc = (double)(fin - debut) / CLOCKS_PER_SEC;
	std::cout <<" CPU :" << calc  << "s" << std::endl;
	seq = calc;
}
 
//------------------- Floyd-Warshall on the GPU without shared memory ---------------------------
// One relaxation step for pivot k, one thread per matrix cell.
// Expects a 2D launch whose grid covers at least taille x taille threads;
// threads outside the matrix exit immediately. The x thread index selects the
// column and the y thread index the row, so consecutive threadIdx.x values
// touch consecutive addresses. No shared memory is used.
__global__ void KernelPar(int* sommet,int* adjacence,int taille,int k)
{
	const int colonne = blockDim.x*blockIdx.x + threadIdx.x;
	const int ligne   = blockDim.y*blockIdx.y + threadIdx.y;
	if(ligne >= taille || colonne >= taille)
		return;

	const int cellule = ligne*taille + colonne;
	const int viaK = sommet[ligne*taille + k] + sommet[k*taille + colonne];
	if(sommet[cellule] > viaK){
		sommet[cellule] = viaK;
		adjacence[cellule] = k;
	}
}
// Aborts with a readable message when a CUDA runtime call fails. Guarded so
// that the file can define it in more than one place without clashing.
#ifndef CUDA_CHECK
#define CUDA_CHECK(appel)                                                      \
	do {                                                                       \
		cudaError_t etat_ = (appel);                                           \
		if (etat_ != cudaSuccess) {                                            \
			fprintf(stderr, "Erreur CUDA %s:%d : %s\n", __FILE__, __LINE__,    \
			        cudaGetErrorString(etat_));                                \
			exit(EXIT_FAILURE);                                                \
		}                                                                      \
	} while (0)
#endif

// Floyd-Warshall on the GPU using global memory only: one KernelPar launch
// per pivot.
//   sommet, adjacence - in/out host matrices of tX*tX ints
//   tX                - matrix side
//   nthreads          - block side; blocks are nthreads x nthreads threads
// Prints a progress bar, the accumulated kernel time and the time spent in
// allocations/copies, and stores the kernel time (seconds) in `global`.
//
// Changes from the previous version:
//  * every CUDA call is checked (an invalid launch configuration or a failed
//    allocation used to go unnoticed and produce meaningless timings);
//  * kernel time is measured with CUDA events on the default stream instead
//    of clock(). clock() is specified as processor time, but some C runtimes
//    (notably Microsoft's) return elapsed wall-clock time, so clock() figures
//    are not comparable across operating systems.
// The allocation/copy time is still measured with clock() and therefore
// keeps that platform dependence.
void floydPar(int *sommet, int *adjacence,int tX,int nthreads){
	int* dev_sommet = 0;
	int* dev_adjacence = 0;
	const size_t octets = (size_t)tX*tX*sizeof(int);
	double tempscalcul = 0.0;
	double tempsmemoire = 0.0;
	long double progression = 0;
	long double valCourant = 0;
	long double max = tX;
	long double palier = tX/10;
	clock_t start, finish;

	start = clock();
	cout << ">Paralelle Global     |";
	CUDA_CHECK(cudaMalloc((void**)&dev_sommet,octets));
	CUDA_CHECK(cudaMalloc((void**)&dev_adjacence,octets));
	CUDA_CHECK(cudaMemcpy(dev_sommet,sommet,octets,cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dev_adjacence,adjacence,octets,cudaMemcpyHostToDevice));
	finish = clock();
	tempsmemoire += (double)(finish - start) / CLOCKS_PER_SEC;

	cudaEvent_t debutNoyau, finNoyau;
	CUDA_CHECK(cudaEventCreate(&debutNoyau));
	CUDA_CHECK(cudaEventCreate(&finNoyau));

	// Ceil-div so that a partial last block covers the matrix tail.
	dim3 blockdim(nthreads,nthreads);
	int reste = (tX % nthreads > 0);
	dim3 griddim(tX/nthreads+reste,tX/nthreads+reste);

	for(int i = 0; i<tX;i++){
		CUDA_CHECK(cudaEventRecord(debutNoyau, 0));
		KernelPar<<<griddim,blockdim>>>(dev_sommet,dev_adjacence,tX,i);
		CUDA_CHECK(cudaGetLastError());            // launch-configuration errors
		CUDA_CHECK(cudaEventRecord(finNoyau, 0));
		CUDA_CHECK(cudaEventSynchronize(finNoyau)); // also surfaces in-kernel faults
		float millisecondes = 0.0f;
		CUDA_CHECK(cudaEventElapsedTime(&millisecondes, debutNoyau, finNoyau));
		tempscalcul += millisecondes / 1000.0;
		progression++;
		barreChargement(palier,progression,max,&valCourant);
	}

	CUDA_CHECK(cudaEventDestroy(debutNoyau));
	CUDA_CHECK(cudaEventDestroy(finNoyau));

	start = clock();
	CUDA_CHECK(cudaMemcpy(sommet,dev_sommet,octets,cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaMemcpy(adjacence,dev_adjacence,octets,cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(dev_sommet));
	CUDA_CHECK(cudaFree(dev_adjacence));
	finish = clock();
	tempsmemoire += (double)(finish - start) / CLOCKS_PER_SEC;

	global = tempscalcul;
	cout <<" GPU :" <<tempscalcul<< "s" <<"   Memoire: " << tempsmemoire<< endl;
}
 
// ------------------ Floyd-Warshall with shared memory (tiled) ----------------------------------
// Aborts with a readable message when a CUDA runtime call fails. Guarded so
// that the file can define it in more than one place without clashing.
#ifndef CUDA_CHECK
#define CUDA_CHECK(appel)                                                      \
	do {                                                                       \
		cudaError_t etat_ = (appel);                                           \
		if (etat_ != cudaSuccess) {                                            \
			fprintf(stderr, "Erreur CUDA %s:%d : %s\n", __FILE__, __LINE__,    \
			        cudaGetErrorString(etat_));                                \
			exit(EXIT_FAILURE);                                                \
		}                                                                      \
	} while (0)
#endif

// Waits for `fin` and returns the time, in seconds, between the two events.
static double secondesEntre(cudaEvent_t debut, cudaEvent_t fin){
	float millisecondes = 0.0f;
	CUDA_CHECK(cudaEventSynchronize(fin));   // also surfaces in-kernel faults
	CUDA_CHECK(cudaEventElapsedTime(&millisecondes, debut, fin));
	return millisecondes / 1000.0;
}

// Debug dump: copies both device matrices back to the host buffers and prints
// them, followed by `lignesVides` empty lines.
static void afficherEtatPhase(int* sommet, int* adjacence,
                              int* dev_sommet, int* dev_adjacence,
                              size_t octets, int taille,
                              int phase, int iteration, int lignesVides){
	CUDA_CHECK(cudaMemcpy(sommet,dev_sommet,octets,cudaMemcpyDeviceToHost));
	cout << " sommets après la phase " << phase << ", iteration" << iteration <<endl;
	affichage(sommet,taille);
	cout << endl;
	CUDA_CHECK(cudaMemcpy(adjacence,dev_adjacence,octets,cudaMemcpyDeviceToHost));
	cout << " adjacence après la phase " << phase << ", iteration" << iteration <<endl;
	affichage(adjacence,taille);
	for(int l = 0; l < lignesVides; l++)
		cout << endl;
}

// Tiled Floyd-Warshall: for each diagonal tile i, runs phase 1 (the diagonal
// tile), phase 2 (the tiles sharing its row or column) and phase 3 (all other
// tiles).
//   sommet, adjacence - in/out host matrices of taille*taille ints
//   taille            - matrix side
//   nthreads          - tile side; blocks are nthreads x nthreads threads
//   affichageRes      - non-zero: dump both matrices after every phase
//                       instead of drawing the progress bar
// Prints the per-phase device times and stores their sum (seconds) in
// `shared`.
//
// Changes from the previous version:
//  * the device buffers are released (they used to leak on every call, which
//    accumulated across the sizes tried by main);
//  * every CUDA call is checked;
//  * phases are timed with CUDA events on the default stream instead of
//    clock(), whose meaning (processor vs wall-clock time) differs between C
//    runtimes;
//  * the progress-bar step is no longer 0 when there are exactly 10 tiles per
//    side.
void floydParShared(int *sommet, int *adjacence,int taille, int nthreads,int affichageRes){
	cout << ">Paralelle Shared     |";
	int* dev_sommet = 0;
	int* dev_adjacence = 0;
	const int reste = (taille % nthreads > 0);
	const int nbTuiles = taille/nthreads + reste;   // tiles per matrix side (ceil-div)
	double tPhase1 = 0.0;
	double tPhase2 = 0.0;
	double tPhase3 = 0.0;
	long double progression = 0;
	long double valCourant = 0;
	long double max = 3*nbTuiles;
	long double palier = (nbTuiles >= 10) ? (3*(taille/nthreads)+reste)/10
	                                      : 3*nbTuiles;

	// Launch geometry.
	dim3 threads(nthreads,nthreads);
	dim3 nBlocks_phase3(nbTuiles,nbTuiles);
	const int nblocks = 2*nbTuiles;   // phase 2: first half = pivot row, second half = pivot column

	// Byte sizes: whole matrix, and one tile (unit of dynamic shared memory).
	const size_t tailleSommet = (size_t)taille*taille*sizeof(int);
	const size_t tailleBlock = (size_t)nthreads*nthreads*sizeof(int);

	CUDA_CHECK(cudaMalloc((void**)&dev_sommet,tailleSommet));
	CUDA_CHECK(cudaMalloc((void**)&dev_adjacence,tailleSommet));
	CUDA_CHECK(cudaMemcpy(dev_sommet,sommet,tailleSommet,cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dev_adjacence,adjacence,tailleSommet,cudaMemcpyHostToDevice));

	cudaEvent_t debut, fin;
	CUDA_CHECK(cudaEventCreate(&debut));
	CUDA_CHECK(cudaEventCreate(&fin));

	for(int i = 0; i < nbTuiles; i++){
		// Phase 1: one block, one tile of shared memory.
		CUDA_CHECK(cudaEventRecord(debut, 0));
		floydSharedPhase_1<<<1,threads,tailleBlock>>>(dev_sommet,dev_adjacence,taille,i);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaEventRecord(fin, 0));
		tPhase1 += secondesEntre(debut, fin);
		if(affichageRes){
			afficherEtatPhase(sommet,adjacence,dev_sommet,dev_adjacence,tailleSommet,taille,1,i,1);
		}else{
			progression++;
			barreChargement(palier,progression,max,&valCourant);
		}

		// Phase 2: 2*nbTuiles blocks, two tiles of shared memory.
		CUDA_CHECK(cudaEventRecord(debut, 0));
		floydSharedPhase_2<<<nblocks,threads,2*tailleBlock>>>(dev_sommet,dev_adjacence,taille,i);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaEventRecord(fin, 0));
		tPhase2 += secondesEntre(debut, fin);
		if(affichageRes){
			afficherEtatPhase(sommet,adjacence,dev_sommet,dev_adjacence,tailleSommet,taille,2,i,1);
		}else{
			progression++;
			barreChargement(palier,progression,max,&valCourant);
		}

		// Phase 3: nbTuiles x nbTuiles blocks, three tiles of shared memory.
		CUDA_CHECK(cudaEventRecord(debut, 0));
		floydSharedPhase_3<<<nBlocks_phase3,threads,3*tailleBlock>>>(dev_sommet,dev_adjacence,taille,i);
		CUDA_CHECK(cudaGetLastError());
		CUDA_CHECK(cudaEventRecord(fin, 0));
		tPhase3 += secondesEntre(debut, fin);
		if(affichageRes){
			afficherEtatPhase(sommet,adjacence,dev_sommet,dev_adjacence,tailleSommet,taille,3,i,3);
		}else{
			progression++;
			barreChargement(palier,progression,max,&valCourant);
		}
	}

	CUDA_CHECK(cudaMemcpy(sommet,dev_sommet,tailleSommet,cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaMemcpy(adjacence,dev_adjacence,tailleSommet,cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaEventDestroy(debut));
	CUDA_CHECK(cudaEventDestroy(fin));
	CUDA_CHECK(cudaFree(dev_sommet));
	CUDA_CHECK(cudaFree(dev_adjacence));

	shared = tPhase1+tPhase2+tPhase3;
	cout << " GPU :" <<shared <<"s P1:" <<tPhase1 << " P2:" <<tPhase2 << " P3:" << tPhase3 <<endl;
}
// Phase 1 of the tiled Floyd-Warshall: relaxes the k-th diagonal tile against
// itself.
// Launch: a single square block (blockDim.x == blockDim.y = tile side) with
// at least blockDim.x*blockDim.y*sizeof(int) bytes of dynamic shared memory.
// Cell (x, y) of the matrix is stored at sommet[x + y*taille].
//
// Changes from the previous version:
//  * uses __syncthreads() (the unprefixed syncthreads() alias does not exist
//    in current toolkits);
//  * every thread of the block reaches every barrier: barriers used to sit
//    inside a bounds branch, which is undefined when taille is not a multiple
//    of the tile side;
//  * bounds are checked per coordinate (the flat-index test let columns wrap
//    into the next row); cells outside the matrix are padded with a large
//    value that can never shorten a path.
// NOTE(review): as before, inside one pivot iteration a thread may read a
// cell of the pivot row/column that its owner is allowed to rewrite; that
// owner only rewrites it when a diagonal entry is negative - confirm inputs
// keep a non-negative diagonal.
__global__ void floydSharedPhase_1(int* sommet,int* adjacence,int taille, int k){
	extern __shared__  int blockCourant[];
	// Padding value; two of them can be added without overflowing an int.
	const int HORS_MATRICE = 0x3FFFFFFF;

	const int cote = blockDim.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;
	const int ind1DLoc = tx + ty*cote;
	const int gx = k*cote + tx;
	const int gy = k*cote + ty;
	const bool dansMatrice = (gx < taille) && (gy < taille);
	const int ind1DGlo = gx + gy*taille;

	blockCourant[ind1DLoc] = dansMatrice ? sommet[ind1DGlo] : HORS_MATRICE;
	__syncthreads();

	int dernierPivot = -1;   // last pivot that improved this cell, if any
	for(int i = 0; i < cote; i++){
		// distance(y -> pivot i) + distance(pivot i -> x), both inside the tile
		const int viaPivot = blockCourant[tx + i*cote] + blockCourant[i + ty*cote];
		if(dansMatrice && viaPivot < blockCourant[ind1DLoc]){
			blockCourant[ind1DLoc] = viaPivot;
			dernierPivot = k*cote + i;
		}
		__syncthreads();
	}

	if(dansMatrice){
		sommet[ind1DGlo] = blockCourant[ind1DLoc];
		if(dernierPivot >= 0)
			adjacence[ind1DGlo] = dernierPivot;
	}
}
// Phase 2 of the tiled Floyd-Warshall: relaxes the tiles that share a row or
// a column with the k-th diagonal tile, using that (already relaxed) tile.
// Launch: a 1D grid with an even number of square blocks
// (blockDim.x == blockDim.y = tile side). The first gridDim.x/2 blocks handle
// the tiles of tile-row k, the remaining ones the tiles of tile-column k; the
// block that maps to the diagonal tile itself exits at once.
// Dynamic shared memory: at least 2*blockDim.x*blockDim.y*sizeof(int) bytes
// (own tile, then the diagonal tile).
//
// Changes from the previous version:
//  * uses __syncthreads() instead of the removed syncthreads() alias;
//  * a barrier closes every pivot iteration: threads read cells of their tile
//    that other threads update, and without it that was a shared-memory data
//    race that could make `adjacence` vary between runs;
//  * every thread of a working block reaches every barrier, and bounds are
//    checked per coordinate for both the own tile and the diagonal tile;
//    cells outside the matrix are padded with a value that never shortens a
//    path.
__global__ void floydSharedPhase_2(int* sommet,int* adjacence,int taille,int k){
	extern __shared__  int memPartagee[];
	// Padding value; two of them can be added without overflowing an int.
	const int HORS_MATRICE = 0x3FFFFFFF;

	const int cote = blockDim.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;
	const int ind1DLoc = tx + ty*cote;
	const int offset = blockDim.x*blockDim.y;

	const int bloc = blockIdx.x;
	const int moitie = gridDim.x/2;
	const bool estLigne = bloc < moitie;
	const int id = estLigne ? bloc : bloc - moitie;   // tile index along the row / column
	if(id == k)
		return;   // uniform for the whole block, so no barrier is skipped by a subset

	// Global coordinates of this thread's cell.
	const int gx = estLigne ? (id*cote + tx) : (k*cote + tx);
	const int gy = estLigne ? (k*cote + ty)  : (id*cote + ty);
	const bool dansMatrice = (gx < taille) && (gy < taille);
	const int ind1DGlo = gx + gy*taille;

	// Coordinates of the matching cell of the diagonal tile.
	const int px = k*cote + tx;
	const int py = k*cote + ty;

	int* tuile = memPartagee;            // tile being relaxed
	int* pivot = memPartagee + offset;   // k-th diagonal tile
	tuile[ind1DLoc] = dansMatrice ? sommet[ind1DGlo] : HORS_MATRICE;
	pivot[ind1DLoc] = (px < taille && py < taille) ? sommet[px + py*taille] : HORS_MATRICE;
	__syncthreads();

	int dernierPivot = -1;   // last pivot that improved this cell, if any
	for(int i = 0; i < cote; i++){
		// Row tile:    pivot(y -> i) + tile(i -> x)
		// Column tile: tile(y -> i)  + pivot(i -> x)
		const int candidat = estLigne
			? tuile[tx + i*cote] + pivot[i + ty*cote]
			: tuile[i + ty*cote] + pivot[tx + i*cote];
		if(dansMatrice && tuile[ind1DLoc] > candidat){
			tuile[ind1DLoc] = candidat;
			dernierPivot = k*cote + i;
		}
		__syncthreads();
	}

	if(dansMatrice){
		sommet[ind1DGlo] = tuile[ind1DLoc];
		if(dernierPivot >= 0)
			adjacence[ind1DGlo] = dernierPivot;
	}
}
// Phase 3 of the tiled Floyd-Warshall: relaxes every tile that is neither in
// tile-row k nor in tile-column k, using the tile of its own row located in
// tile-column k and the tile of its own column located in tile-row k.
// Launch: a 2D grid of square blocks (blockDim.x == blockDim.y = tile side)
// covering the matrix; blocks with blockIdx.x == k or blockIdx.y == k exit at
// once.
// Dynamic shared memory: at least 2*blockDim.x*blockDim.y*sizeof(int) bytes
// (the running value of each cell is now kept in a register; a launch that
// still provides three tiles' worth remains valid).
//
// Changes from the previous version:
//  * uses __syncthreads() instead of the removed syncthreads() alias, and the
//    only barrier is reached by every thread of a working block;
//  * bounds are checked per coordinate for the cell and for both tile loads;
//    cells outside the matrix are padded with a value that never shortens a
//    path;
//  * the trailing barrier was removed: each thread only writes its own cell.
__global__ void floydSharedPhase_3(int* sommet,int* adjacence,int taille,int k)
{
	extern __shared__  int memPartagee[];
	// Padding value; two of them can be added without overflowing an int.
	const int HORS_MATRICE = 0x3FFFFFFF;

	if(blockIdx.x == k || blockIdx.y == k)
		return;   // uniform for the whole block

	const int cote = blockDim.x;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;
	const int ind1DLoc = tx + ty*cote;
	const int offset = blockDim.x*blockDim.y;

	const int gx = blockIdx.x*blockDim.x + tx;
	const int gy = blockIdx.y*blockDim.y + ty;
	const int px = k*cote + tx;   // column inside tile-column k
	const int py = k*cote + ty;   // row inside tile-row k
	const bool dansMatrice = (gx < taille) && (gy < taille);
	const int ind1DGlo = gx + gy*taille;

	int* tuileColonne = memPartagee;            // cells (px, gy): same rows, pivot columns
	int* tuileLigne   = memPartagee + offset;   // cells (gx, py): pivot rows, same columns
	tuileColonne[ind1DLoc] = (px < taille && gy < taille) ? sommet[px + gy*taille] : HORS_MATRICE;
	tuileLigne[ind1DLoc]   = (gx < taille && py < taille) ? sommet[gx + py*taille] : HORS_MATRICE;
	__syncthreads();

	if(!dansMatrice)
		return;   // safe: no barrier follows

	int meilleur = sommet[ind1DGlo];
	int dernierPivot = -1;   // last pivot that improved this cell, if any
	for(int i = 0; i < cote; i++){
		// distance(y -> pivot i) + distance(pivot i -> x)
		const int candidat = tuileColonne[i + ty*cote] + tuileLigne[tx + i*cote];
		if(meilleur > candidat){
			meilleur = candidat;
			dernierPivot = k*cote + i;
		}
	}

	sommet[ind1DGlo] = meilleur;
	if(dernierPivot >= 0)
		adjacence[ind1DGlo] = dernierPivot;
}
 
// ----------------- Verification helper -------------------------------------------
// Compares two pairs of tX*tX matrices cell by cell (tab11 with tab12, tab21
// with tab22) and returns the total number of differing cells; a position
// that differs in both pairs counts twice. Advances the progress bar once per
// position.
int verif(int* tab11, int* tab12,int* tab21, int* tab22, int tX){
	const int nbCases = tX*tX;
	const long double max = nbCases;
	const long double palier = nbCases/10;
	long double progression = 0;
	long double valCourant = 0;

	int err = 0;
	for(int i = 0; i < nbCases; ++i){
		err += (tab11[i] != tab12[i]) ? 1 : 0;
		err += (tab21[i] != tab22[i]) ? 1 : 0;
		progression++;
		barreChargement(palier,progression,max,&valCourant);
	}
	return err;
}
 
// Benchmark driver: for matrix sides 512, 1024, ... 8192, builds a random
// graph and runs the shared-memory and the global-memory GPU variants on
// private copies of it, with 32x32-thread blocks.
//
// Changes from the previous version:
//  * the six host matrices allocated for each size are released at the end of
//    the iteration (they used to leak, about 1.5 GB for the last size alone);
//  * the redundant first srand() call was dropped (the seed in effect was
//    already 25);
//  * the variable used only by the disabled verification moved into it.
int main()
{
	srand(25);   // fixed seed: the same graphs on every run
	int tX = 512;
	int nthreads = 32;

	while(tX <= 8192){
		cout <<endl<< "  --- DIMENSION = " << tX <<" --- Threads par blocks = "<< nthreads << " ------" <<endl ;
		const size_t nbCases = (size_t)tX*tX;
		int* sommet = new int[nbCases];
		int* adjacence = new int[nbCases];
		int* sommet2 = new int[nbCases];
		int* adjacence2 = new int[nbCases];
		int* sommet3 = new int[nbCases];
		int* adjacence3 = new int[nbCases];

		// Same input for every variant.
		remplirTableau(sommet,adjacence,tX);
		for(size_t i = 0; i < nbCases; i++){
			sommet2[i] = sommet[i];
			adjacence2[i] = adjacence[i];
			sommet3[i] = sommet[i];
			adjacence3[i] = adjacence[i];
		}

		floydParShared(sommet2,adjacence2,tX,nthreads,0);
		floydPar( sommet3, adjacence3,tX,nthreads);

		/* Disabled check against the CPU reference and speed-up report.
		   NOTE(review): the first verif call compares adjacence with itself.
		int erreurs;
		floydSequentiel( sommet, adjacence,tX);
		cout << "   Check: Shared      |";
		erreurs = verif(sommet,sommet2,adjacence,adjacence,tX) ;
		cout << " " << erreurs << " erreurs" << endl;
		cout << "   Check: Global      |" ;
		erreurs = verif(sommet,sommet3,adjacence,adjacence3,tX) ;
		cout << " " << erreurs << " erreurs" << endl;
		cout << " --- > Speed-Up CPU/GPU (memoire Globale)  = " << seq/global << endl;
		cout << " --- > Speed-Up CPU/GPU (memoire Partagee) = " << seq/shared << endl;
		cout << " --- > Speed-Up GPU/GPU (Globale Partagee) =  " << global/shared << endl;*/

		delete[] sommet;
		delete[] adjacence;
		delete[] sommet2;
		delete[] adjacence2;
		delete[] sommet3;
		delete[] adjacence3;

		tX = tX*2;
	}
	return 0;
}

La différence est vraiment flagrante : 67 s sous Windows pour trouver les plus courts chemins, contre 15 s sous Linux.

J'utilise Fedora 16 avec les derniers pilotes NVIDIA (295.41 de tête).
Côté Windows, j'utilise Windows 7 Ultimate avec les pilotes 296.10.

Si quelqu'un a une idée concernant cette différence de perfs, je suis preneur.

**gbdivers** · 26/04/2012, 21h40

Bonjour et bienvenue sur le forum

J'avais déjà constaté ça mais, de mémoire, la différence était moindre. Je pense que le problème ne vient pas de CUDA ou de nvcc en particulier, mais plus probablement d'une plus grande lourdeur de Windows (troll interdit).
As-tu fait du profiling pour voir l'origine du problème ?

Différence de perfs énorme entre Linux et Windows : où est le problème ?

CUDA

Discussions similaires

Partager

Partager