CUDA - Dev sur le GPU

**pepito3364** · 15/04/2014, 11h59

Bonjour,

Je cherche à utiliser la puissance des cartes graphiques pour faire du calcul parallèle.
Je commence donc avec une petite appli pour comparer la multiplication entre 2 matrices (n*p) et (p*n) entre une implémentation séquentielle sur le CPU et une parallèle sur le GPU.

Pour des petits jeux de données, ça marche bien, mais dès que je passe à des matrices de plus de 1000 par 1000, je me retrouve avec un écran noir et je suis obligé de redémarrer ma machine.

main.cpp

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
 
#include "stdafx.h"
#include "test.h"
 
#include <iostream>
#include <Windows.h>
 
#include <ppl.h>
 
using namespace std;
 
// Sequential CPU reference used to validate the GPU and multi-core results.
//
// For every i, j in [0, n), accumulates over k in [0, p) the terms built from
// matrice1[k * n + i] and matrice2[j * p + k], then stores the total in
// matriceR[j * n + i].
//
// NOTE(review): each term is the SUM of the two elements, not their product,
// although the function name says "multiplication". The GPU kernel and the
// parallel CPU version do the same, so the three must be changed together —
// confirm the intent before touching the operator.
void MultiplicationMatriceCPU(double *matrice1, double *matrice2,int n, int p, double *matriceR)
{
	for (int row = 0; row < n; ++row)
	{
		for (int col = 0; col < n; ++col)
		{
			// Start of the p consecutive matrice2 values used for this col.
			const double *droite = matrice2 + col * p;

			double somme = 0.0;
			for (int k = 0; k < p; ++k)
			{
				const double gauche = matrice1[k * n + row];
				somme += gauche + droite[k];
			}

			matriceR[col * n + row] = somme;
		}
	}
}
 
// Multi-core CPU variant of MultiplicationMatriceCPU.
//
// The index i over [0, n) is distributed with Concurrency::parallel_for; each
// invocation fills matriceR[j * n + i] for every j in [0, n), so no two
// invocations write to the same element.
//
// NOTE(review): like the sequential and GPU versions, each term is the SUM of
// matrice1[k * n + i] and matrice2[j * p + k], not their product — confirm
// the intent and change all three together.
void MultiplicationMatriceCPUParallel(double *matrice1, double *matrice2,int n, int p, double *matriceR)
{
	// Work item: computes all n results that share the same index `row`.
	auto traiterLigne = [&](int row)
	{
		for (int col = 0; col < n; ++col)
		{
			double somme = 0.0;
			for (int k = 0; k < p; ++k)
			{
				somme += matrice1[k * n + row] + matrice2[col * p + k];
			}
			matriceR[col * n + row] = somme;
		}
	};

	Concurrency::parallel_for(0, n, 1, traiterLigne);
}
 
// Prints `prompt` on its own line, then n lines of p values each, then an
// empty line, all on std::cout.
//
// Line i shows matrice[i], matrice[n + i], matrice[2 * n + i], ... and every
// value is followed by two spaces.
void Afifchage(double *matrice, int n, int p, string prompt)
{
	cout << prompt.c_str() << endl;

	int row = 0;
	while (row < n)
	{
		// Walk the line with a stride of n, starting at offset `row`.
		int position = row;
		for (int col = 0; col < p; ++col)
		{
			cout << matrice[position] << "  ";
			position += n;
		}
		cout << endl;
		++row;
	}

	cout << endl;
}
 
int _tmain(int argc, _TCHAR* argv[])
{
	int n = 10;
	int p = 10;
 
	double *matrice1 = new double[n * p];
	double *matrice2 = new double[n * p];
 
	double *matriceR1 = new double[n * n];
	double *matriceR2 = new double[n * n];
	double *matriceR3 = new double[n * n];
 
	for (int i = 0; i < n * p; ++i)
	{
		matrice1[i] = rand() % 4;
		matrice2[i] = rand() % 4;
	}
 
	//multiplication sur le GPU
	double t1 = GetTickCount();
	MultiplicationMatriceGPU(matrice1, matrice2, matriceR1, n, p);
	t1 = GetTickCount() - t1;
 
 
	//multiplication sur le CPU
	double t2 = GetTickCount();
	MultiplicationMatriceCPU(matrice1, matrice2, n, p, matriceR2);
	t2 = GetTickCount() - t2;
 
	//multiplication sur le CPU parallélisé
	double t3 = GetTickCount();
	MultiplicationMatriceCPUParallel(matrice1, matrice2, n, p, matriceR3);
	t3 = GetTickCount() - t3;		
 
	//Affichage des résultats
	int nbDiff = 0;
	for (int i = 0; i < n * n; ++i)
	{
		if (matriceR2[i] != matriceR3[i] || matriceR2[i] != matriceR1[i])
			++nbDiff;
	}
 
	cout<<"Resultats :"<<endl;
	if (nbDiff > 0)
	{
		cout<<"  Le code n'est pas correct. ("<<nbDiff<<" differences)"<<endl;
 
		if (n < 10)
		{
			Afifchage(matrice1, n, p, "matrice1:");
			Afifchage(matrice2, p, n, "matrice2:");
			Afifchage(matriceR1, n, n, "matriceR1 (GPU):");
			Afifchage(matriceR3, n, n, "matriceR2 (CPU):");
			Afifchage(matriceR3, n, n, "matriceR3 (CPU multicore):");
		}
	}
	else
	{
		cout<<"  - Les résultats sont corrects."<<endl;
		cout<<"  - Temps de traitement:"<<endl;
		cout<<"    * GPU: "<<t1<<endl;
		cout<<"    * CPU: "<<t2<<endl;
		cout<<"    * CPU (multicore): "<<t3<<endl;
	}
 
	delete [] matrice1;
	delete [] matrice2;
 
	delete [] matriceR1;
	delete [] matriceR2;
	delete [] matriceR3;
 
	return 1;
}

test.cu:

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
 
#include <cstdio>

#include <cuda.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
 
// Kernel: one thread per element (i, j) of the n * n result.
//
// Expected launch layout: a 2D grid of 2D blocks covering at least n threads
// along x (index i) and n threads along y (index j). Threads outside the
// n * n range return immediately, so the grid does not have to divide n
// evenly. No shared memory is used.
//
// Each thread accumulates over k in [0, p) the terms built from
// matrice1[k * n + i] and matrice2[j * p + k] and stores the total in
// matriceResult[j * n + i]. All three pointers must be device pointers.
//
// NOTE(review): each term is the SUM of the two elements, not their product.
// The CPU reference implementations do the same, so the results match, but
// the names say "multiplication" — confirm the intent and change them together.
__global__ 
	void MatrixMulKernel(double* matrice1, 
	double* matrice2, 
	double* matriceResult, 
	int n,
	int p)
{
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	const int j = blockIdx.y * blockDim.y + threadIdx.y;

	// Fixed: the guard used to be "i > n || j > n", which let index n through
	// and made those threads read and write one row/column out of bounds.
	if (i >= n || j >= n) 
		return;

	double P_val = 0.0;
	for (int k = 0; k < p; ++k)
	{
		P_val += matrice1[k * n + i] + matrice2[j * p + k];
	}

	matriceResult[j * n + i] = P_val;
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return true;

	fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Host entry point: copies matrice1 and matrice2 (n * p doubles each) to the
// GPU, runs MatrixMulKernel on an n * n grid of threads and copies the
// n * n doubles of the result back into matriceResult.
//
// All pointers are host pointers. On any CUDA error a message is printed on
// stderr, the device buffers are released and matriceResult is left
// untouched. Nothing is done when n <= 0.
void MultiplicationMatriceGPU(double *matrice1, double *matrice2, double *matriceResult, int n, int p)
{
	// Fixed: the block size used to drop to 1x1 whenever n or p was not a
	// multiple of 16 (e.g. n = 1000), producing n * n one-thread blocks.
	// The kernel now rejects out-of-range threads itself, so 16x16 blocks
	// with a rounded-up grid are valid for every n.
	const int BLOCK_SIZE = 16;

	if (n <= 0)
		return;

	// Sizes are computed in size_t so that large n * p products do not
	// overflow int before being multiplied by sizeof(double).
	const size_t inputBytes = sizeof(double) * size_t(n) * size_t(p);
	const size_t resultBytes = sizeof(double) * size_t(n) * size_t(n);

	// Device buffers; kept NULL until allocated so the cleanup below is
	// valid whichever step fails.
	double *cuda_matrix1 = NULL;
	double *cuda_matrix2 = NULL;
	double *cuda_matrixResult = NULL;

	bool ok = cudaSucceeded(cudaMalloc((void**)&cuda_matrix1, inputBytes), "cudaMalloc(matrice1)")
		&& cudaSucceeded(cudaMalloc((void**)&cuda_matrix2, inputBytes), "cudaMalloc(matrice2)")
		&& cudaSucceeded(cudaMalloc((void**)&cuda_matrixResult, resultBytes), "cudaMalloc(matriceResult)")
		&& cudaSucceeded(cudaMemcpy(cuda_matrix1, matrice1, inputBytes, cudaMemcpyHostToDevice), "copy of matrice1 to the GPU")
		&& cudaSucceeded(cudaMemcpy(cuda_matrix2, matrice2, inputBytes, cudaMemcpyHostToDevice), "copy of matrice2 to the GPU");

	if (ok)
	{
		dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
		dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x, (n + dimBlock.y - 1) / dimBlock.y);

		MatrixMulKernel<<<dimGrid, dimBlock>>>(cuda_matrix1, cuda_matrix2, cuda_matrixResult, n, p);

		// A launch reports configuration errors through cudaGetLastError()
		// and execution errors at the next synchronizing call.
		ok = cudaSucceeded(cudaGetLastError(), "kernel launch")
			&& cudaSucceeded(cudaDeviceSynchronize(), "kernel execution")
			&& cudaSucceeded(cudaMemcpy(matriceResult, cuda_matrixResult, resultBytes, cudaMemcpyDeviceToHost), "copy of the result to the host");
	}

	// cudaFree accepts a null pointer, so this is safe after a partial failure.
	cudaFree(cuda_matrix1);
	cudaFree(cuda_matrix2);
	cudaFree(cuda_matrixResult);
}

test.h

Code :

Sélectionner tout - Visualiser dans une fenêtre à part

// GPU entry point implemented in test.cu: copies matrix1 and matrix2
// (N * P doubles each) to the device, runs the CUDA kernel and copies the
// N * N doubles of the result back into matrixResult. All pointers are host
// pointers.
void MultiplicationMatriceGPU(double *matrix1, double *matrix2, double *matrixResult, int N, int P);

Voilà voilà tout mon code!

Côté configuration, je développe avec Visual Studio 2010 sous Windows 7, avec une GeForce GT 620 (1 Go de DRAM et 48 cœurs).

D'où provient mon problème d'écran noir ? Est-ce la mémoire du GPU qui explose, le nombre de threads, de blocs ?

Je débute tout juste, et je suis preneur de toute aide et conseil.

Merci beaucoup!!

CUDA - Dev sur le GPU

C++

Mode arborescent

Discussions similaires

Partager

Partager