1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
|
/*
 * Matrix product kernel: Pd = m1 * m2, one output element per thread.
 *
 * Memory layout as indexed by this kernel (column-major):
 *   m1 element (i, k) is read from    m1[k * m + i]   -> m rows, n columns
 *   m2 element (k, j) is read from    m2[j * n + k]   -> n rows, p columns
 *   Pd element (i, j) is written to   Pd[j * m + i]   -> m rows, p columns
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension must cover the m rows
 * of the result and the y dimension its p columns; threads that fall outside
 * the m x p result do nothing. No shared memory is used.
 *
 * Parameters:
 *   m1, m2  device pointers to the input matrices (only read)
 *   Pd      device pointer to the output matrix (every in-range element is written)
 *   m, n, p matrix dimensions as described above
 */
__global__ void MatrixMulKernel2(double * m1, double * m2, double * Pd, int m, int n, int p)
{
    // Build the global indices in 64 bits so that the unsigned product
    // blockIdx * blockDim can neither wrap nor turn negative when narrowed.
    const long long row = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long col = (long long)blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= m || col >= p)
        return;

    // Past the guard: 0 <= row < m and 0 <= col < p, so the casts are safe.
    // All element offsets are computed as size_t because k * m + i and
    // j * n + k exceed INT_MAX for large matrices.
    const size_t i = (size_t)row;
    const size_t j = (size_t)col;
    const size_t rows = (size_t)m;
    const size_t m2ColumnStart = j * (size_t)n; // only dereferenced when n > 0

    double Pvaleur = 0;
    for (int k = 0; k < n; ++k)
        Pvaleur += m1[(size_t)k * rows + i] * m2[m2ColumnStart + (size_t)k];

    Pd[j * rows + i] = Pvaleur;
}
/*
 * Host driver: computes matriceResult = matrice1 * matrice2 on the GPU by
 * uploading both inputs, launching MatrixMulKernel2 with 16x16 thread blocks,
 * and downloading the result.
 *
 * Parameters:
 *   cDP            device properties; maxGridSize and maxThreadsPerBlock are
 *                  used to validate the launch configuration
 *   matrice1       host input, m * n doubles
 *   matrice2       host input, n * p doubles
 *   matriceResult  host output, m * p doubles
 *   m, n, p        matrix dimensions
 *
 * Returns:
 *   0                  success (also when the result is empty: m == 0 or p == 0)
 *   -1                 the required grid exceeds cDP.maxGridSize
 *   -2                 the block size exceeds cDP.maxThreadsPerBlock
 *   any other value    the cudaError_t of the first CUDA call that failed
 *                      (cudaErrorInvalidValue for a negative dimension)
 *
 * Device buffers are released on every path, including error paths.
 */
int MatrixMulOnDevice(cudaDeviceProp &cDP, double * matrice1, double * matrice2, double * matriceResult, int m, int n, int p)
{
    if (m < 0 || n < 0 || p < 0)
        return cudaErrorInvalidValue;
    if (m == 0 || p == 0)
        return 0; // empty result: nothing to compute, and a zero-sized grid cannot be launched

    // Validate the launch configuration before allocating or copying anything,
    // so a rejected configuration costs no device memory and no transfers.
    const int BLOCK_WIDTH = 16;
    const int gridX = (m / BLOCK_WIDTH) + ((m % BLOCK_WIDTH) == 0 ? 0 : 1);
    const int gridY = (p / BLOCK_WIDTH) + ((p % BLOCK_WIDTH) == 0 ? 0 : 1);
    // The limits themselves are valid values, hence > rather than >=.
    if (gridX > cDP.maxGridSize[0] || gridY > cDP.maxGridSize[1])
        return -1;
    if (BLOCK_WIDTH * BLOCK_WIDTH > cDP.maxThreadsPerBlock)
        return -2;

    // Compute the matrix sizes in bytes; size_t avoids overflowing int for
    // matrices of 2 GiB and more.
    const size_t size1 = sizeof(double) * (size_t)m * (size_t)n;
    const size_t size2 = sizeof(double) * (size_t)n * (size_t)p;
    const size_t size3 = sizeof(double) * (size_t)m * (size_t)p;

    // Start from NULL so the single cleanup section below can free
    // unconditionally, whichever step failed.
    double *cuda_matrix1 = NULL;
    double *cuda_matrix2 = NULL;
    double *cuda_matrixResult = NULL;

    // Each step runs only if every previous one succeeded; cudaStatus keeps
    // the first failure.
    cudaError_t cudaStatus = cudaMalloc((void**)&cuda_matrix1, size1);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void**)&cuda_matrix2, size2);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void**)&cuda_matrixResult, size3);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(cuda_matrix1, matrice1, size1, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(cuda_matrix2, matrice2, size2, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess)
    {
        dim3 dimGrid(gridX, gridY);
        dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
        MatrixMulKernel2<<<dimGrid, dimBlock>>>(cuda_matrix1, cuda_matrix2, cuda_matrixResult, m, n, p);
        // A launch returns nothing itself; configuration errors are reported here.
        cudaStatus = cudaGetLastError();
    }
    if (cudaStatus == cudaSuccess)
    {
        // Blocking copy issued after the kernel: its status is checked so a
        // failed computation or transfer is no longer reported as success.
        cudaStatus = cudaMemcpy(matriceResult, cuda_matrixResult, size3, cudaMemcpyDeviceToHost);
    }

    // Single cleanup path; cudaFree accepts the NULL pointers left by an
    // allocation that never happened.
    cudaFree(cuda_matrix1);
    cudaFree(cuda_matrix2);
    cudaFree(cuda_matrixResult);

    return cudaStatus; // cudaSuccess == 0
}
Partager