// (The forum page's line-number gutter, 1-192, was pasted here; it is not part of the source.)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t compVect(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, unsigned int sizeA, unsigned int sizeB);
// Compares every point of set A against every point of set B.
//
// For each pair (i, j) with 0 <= i < nbA and 0 <= j < nbB this writes
//   c[i + j * nbA] = 1  if lona[i] == lonb[j] and lata[i] == latb[j]
//   c[i + j * nbA] = 0  otherwise
//
// Parameters:
//   c           output flags, nbA * nbB bytes, indexed as i + j * nbA
//   lona, lata  coordinates of set A (nbA entries each)
//   lonb, latb  coordinates of set B (nbB entries each)
//   nbA, nbB    number of points in set A / set B
//
// Launch layout: any 2D grid of 2D blocks. The x dimension walks set A and
// the y dimension walks set B. Each dimension is covered by its own
// grid-stride loop, so every (i, j) cell is written exactly once no matter
// how small the launch is. (The previous version advanced i and j together
// in one loop, which only visited a diagonal of cells and could read
// lona[i] / lonb[j] out of bounds.)
// Uses no shared memory.
__global__ void compKernel(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, int nbA, int nbB)
{
    // First cell owned by this thread in each dimension.
    const int startA = threadIdx.x + blockIdx.x * blockDim.x;
    const int startB = threadIdx.y + blockIdx.y * blockDim.y;

    // Total number of launched threads along each dimension.
    const int strideA = gridDim.x * blockDim.x;
    const int strideB = gridDim.y * blockDim.y;

    for (int j = startB; j < nbB; j += strideB)
    {
        // Point j of set B is loop-invariant for the inner loop.
        const int bLon = lonb[j];
        const int bLat = latb[j];

        // size_t keeps the flat index from overflowing int for large inputs.
        const size_t rowBase = (size_t)j * (size_t)nbA;

        for (int i = startA; i < nbA; i += strideA)
        {
            c[rowBase + (size_t)i] = (lona[i] == bLon && lata[i] == bLat) ? 1 : 0;
        }
    }
}
// Demo driver: builds two small hard-coded point sets (A with 15 points,
// B with 7 points), asks compVect to flag every identical (A, B) pair, and
// prints the indices of the pairs that match.
//
// Returns 0 on success, 1 if the comparison or the device reset fails.
int main()
{
    // NOTE: the posted listing opened an `#ifdef TEST` at this point that was
    // never closed by an `#endif`, so the file could not compile. The guarded
    // data set is the only input this program has, so it is built
    // unconditionally.

    // Index 0 is set A, index 1 is set B.
    int nb[2] = { 15, 7 };
    int *lon[2];
    int *lat[2];
    for (int s = 0; s < 2; s++)
    {
        lon[s] = new int[nb[s]];
        lat[s] = new int[nb[s]];
    }

    for (int i = 0; i < nb[0]; i++)
    {
        lon[0][i] = i + 1;
        lat[0][i] = 2 * i;
    }
    for (int i = 0; i < nb[1]; i++)
    {
        lon[1][i] = 3 * i + 1;
        lat[1][i] = 4 * i;
    }
    // Make the last point of A identical to the last point of B so that pair
    // is guaranteed to match.
    lon[0][nb[0] - 1] = lon[1][nb[1] - 1];
    lat[0][nb[0] - 1] = lat[1][nb[1] - 1];

    // Dump both point sets, one set per line.
    for (int s = 0; s < 2; s++)
    {
        for (int i = 0; i < nb[s]; i++)
        {
            printf("L=%i l=%i ", lon[s][i], lat[s][i]);
        }
        printf("\n");
    }

    // One flag per (A, B) pair, laid out as a + b * nb[0]. Value-initialised
    // to zero so no cell is ever read uninitialised.
    const int pairCount = nb[0] * nb[1];
    unsigned char *c = new unsigned char[pairCount]();

    int exitCode = 0;

    // Compare all pairs on the GPU.
    cudaError_t cudaStatus = compVect(c, lon[0], lat[0], lon[1], lat[1], nb[0], nb[1]);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "compVect failed!\n");
        exitCode = 1;
    }
    else {
        for (int i = 0; i < pairCount; i++)
        {
            if (c[i] == 1)
            {
                // Invert the a + b * nb[0] layout to recover both indices.
                int y = i / nb[0];
                int x = i % nb[0];
                printf("A=%i B=%i\n", x, y);
            }
        }

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            exitCode = 1;
        }
    }

    // Release host buffers on every path (the original leaked all of them).
    delete[] c;
    for (int s = 0; s < 2; s++)
    {
        delete[] lon[s];
        delete[] lat[s];
    }
    return exitCode;
}
// Host wrapper that compares every point of set A with every point of set B
// on the GPU.
//
// Parameters:
//   c           host output buffer of sizeA * sizeB bytes; on success cell
//               a + b * sizeA holds the flag compKernel wrote for pair (a, b)
//   lona, lata  host arrays with the coordinates of set A (sizeA ints each)
//   lonb, latb  host arrays with the coordinates of set B (sizeB ints each)
//   sizeA/sizeB number of points in set A / set B
//
// Returns cudaSuccess, or the first CUDA error encountered. All device
// buffers are released before returning, on success and on failure.
cudaError_t compVect(unsigned char *c, const int *lona, const int *lata, const int *lonb, const int *latb, unsigned int sizeA, unsigned int sizeB)
{
    int *dev_lona = 0;
    int *dev_lonb = 0;
    int *dev_lata = 0;
    int *dev_latb = 0;
    unsigned char *dev_c = 0;
    cudaError_t cudaStatus;

    // Every initialised local is declared before the first `goto Error`:
    // C++ does not allow a jump to skip over an initialisation, and the
    // original `dim3 grid(2, 2);` in the middle of the function did exactly
    // that.
    // Byte counts are computed in size_t so sizeA * sizeB cannot wrap in
    // 32-bit arithmetic.
    const size_t bytesA = (size_t)sizeA * sizeof(int);
    const size_t bytesB = (size_t)sizeB * sizeof(int);
    const size_t bytesC = (size_t)sizeA * (size_t)sizeB * sizeof(unsigned char);

    // Launch geometry: 2x2 blocks of 2x2 threads (16 threads in total), the
    // same values the original passed. compKernel's loop is responsible for
    // reaching the cells beyond this footprint.
    dim3 grid(2, 2);
    dim3 block(2, 2);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers: one output matrix and four input vectors.
    cudaStatus = cudaMalloc((void**)&dev_c, bytesC);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_lona, bytesA);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_lata, bytesA);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_lonb, bytesB);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_latb, bytesB);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Clear the result matrix so a cell the kernel does not write reads back
    // as "no match" instead of whatever the fresh allocation contained.
    if (bytesC > 0) {
        cudaStatus = cudaMemset(dev_c, 0, bytesC);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemset failed!");
            goto Error;
        }
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_lona, lona, bytesA, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_lata, lata, bytesA, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_lonb, lonb, bytesB, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_latb, latb, bytesB, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the comparison kernel.
    compKernel<<<grid, block>>>(dev_c, dev_lona, dev_lata, dev_lonb, dev_latb, (int)sizeA, (int)sizeB);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "compKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it ran.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching compKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytesC, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well as on failure.
    cudaFree(dev_c);
    cudaFree(dev_lona);
    cudaFree(dev_lonb);
    cudaFree(dev_lata);
    cudaFree(dev_latb);
    return cudaStatus;
}
// (End of listing. The trailing "Partager" / "Share" text was a forum-page link, not source code.)