#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
/*
 * Initialise one cuRAND generator state per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks. Every launched thread writes
 * state[id], where id is its flat global index, and there is no bounds
 * check: `state` must hold at least gridDim.x * blockDim.x elements.
 *
 * state: array of generator states, indexed by flat global thread id.
 */
__global__ void setup_kernel(curandState *state)
{
    /* Flat global thread index. Uses blockDim.x rather than a hard-coded
       block size so the index stays correct for any 1-D launch. */
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    /* Each thread gets same seed, a different sequence
       number, no offset */
    curand_init(1234, id, 0, &state[id]);
}
/*
 * Advance each thread's generator by `n` draws and store the last value
 * drawn in result[id].
 *
 * Launch layout: 1-D grid of 1-D blocks. Every launched thread accesses
 * state[id] and result[id], where id is its flat global index, and there is
 * no bounds check: both arrays must hold at least gridDim.x * blockDim.x
 * elements.
 *
 * state:  per-thread generator states (read, advanced, written back).
 * n:      number of values each thread draws; for n <= 0 the kernel leaves
 *         both state[id] and result[id] unchanged.
 * result: per-thread output; receives the final 32-bit value drawn.
 */
__global__ void generate_kernel(curandState *state,
                                int n,
                                unsigned int *result)
{
    /* Flat global thread index. Uses blockDim.x rather than a hard-coded
       block size so the index stays correct for any 1-D launch. */
    int id = threadIdx.x + blockIdx.x * blockDim.x;

    /* Nothing to draw: the generator would not advance and no value would
       be produced, so there is nothing to store. */
    if (n <= 0) {
        return;
    }

    /* Work on a per-thread copy of the state instead of going through the
       `state` pointer on every draw. */
    curandState localState = state[id];
    unsigned int x = 0;

    /* Generate pseudo-random unsigned ints; only the last one is kept. */
    for (int i = 0; i < n; i++) {
        x = curand(&localState);
    }

    /* Copy the advanced state back so the next launch continues the
       sequence rather than repeating it. */
    state[id] = localState;

    /* Store the result once, after the loop: earlier values would have
       been overwritten anyway. */
    result[id] = x;
}
/* Abort with a readable message when a CUDA runtime call reports failure.
   `what` names the operation that was attempted. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: sets up one generator state per thread for a 64 x 64 launch,
 * runs generate_kernel 50 times, copies the per-thread results back and
 * prints the running sum of those results, one line per element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if an allocation, a CUDA
 * runtime call or a kernel launch fails.
 */
int main(){
    const int blocks = 64;
    const int threadsPerBlock = 64;
    const int threadCount = blocks * threadsPerBlock;
    const int sampleCount = 10000;
    const int launches = 50;
    const size_t resultBytes = threadCount * sizeof(unsigned int);

    curandState *devStates = NULL;
    unsigned int *devResults = NULL;
    unsigned int *hostResults = NULL;
    unsigned int total;
    int i;

    /* Host-side buffer for the per-thread results. */
    hostResults = (unsigned int *)calloc(threadCount, sizeof(unsigned int));
    if (hostResults == NULL) {
        fprintf(stderr, "Host allocation of %d results failed\n", threadCount);
        return EXIT_FAILURE;
    }

    /* Device-side generator states and results, one element per thread. */
    checkCuda(cudaMalloc((void **)&devStates,
                         threadCount * sizeof(curandState)),
              "cudaMalloc(devStates)");
    checkCuda(cudaMalloc((void **)&devResults, resultBytes),
              "cudaMalloc(devResults)");
    checkCuda(cudaMemset(devResults, 0, resultBytes),
              "cudaMemset(devResults)");

    /* A kernel launch returns no status; cudaGetLastError() reports
       launch-configuration failures. */
    setup_kernel<<<blocks, threadsPerBlock>>>(devStates);
    checkCuda(cudaGetLastError(), "setup_kernel launch");

    for(i = 0; i < launches; i++) {
        generate_kernel<<<blocks, threadsPerBlock>>>(devStates, sampleCount,
                                                     devResults);
        checkCuda(cudaGetLastError(), "generate_kernel launch");
    }

    /* Surface any fault raised while the kernels were executing before
       the results are read back. */
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(hostResults, devResults, resultBytes,
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy(devResults -> hostResults)");

    /* Print the running sum after each element. `total` is unsigned and
       wraps modulo 2^32, so it is printed with %u rather than %i. */
    total = 0;
    for(i = 0; i < threadCount; i++) {
        total += hostResults[i];
        printf("total : %u\n", total);
    }

    /* Release both device buffers (the generator states were previously
       leaked) and the host buffer. */
    checkCuda(cudaFree(devStates), "cudaFree(devStates)");
    checkCuda(cudaFree(devResults), "cudaFree(devResults)");
    free(hostResults);
    return 0;
}
/* "Partager" ("Share"): web-page button text captured with the listing; not part of the program. */