#include <vector>
#include <cuda.h>
#include <cutil.h>
#include <cstdio>


/**
 * Squares, in place, each of the first N elements of v (v[i] = v[i] * v[i]).
 *
 * Launch layout: any 1D grid of 1D blocks. Every thread starts at its global
 * index and advances by the total number of launched threads, so each element
 * in [0, N) is processed exactly once whatever the grid/block size is
 * (including a single block with fewer than N threads).
 * Shared memory: none.
 *
 * @param v  Array of at least N ints, dereferenced on the device.
 * @param N  Number of elements to process; N <= 0 makes the kernel a no-op.
 */
__global__ void testCUDA(int* v, int N)
{
    // Index arithmetic is done in 64 bits so that blockIdx.x * blockDim.x and
    // the running index cannot wrap around on very large launches.
    const long long stride = (long long) gridDim.x * blockDim.x;

    for (long long i = (long long) blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += stride)
    {
        v[i] *= v[i];
    }
}


/**
 * Small end-to-end CUDA check: fills a host array with 0..N-1, copies it to
 * the device, runs testCUDA on it, copies the result back and prints one
 * "index value" pair per line on stdout.
 *
 * Every CUDA call is checked. On failure a diagnostic is written to stderr,
 * nothing is printed on stdout, and the device buffer is still released.
 */
extern "C" void testVoidCu()
{
    const int N = 10;

    // Host-side data: v_host[i] = i.
    std::vector<int> v_host(N);
    for (int i = 0; i < N; ++i)
        v_host[i] = i;

    const size_t bytes = v_host.size() * sizeof(int);

    // Device-side allocation.
    int* v_dev = NULL;
    cudaError_t err = cudaMalloc((void**) &v_dev, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "testVoidCu: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // Host -> device copy.
    err = cudaMemcpy(v_dev, &v_host[0], bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Kernel launch: a single block of 16 threads covers all N = 10 elements.
        const int NBlocks   = 1;
        const int BlockSize = 16;

        testCUDA <<< NBlocks, BlockSize >>> (v_dev, N);

        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // Device -> host copy. This blocking copy also reports errors raised
    // while the kernel was executing.
    if (err == cudaSuccess)
        err = cudaMemcpy(&v_host[0], v_dev, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        // Print the results.
        for (int i = 0; i < N; ++i) {
            printf("%d %d\n", i, v_host[i]);
        }
    } else {
        fprintf(stderr, "testVoidCu: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Release the device buffer on both the success and the failure path.
    err = cudaFree(v_dev);
    if (err != cudaSuccess)
        fprintf(stderr, "testVoidCu: cudaFree failed: %s\n", cudaGetErrorString(err));
}