#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
/* Evaluate an OpenCL call once and abort with a diagnostic message if it
 * did not return CL_SUCCESS.  The do/while(0) wrapper makes the macro
 * behave as a single statement (safe inside unbraced if/else). */
#define CL_CHECK(_expr) \
do { cl_int _err = (_expr); \
     if (_err == CL_SUCCESS) break; \
     /* #_expr stringizes the failing call for the error report */ \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
     abort(); \
} while (0)
/* OpenCL C source for a naive square-matrix multiply: each work-item
 * computes one element C[row][col] of C = A * B, where all matrices are
 * width x width in row-major order.
 *
 * Fix: the inner product must read column 'col' of B, i.e.
 * inputB[i*width + col].  The previous inputB[col*width + i] walked ROW
 * 'col' of B and therefore computed A * transpose(B); the host's symmetric
 * test matrices (B[i][j] = i + j) masked the bug. */
const char *programSource[] = {
    "__kernel void simpleMultiply(__global float* outputC,          \n",
    "                             __global float* inputA,           \n",
    "                             __global float* inputB,           \n",
    "                             int width) {                      \n",
    "    int row = get_global_id(1);                                \n",
    "    int col = get_global_id(0);                                \n",
    "    float sum = 0.0f;                                          \n",
    "    for (int i = 0; i < width; i++)                            \n",
    "        sum += inputA[row*width + i] * inputB[i*width + col];  \n",
    "    outputC[row*width + col] = sum;                            \n",
    "}                                                              \n"
};
/* Multiply two tailleMatrices x tailleMatrices matrices on the first OpenCL
 * device of the first platform, print the kernel execution time (event
 * profiling) and the resulting matrix.  Returns 0 on success, 1 on a host
 * setup failure; OpenCL API failures abort via CL_CHECK. */
int main(void) {
    /* --- Host matrices: A[i][j] = B[i][j] = i + j -------------------- */
    const int tailleMatrices = 4;
    const int HA = tailleMatrices;          /* rows of A          */
    const int WA = tailleMatrices;          /* cols of A          */
    const int HB = tailleMatrices;          /* rows of B          */
    const int WB = tailleMatrices;          /* cols of B          */
    /* size_t, not int: byte counts passed to clCreateBuffer & friends */
    const size_t datasize = (size_t)tailleMatrices * tailleMatrices * sizeof(float);

    float *MatriceA      = (float *)malloc(datasize);
    float *MatriceB      = (float *)malloc(datasize);
    float *MatriceResult = (float *)malloc(datasize);
    if (!MatriceA || !MatriceB || !MatriceResult) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < tailleMatrices; i++) {
        for (int j = 0; j < tailleMatrices; j++) {
            MatriceA[i * tailleMatrices + j]      = (float)(i + j);
            MatriceB[i * tailleMatrices + j]      = (float)(i + j);
            MatriceResult[i * tailleMatrices + j] = 0.0f;
        }
    }

    /* Profiling timestamps are nanoseconds in cl_ulong (integers, not
     * floats — the old 0.0f initializers were misleading). */
    cl_ulong time_start = 0, time_end = 0;
    cl_event event;
    cl_int status;

    /* --- STEP 1: platform ------------------------------------------- */
    cl_uint numPlatforms = 0;
    CL_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
    if (numPlatforms == 0) {
        fprintf(stderr, "No OpenCL platform found\n");
        return 1;
    }
    cl_platform_id *platforms =
        (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
    if (!platforms) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    CL_CHECK(clGetPlatformIDs(numPlatforms, platforms, NULL));

    /* --- STEP 2: devices -------------------------------------------- */
    cl_uint numDevices = 0;
    CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices));
    cl_device_id *devices =
        (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
    if (numDevices == 0 || !devices) {
        fprintf(stderr, "No OpenCL device found\n");
        return 1;
    }
    CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL));

    /* --- STEP 3: context -------------------------------------------- */
    cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
    CL_CHECK(status);

    /* --- STEP 4: command queue (profiling enabled for timing) -------- */
    cl_command_queue cmdQueue =
        clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &status);
    CL_CHECK(status);

    /* --- STEP 5: device buffers -------------------------------------- */
    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    (size_t)WA * HA * sizeof(float), NULL, &status);
    CL_CHECK(status);
    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    (size_t)WB * HB * sizeof(float), NULL, &status);
    CL_CHECK(status);
    cl_mem bufferResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status);
    CL_CHECK(status);

    /* --- STEP 6: blocking writes of the inputs ----------------------- */
    CL_CHECK(clEnqueueWriteBuffer(cmdQueue, bufferA, CL_TRUE, 0,
                                  (size_t)WA * HA * sizeof(float), MatriceA, 0, NULL, NULL));
    CL_CHECK(clEnqueueWriteBuffer(cmdQueue, bufferB, CL_TRUE, 0,
                                  (size_t)WB * HB * sizeof(float), MatriceB, 0, NULL, NULL));

    /* --- STEP 7: build the program ------------------------------------ */
    cl_program program = clCreateProgramWithSource(
        context, sizeof(programSource) / sizeof(*programSource),
        programSource, NULL, &status);
    CL_CHECK(status);
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
    /* Dump each device's build log (errors or warnings) before bailing. */
    for (cl_uint i = 0; i < numDevices; i++) {
        size_t len = 0;
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
        if (len > 2) {
            char *log = (char *)malloc(len);
            if (log) {
                clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, len, log, NULL);
                fprintf(stderr, "[*] Build log, device %u: %s\n", i, log);
                free(log);   /* was leaked in the original */
            }
        }
    }
    CL_CHECK(status);   /* abort if the build failed */

    /* --- STEP 8: kernel ---------------------------------------------- */
    cl_kernel kernel = clCreateKernel(program, "simpleMultiply", &status);
    CL_CHECK(status);

    /* --- STEP 9: kernel arguments ------------------------------------ */
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferResult));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferA));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferB));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &tailleMatrices));

    /* --- STEP 10/11: launch a 2D NDRange, one work-item per element -- */
    size_t globalWorkSize[2] = { (size_t)WB, (size_t)HA };  /* (cols, rows) */
    CL_CHECK(clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL,
                                    globalWorkSize, NULL, 0, NULL, &event));

    /* Profiling info is only defined once the event has COMPLETED;
     * querying right after enqueue (as before) is invalid. */
    CL_CHECK(clWaitForEvents(1, &event));
    CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                                     sizeof(time_start), &time_start, NULL));
    CL_CHECK(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                                     sizeof(time_end), &time_end, NULL));
    printf("\nExecution time in milliseconds = %0.3f ms\n",
           (double)(time_end - time_start) / 1000000.0);

    /* --- STEP 12: read back and print the result --------------------- */
    CL_CHECK(clEnqueueReadBuffer(cmdQueue, bufferResult, CL_TRUE, 0,
                                 datasize, MatriceResult, 0, NULL, NULL));
    for (int i = 0; i < tailleMatrices; i++) {
        for (int j = 0; j < tailleMatrices; j++) {
            printf("%f -", MatriceResult[i * tailleMatrices + j]);
        }
        printf("\n");
    }

    /* --- STEP 13: release every OpenCL resource (the original leaked
     * the event, bufferResult and MatriceResult) ---------------------- */
    clReleaseEvent(event);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(bufferA);
    clReleaseMemObject(bufferB);
    clReleaseMemObject(bufferResult);
    clReleaseCommandQueue(cmdQueue);
    clReleaseContext(context);

    /* Host resources. */
    free(MatriceA);
    free(MatriceB);
    free(MatriceResult);
    free(platforms);
    free(devices);
    return 0;
}