@KT:Listing 3
@LI:$$donumber
#define __CL_ENABLE_EXCEPTIONS
#include "convolve.hpp"
#include "timer.hpp"

#include "CL/cl.hpp"
#include <limits> // uchar max, min
#include <assert.h>
#include <cstring> // strlen (kernel source length)
#include <iostream>

/**
 * The OpenCL kernel for image convolution.
 *
 * Each work-item computes one output pixel: the weighted sum of the
 * convKernelWidth x convKernelHeight input window anchored at (x, y).
 * The global range is rounded up to a multiple of the work-group size,
 * so work-items outside the output image return immediately.
 */
const char* kernelSource = "\
__kernel void convolveKernel(\
        global uchar *in,\
        uint inWidth,\
        uint inHeight,\
        global uint *out,\
        uint outWidth,\
        uint outHeight,\
        global float *convKernel,\
        uint convKernelWidth,\
        uint convKernelHeight)\
{\
    size_t x = get_global_id(0);\
    size_t y = get_global_id(1);\
    \
    /* Kill unneeded threads */\
    if(x >= outWidth || y >= outHeight)\
    {\
        return;\
    }\
    \
    float convolutionSum = 0.0f;\
    for(size_t ky = 0; ky < convKernelHeight; ++ky)\
    {\
        for(size_t kx = 0; kx < convKernelWidth; ++kx)\
        {\
            convolutionSum += (float) in[(y + ky) * inWidth + (x + kx)]\
                            * convKernel[ky * convKernelWidth + kx];\
        }\
    }\
    /* clamp() needs all-float arguments: clamp(float, int, int) is\
       ambiguous in OpenCL C and rejected by some compilers. */\
    out[y * outWidth + x] = (uint) clamp(convolutionSum, 0.0f, 255.0f);\
}";

/**
 * Convolve a grayscale image with a convolution kernel on the GPU using OpenCL.
 *
 * Performs a "valid" convolution: the output shrinks by (kernel size - 1)
 * in each dimension, so the kernel window never reads outside the input.
 *
 * @param in          Grayscale input image (8-bit pixels).
 * @param convKernel  Convolution kernel (float weights).
 * @return Newly allocated output image of size
 *         (in.width - convKernel.width + 1) x (in.height - convKernel.height + 1);
 *         the caller owns out.data.
 *
 * NOTE(review): __CL_ENABLE_EXCEPTIONS is defined, so cl:: calls may throw
 * cl::Error; out.data would leak in that case (acceptable for a listing).
 */
grayImage convolveGPU(grayImage in, convolutionKernel convKernel)
{
    grayImage out;
    out.width = in.width - (convKernel.width - 1);
    out.height = in.height - (convKernel.height - 1);
    out.data = new uchar[out.height * out.width];

    // Platforms
    std::vector< cl::Platform > platforms;
    cl::Platform::get(&platforms);
    assert(platforms.size() > 0);

    // Devices: use the first GPU device of the first platform.
    std::vector<cl::Device> devices;
    platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    assert(devices.size() > 0);
    assert(devices[0].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU);

    // Context
    cl::Context context(devices);

    // Create GPU buffers
    cl::Buffer inGPU(context, CL_MEM_READ_ONLY, in.width * in.height * sizeof(uchar));
    cl::Buffer convKernelGPU(context, CL_MEM_READ_ONLY, convKernel.width * convKernel.height * sizeof(float));
    cl::Buffer outGPU(context, CL_MEM_WRITE_ONLY, out.width * out.height * sizeof(uint));

    // Command queue. Created without the out-of-order flag, so enqueued
    // commands execute in FIFO order: the non-blocking upload below is
    // complete before any later command in this queue runs.
    cl::CommandQueue queue(context, devices[0], 0);

    // Upload in.data to inGPU (non-blocking; ordering guaranteed by the queue)
    queue.enqueueWriteBuffer(
        inGPU,
        false, // FIFO
        0,
        in.width * in.height * sizeof(uchar),
        in.data);

    // Upload kernel.data to convKernelGPU
    queue.enqueueWriteBuffer(
        convKernelGPU,
        true, // Blocking for correct timing
        0,
        convKernel.width * convKernel.height * sizeof(float),
        convKernel.data);

    // Build the program from the embedded kernel source.
    cl::Program::Sources source(1, std::make_pair(kernelSource, strlen(kernelSource)));

    cl::Program program(context, source);
    program.build(devices);

    // Ranges: round the global size up to a multiple of the work-group
    // size; the kernel itself discards the excess work-items.
    size_t localWidth = 16;
    size_t localHeight = 16;

    cl::NDRange localRange(localWidth, localHeight);
    cl::NDRange globalRange(((out.width-1)/localWidth+1) * localWidth, ((out.height-1)/localHeight+1) * localHeight);

    // Run kernel
    cl::Kernel kernel(program, "convolveKernel");
    cl::KernelFunctor func = kernel.bind(queue, globalRange, localRange);

    cl::Event event = func(inGPU, in.width, in.height, outGPU, out.width, out.height, convKernelGPU, convKernel.width, convKernel.height);
    event.wait();

    // Download result into a temporary uint buffer.
    // std::vector instead of raw new[]: the original paired new[] with a
    // scalar delete (undefined behavior) and leaked on exceptions; RAII
    // fixes both.
    std::vector<uint> outTemp(out.width * out.height);
    queue.enqueueReadBuffer(
        outGPU,
        true, // blocking read
        0,
        out.width * out.height * sizeof(uint),
        &outTemp[0]);

    // Narrow the uint results (already clamped to 0..255 by the kernel)
    // into the uchar output image.
    for(size_t i = 0; i < out.width * out.height; ++i)
    {
        out.data[i] = (uchar) outTemp[i];
    }

    return out;
}
@KE:
