@KT:Listing 3
@LI:$$donumber
#define __CL_ENABLE_EXCEPTIONS
#include "convolve.hpp"
#include "timer.hpp"

#include "CL/cl.hpp"
#include <limits> // uchar max, min
#include <assert.h>
#include <cstring> // strlen (kernel source length)
#include <iostream>

/**
 * The OpenCL kernel for image convolution.
 *
 * Each work-item computes one output pixel: the weighted sum of the
 * convKernelWidth x convKernelHeight input window anchored at (x, y).
 * The global range is rounded up to a multiple of the work-group size,
 * so work-items outside the output image return immediately.
 */
const char* kernelSource = "\
__kernel void convolveKernel(\
        global uchar *in,\
        uint inWidth,\
        uint inHeight,\
        global uint *out,\
        uint outWidth,\
        uint outHeight,\
        global float *convKernel,\
        uint convKernelWidth,\
        uint convKernelHeight)\
{\
    size_t x = get_global_id(0);\
    size_t y = get_global_id(1);\
    \
    /* Kill unneeded threads */\
    if(x >= outWidth || y >= outHeight)\
    {\
        return;\
    }\
    \
    float convolutionSum = 0.0f;\
    for(size_t ky = 0; ky < convKernelHeight; ++ky)\
    {\
        for(size_t kx = 0; kx < convKernelWidth; ++kx)\
        {\
            convolutionSum += (float) in[(y + ky) * inWidth + (x + kx)]\
                            * convKernel[ky * convKernelWidth + kx];\
        }\
    }\
    /* clamp() needs all-float arguments: clamp(float, int, int) is\
       ambiguous in OpenCL C and rejected by some compilers. */\
    out[y * outWidth + x] = (uint) clamp(convolutionSum, 0.0f, 255.0f);\
}";

/**
 * Convolve a grayscale image with a convolution kernel on the GPU using OpenCL.
 *
 * Performs a "valid" convolution: the output shrinks by (kernel size - 1)
 * in each dimension, so the kernel window never reads outside the input.
 *
 * @param in          Grayscale input image (8-bit pixels).
 * @param convKernel  Convolution kernel (float weights).
 * @return Newly allocated output image of size
 *         (in.width - convKernel.width + 1) x (in.height - convKernel.height + 1);
 *         the caller owns out.data.
 *
 * NOTE(review): __CL_ENABLE_EXCEPTIONS is defined, so cl:: calls may throw
 * cl::Error; out.data would leak in that case (acceptable for a listing).
 */
grayImage convolveGPU(grayImage in, convolutionKernel convKernel)
{
    grayImage out;
    out.width = in.width - (convKernel.width - 1);
    out.height = in.height - (convKernel.height - 1);
    out.data = new uchar[out.height * out.width];

    // Platforms
    std::vector< cl::Platform > platforms;
    cl::Platform::get(&platforms);
    assert(platforms.size() > 0);

    // Devices: use the first GPU device of the first platform.
    std::vector<cl::Device> devices;
    platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    assert(devices.size() > 0);
    assert(devices[0].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU);

    // Context
    cl::Context context(devices);

    // Create GPU buffers
    cl::Buffer inGPU(context, CL_MEM_READ_ONLY, in.width * in.height * sizeof(uchar));
    cl::Buffer convKernelGPU(context, CL_MEM_READ_ONLY, convKernel.width * convKernel.height * sizeof(float));
    cl::Buffer outGPU(context, CL_MEM_WRITE_ONLY, out.width * out.height * sizeof(uint));

    // Command queue. Created without the out-of-order flag, so enqueued
    // commands execute in FIFO order: the non-blocking upload below is
    // complete before any later command in this queue runs.
    cl::CommandQueue queue(context, devices[0], 0);

    // Upload in.data to inGPU (non-blocking; ordering guaranteed by the queue)
    queue.enqueueWriteBuffer(
        inGPU,
        false, // FIFO
        0,
        in.width * in.height * sizeof(uchar),
        in.data);

    // Upload kernel.data to convKernelGPU
    queue.enqueueWriteBuffer(
        convKernelGPU,
        true, // Blocking for correct timing
        0,
        convKernel.width * convKernel.height * sizeof(float),
        convKernel.data);

    // Build the program from the embedded kernel source.
    cl::Program::Sources source(1, std::make_pair(kernelSource, strlen(kernelSource)));

    cl::Program program(context, source);
    program.build(devices);

    // Ranges: round the global size up to a multiple of the work-group
    // size; the kernel itself discards the excess work-items.
    size_t localWidth = 16;
    size_t localHeight = 16;

    cl::NDRange localRange(localWidth, localHeight);
    cl::NDRange globalRange(((out.width-1)/localWidth+1) * localWidth, ((out.height-1)/localHeight+1) * localHeight);

    // Run kernel
    cl::Kernel kernel(program, "convolveKernel");
    cl::KernelFunctor func = kernel.bind(queue, globalRange, localRange);

    cl::Event event = func(inGPU, in.width, in.height, outGPU, out.width, out.height, convKernelGPU, convKernel.width, convKernel.height);
    event.wait();

    // Download result into a temporary uint buffer.
    // std::vector instead of raw new[]: the original paired new[] with a
    // scalar delete (undefined behavior) and leaked on exceptions; RAII
    // fixes both.
    std::vector<uint> outTemp(out.width * out.height);
    queue.enqueueReadBuffer(
        outGPU,
        true, // blocking read
        0,
        out.width * out.height * sizeof(uint),
        &outTemp[0]);

    // Narrow the uint results (already clamped to 0..255 by the kernel)
    // into the uchar output image.
    for(size_t i = 0; i < out.width * out.height; ++i)
    {
        out.data[i] = (uchar) outTemp[i];
    }

    return out;
}
@KE:
