#include <fstream>
#include <iostream>
#include <streambuf>
#include <string>
#include <vector>

#include <CL/cl2.hpp>

/// @brief Reads the entire contents of a file into a string.
/// @param src Path of the file to open (default text mode).
/// @return Every character read from the file. No open/read check is made,
///         so a file that cannot be opened yields an empty string.
std::string loadFileFromDisk(std::string src) {
	using CharIterator = std::istreambuf_iterator<char>;

	std::ifstream input(src);
	std::string contents;
	// A default-constructed istreambuf_iterator is the end-of-stream sentinel.
	contents.assign(CharIterator(input), CharIterator());
	return contents;
}

int main() {
	cl_platform_id platform;
	clGetPlatformIDs(1, &platform, nullptr);
	cl_device_id device;
	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
	cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
	cl_command_queue commandQueue = clCreateCommandQueueWithProperties(context, device, 0, nullptr);
	std::string kernelSourceCode = loadFileFromDisk("kernel.cl");

	const char* c_kernelSourceCode = kernelSourceCode.c_str();
	const size_t c_kernelSourceSize = kernelSourceCode.size();

	cl_program program = clCreateProgramWithSource(context, 1, &c_kernelSourceCode, &c_kernelSourceSize, nullptr);
	clBuildProgram(program, 0, nullptr, "-cl-std=CL2.0 -cl-mad-enable", nullptr, nullptr);
	cl_kernel kernel = clCreateKernel(program, "addArrays", nullptr);

	size_t arraySize = 512 * sizeof(int);
	cl_mem outBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, arraySize, nullptr, nullptr);
	cl_mem array1Buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize, nullptr, nullptr);
	cl_mem array2Buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, arraySize, nullptr, nullptr);

	int* out = new int[512];
	int* array1 = new int[512];
	int* array2 = new int[512];

	for(int i = 0; i < 512; i++) {out[i] = i; array1[i] = i; array2[i] = i;}

	const int zero = 0;
	clEnqueueFillBuffer(commandQueue, outBuffer, &zero, sizeof(int), 0, arraySize, 0, nullptr, nullptr);

	clEnqueueWriteBuffer(commandQueue, array1Buffer, CL_TRUE, 0, arraySize, array1, 0, nullptr, nullptr);
	clEnqueueWriteBuffer(commandQueue, array2Buffer, CL_TRUE, 0, arraySize, array2, 0, nullptr, nullptr);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), &outBuffer);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &array1Buffer);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &array2Buffer);

	size_t globalDimensions[1];
	globalDimensions[0] = 512;
	size_t workGroupDimensions[1];
	workGroupDimensions[0] = 32;

	clEnqueueNDRangeKernel(commandQueue, kernel, 1, nullptr, globalDimensions, workGroupDimensions, 0, nullptr, nullptr);
	clFlush(commandQueue);

	clEnqueueReadBuffer(commandQueue, outBuffer, CL_TRUE, 0, arraySize, out, 0, nullptr, nullptr);

	for(int i = 0; i < 512; i++) {
		std::cout << out[i] << ", ";
	}

	return 0;
}