
Aparapi + Java + OpenCL


Since I’m working on an OpenCL project, I was very curious to find out how to reduce the number of lines of code that have to be written for any OpenCL program. While searching for a way to reduce the code, I came across the release of Aparapi.

It is an API for expressing data-parallel workloads in Java, plus a runtime component capable of converting the Java bytecode of compatible workloads into OpenCL so that they can be executed on a variety of GPU devices.

Some cool features of Aparapi (a minimal example follows this list):

* No need to think about explicit data transfers between the host program and the GPU kernel (if required, you can still manage them yourself for performance tuning)

* No need to query for platforms and devices

* No need to create a context or manage a command queue

* No need to write buffer-creation code

* No need to set kernel arguments explicitly

* No need to call clEnqueueNDRangeKernel() and remember its many parameters for setting the global threads, local threads, and dimensions

* If no GPU is found, Aparapi still transforms the kernel code to use either a JTP (Java Thread Pool) or the CPU. You don’t have to check explicitly whether a GPU is present, and there is no need to create threads explicitly to manage the data parallelism.

* No need to learn C99 OpenCL C to write highly data-parallel applications

* Short learning curve

* Aparapi is an open-source project
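
To give a feel for how little boilerplate is left, here is a minimal vector-addition sketch. This is my own illustrative example, assuming the com.amd.aparapi package of the AMD release; Kernel, Range, getGlobalId(), execute() and dispose() are the core API described in the Aparapi getting-started material.

import com.amd.aparapi.Kernel;
import com.amd.aparapi.Range;

public class VectorAdd {
    public static void main(String[] args) {
        final int size = 1024;
        final float[] a = new float[size];
        final float[] b = new float[size];
        final float[] sum = new float[size];

        for (int i = 0; i < size; i++) {
            a[i] = i;
            b[i] = 2.0f * i;
        }

        // The body of run() is what Aparapi converts from bytecode to OpenCL.
        Kernel kernel = new Kernel() {
            @Override
            public void run() {
                int gid = getGlobalId(); // analogous to get_global_id(0) in OpenCL C
                sum[gid] = a[gid] + b[gid];
            }
        };

        // No platform query, no context, no command queue, no buffers,
        // no clSetKernelArg calls -- execute() takes care of all of it.
        kernel.execute(Range.create(size));
        kernel.dispose();
    }
}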

Limitations of Aparapi

Even though Aparapi has a lot of advantages, there are some limitations:

* No support for local memory (a huge optimization loss here)

* May not reach the performance of highly optimized OpenCL C for complex mathematical operations

Notwithstanding the lack of local-memory support, Aparapi is great to use, because most data-parallel applications written in Java can benefit hugely from it.
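
As a concrete illustration of the JTP/CPU fallback mentioned above, here is a small sketch, assuming the EXECUTION_MODE API of the AMD release (newer releases may expose this differently; the ModeCheck class name is mine):

import com.amd.aparapi.Kernel;
import com.amd.aparapi.Range;

public class ModeCheck {
    public static void main(String[] args) {
        final float[] out = new float[256];
        Kernel kernel = new Kernel() {
            @Override
            public void run() {
                out[getGlobalId()] = getGlobalId() * 0.5f;
            }
        };
        // Request GPU execution; Aparapi silently falls back to a
        // Java Thread Pool (JTP) if no usable GPU/OpenCL runtime is found.
        kernel.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
        kernel.execute(Range.create(out.length));
        // Report where the kernel actually ran (GPU, CPU, JTP or SEQ).
        System.out.println("Executed in mode: " + kernel.getExecutionMode());
        kernel.dispose();
    }
}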

Aparapi Download link

Aparapi Getting Started

Installation Notes

Note: All the views presented here are subjective and just my personal perceptions. If anything is wrong, or if I’m not up to date with future releases, please notify me.

Thanks for reading!!

OpenCL GPU Matrix multiplication program


Here is the code to multiply two matrices using OpenCL, a programming language for heterogeneous systems. OpenCL is called heterogeneous because code written in it can be ported to a CPU, a GPU, or a Cell processor.


//Author: Vasanth Raja
//Program to multiply two matrices using OpenCL in GPU

#include "stdafx.h"

#include < stdio.h >
#include < stdlib.h >
#include < time.h >
#include < ctime >

#define widthA 128
#define heightA 128

#define widthB heightA
#define heightB 128

#define widthC widthA
#define heightC heightB

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)

int main()
{
  float * A = (float *)malloc(sizeof(float)*widthA*heightA);
  float * B = (float *)malloc(sizeof(float)*widthB*heightB);
  float * C = (float *)malloc(sizeof(float)*widthC*heightC);
  float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
  float * D= (float *)malloc(sizeof(float)*widthC*heightC);

  FILE * fp1 = fopen("matAdata.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
  }

  // Fill A with random values and save them for later inspection.
  for (int i = 0; i < widthA; i++)
  {
    for (int j = 0; j < heightA; j++)
    {
      float p = (rand() % 100) / 7.0f;
      *(A + i * heightA + j) = rand() % 100 + p;
      fprintf(fp1, "%f ", *(A + i * heightA + j));
    }
    fprintf(fp1, "\n");
  }
  fclose(fp1);

   fp1 = fopen("matBdata.txt", "w");
   if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
   }

	for(int i = 0;i < widthB; i++)
	{
		for(int j=0; j		{
			float p=(rand()%100)/7.0;
			*((B+i*heightB+j))=rand()%100 + p;
			fprintf(fp1, "%f ",*(B+i*heightA+j));
		}
		fprintf(fp1, "\n");
	}
	fclose(fp1);

  cl_device_id device_id = NULL;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobjA = NULL;
  cl_mem memobjB = NULL;
  cl_mem memobjC = NULL;
  cl_mem rowA = NULL;
  cl_mem colC = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices;
  cl_uint ret_num_platforms;
  cl_int ret;

  //char string[MEM_SIZE];

  FILE *fp;
  char fileName[] = "./hello.cl";
  char *source_str;
  size_t source_size;
  int row = widthA;
  int col = heightC;
  /* Load the source code containing the kernel*/
  fp = fopen(fileName, "r");
  if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
  }
  source_str = (char*)malloc(MAX_SOURCE_SIZE);
  source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose( fp );

  /* Get Platform and Device Info */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

  /* Create OpenCL context */
  context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

  /* Create Command Queue */
  command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

  /* Create Memory Buffer */
  memobjA = clCreateBuffer(context, CL_MEM_READ_WRITE, widthA * heightA * sizeof(float), NULL, &ret);
  memobjB = clCreateBuffer(context, CL_MEM_READ_WRITE, widthB * heightB * sizeof(float), NULL, &ret);
  memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
  rowA = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(int), NULL, &ret);
  colC = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(int), NULL, &ret);

  // Copy the matrices A and B to their respective memory buffers
  ret = clEnqueueWriteBuffer(command_queue, memobjA, CL_TRUE, 0,
          widthA * heightA * sizeof(float), A, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
          widthB * heightB * sizeof(float), B, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);

  /* Create Kernel Program from the source */
  program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
				                      (const size_t *)&source_size, &ret);

  /* Build Kernel Program */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

  /* Create OpenCL Kernel */
  kernel = clCreateKernel(program, "matrixMultiplication", &ret);

  /* Set OpenCL Kernel Arguments */
  ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
  ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
  ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
  ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
  ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
  /* Execute OpenCL Kernel as a 2-D NDRange */
  size_t globalThreads[2] = {widthA, heightB};
  size_t localThreads[2] = {16, 16};

  ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
  /* Copy results from the memory buffer */
  ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
			                widthA * heightC * sizeof(float),Res, 0, NULL, NULL);

  fp1 = fopen("matGPURes.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
  }

  printf("\nResult\n");
	for(int i = 0;i < widthA; i++)
	{
		for(int j=0;j < heightC; j++)
		{

			fprintf(fp1, "%f ",*(Res+i*heightC+j));

		}
		fprintf(fp1, "\n");
	}
	fclose(fp1);

  ret = clFlush(command_queue);
  ret = clFinish(command_queue);
  ret = clReleaseKernel(kernel);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobjA);
  ret = clReleaseMemObject(memobjB);
  ret = clReleaseMemObject(memobjC);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);

  free(source_str);

  // CPU reference implementation for verification.
  float sum = 0.0f;

  for (int i = 0; i < widthA; i++)
  {
    for (int j = 0; j < heightC; j++)
    {
      sum = 0;
      for (int k = 0; k < widthB; k++)
      {
        sum += A[i * heightA + k] * B[k * heightB + j];
      }
      D[i * heightC + j] = sum;
    }
  }

    fp1 = fopen("matNormalMultiplicationRes.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
  }

  printf("\nResult\n");
	for(int i = 0;i < widthA; i++)
	{
		for(int j=0;j < heightC; j++)
		{
			fprintf(fp1, "%f ",*(D+i*heightC+j));

		}
		fprintf(fp1, "\n");
	}
   system("pause");
  return 0;
}

You can check the configuration and setup in Visual Studio here.

The actual kernel executed on the GPU is as follows.

__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int widthA, int widthB)
{
    // Each work-item computes one element of C.
    int i = get_global_id(0); // column index
    int j = get_global_id(1); // row index
    float value = 0;
    for (int k = 0; k < widthA; k++)
    {
        value += A[j * widthA + k] * B[k * widthB + i];
    }
    C[j * widthA + i] = value;
}