Home > Programming, Technology > OpenCL GPU Matrix multiplication program

OpenCL GPU Matrix multiplication program


Here is the code to multiply two matrices using heterogeneous system programming language OpenCL. The reason being called OpenCL as heterogeneous is that written a code in OpenCL can be ported to CPU or GPU or Cell processor.


//Author: Vasanth Raja
//Program to multiply two matrices using OpenCL in GPU

#include "stdafx.h"

#include < stdio.h >
#include < stdlib.h >
#include < time.h >
#include < ctime >

#define widthA 128
#define heightA 128

#define widthB heightA
#define heightB 128

#define widthC widthA
#define heightC heightB

#ifdef __APPLE__
#include < OpenCL/opencl.h >
#else
#include < CL/cl.h >
#endif

#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)

int main()
{
  float * A = (float *)malloc(sizeof(float)*widthA*heightA);
  float * B = (float *)malloc(sizeof(float)*widthB*heightB);
  float * C = (float *)malloc(sizeof(float)*widthC*heightC);
  float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
  float * D= (float *)malloc(sizeof(float)*widthC*heightC);

   FILE * fp1 = fopen("matAdata.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
   }

  for(int i = 0;i < widthA; i++)
  {
		for(int j=0;j		{
			float p=(rand()%100)/7.0;
			*(A+i*heightA+j)=rand()%100 + p;
			fprintf(fp1, "%f ",*(A+i*heightA+j));
		}
		fprintf(fp1, "\n");
   }
   fclose(fp1);

   fp1 = fopen("matBdata.txt", "w");
   if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
   }

	for(int i = 0;i < widthB; i++)
	{
		for(int j=0; j		{
			float p=(rand()%100)/7.0;
			*((B+i*heightB+j))=rand()%100 + p;
			fprintf(fp1, "%f ",*(B+i*heightA+j));
		}
		fprintf(fp1, "\n");
	}
	fclose(fp1);

  cl_device_id device_id = NULL;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobjA = NULL;
  cl_mem memobjB = NULL;
  cl_mem memobjC = NULL;
  cl_mem rowA = NULL;
  cl_mem colC = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices;
  cl_uint ret_num_platforms;
  cl_int ret;

  //char string[MEM_SIZE];

  FILE *fp;
  char fileName[] = "./hello.cl";
  char *source_str;
  size_t source_size;
  int row = widthA;
  int col = heightC;
  /* Load the source code containing the kernel*/
  fp = fopen(fileName, "r");
  if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
  }
  source_str = (char*)malloc(MAX_SOURCE_SIZE);
  source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose( fp );

  /* Get Platform and Device Info */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

  /* Create OpenCL context */
  context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

  /* Create Command Queue */
  command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

  /* Create Memory Buffer */
  memobjA = clCreateBuffer(context, CL_MEM_READ_WRITE, widthA * heightA * sizeof(float), NULL, &ret);
  memobjB = clCreateBuffer(context, CL_MEM_READ_WRITE, widthB * heightB * sizeof(float), NULL, &ret);
  memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
  rowA = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(int), NULL, &ret);
  colC = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(int), NULL, &ret);

  // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
           widthA * heightA * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
            widthB * heightB * sizeof(int), B, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);

  /* Create Kernel Program from the source */
  program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
				                      (const size_t *)&source_size, &ret);

  /* Build Kernel Program */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

  /* Create OpenCL Kernel */
  kernel = clCreateKernel(program, "matrixMultiplication", &ret);

  /* Set OpenCL Kernel Arguments */
  ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
  ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
  ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
  //ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
  ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
  ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
  /* Execute OpenCL Kernel */
  //ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
  size_t globalThreads[2] = {widthA, heightB};
  size_t localThreads[2] = {16,16};

  clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, NULL, 0, NULL);
  /* Copy results from the memory buffer */
  ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
			                widthA * heightC * sizeof(float),Res, 0, NULL, NULL);

  fp1 = fopen("matGPURes.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
  }

  printf("\nResult\n");
	for(int i = 0;i < widthA; i++)
	{
		for(int j=0;j < heightC; j++)
		{

			fprintf(fp1, "%f ",*(Res+i*heightC+j));

		}
		fprintf(fp1, "\n");
	}
	fclose(fp1);

  ret = clFlush(command_queue);
  ret = clFinish(command_queue);
  ret = clReleaseKernel(kernel);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobjA);
  ret = clReleaseMemObject(memobjB);
  ret = clReleaseMemObject(memobjC);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);

  free(source_str);
  system("pause");

  float sum=0.0;

  for(int i = 0;i < widthA; i++)
	{
		for(int j = 0; j < heightC; j++)
		{
			sum = 0;
			for(int k = 0; k < widthB; k++)
			{
				sum += A[i*col+k] * B[k*row+j];
			}
		D[i*heightC+j] = sum;
		}

	}

    fp1 = fopen("matNormalMultiplicationRes.txt", "w");
  if (!fp1) {
    fprintf(stderr, "Failed to open matAdata.\n");
    exit(1);
  }

  printf("\nResult\n");
	for(int i = 0;i < widthA; i++)
	{
		for(int j=0;j < heightC; j++)
		{
			fprintf(fp1, "%f ",*(D+i*heightC+j));

		}
		fprintf(fp1, "\n");
	}
   system("pause");
  return 0;
}

You can check configuration and set up in Visual studio here.

The actual Kernel executed in the GPU is as follows.

__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C,  int widthA, int widthB )
{
	int i = get_global_id(0);
	int j = get_global_id(1);
	float value=0;
	for ( int k = 0; k < widthA; k++)
	{
		value = value + A[k + j * widthA] * B[k*widthB + i];
	}
	C[i + widthA * j] = value;
}

Advertisements
  1. Ivan
    January 17, 2012 at 10:01 pm

    Thank you very much!!! I learned a lot from it!!!

    Could you please upload code for matrix inversion!!!! or give at least a hint!!!

  2. KOnark PAtel
    February 23, 2013 at 1:58 pm

    thanks for this code. i have a query. when i used few days back this code on my w8, visual studio, it was working well. but since today the program runs too slowly for executing this lines
    float p=(rand()%100)/7.0;
    *(A+i*heightA+j)=rand()%100 + p;
    fprintf(fp1, “%f “,*(A+i*heightA+j));

    and also program is not finishing. command console just remains open. can u suggest any solution?

  3. January 17, 2014 at 10:49 pm

    Useful solution. In line 45 and 62 you have a bug – missing elements of for loop.

    For me it’s works correctly, but I getting following output:

    Result
    sh: pause: command not found

    Result
    sh: pause: command not found

    What means “sh: pause: command not found”?

  4. Alex
    January 19, 2014 at 5:15 pm

    Cool code thx. Hovewer I tried to implement it in vs 2012 and got two errors.

    1.First:
    for(int i = 0;i < widthA; i++)
    {
    for(int j=0;j { //I think you forgot here the hightA
    float p=(rand()%100)/7.0;
    *(A+i*heightA+j)=rand()%100 + p;
    fprintf(fp1, "%f ",*(A+i*heightA+j));
    }
    fprintf(fp1, "\n");
    }

    2.Second:
    for(int i = 0;i < widthB; i++)
    {
    for(int j=0; j { //Here the same thing
    float p=(rand()%100)/7.0;
    *((B+i*heightB+j))=rand()%100 + p;
    fprintf(fp1, "%f ",*(B+i*heightA+j));
    }
    fprintf(fp1, "\n");
    }

    Best regards Alex

  1. November 20, 2011 at 12:28 pm

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: