Archive

Archive for November, 2011

Aparapi Java Matrix Multiplication Example



import java.util.Random;
import com.amd.aparapi.Kernel;

/**
 * @author Vasanth Raja Chittampally
 */

public class AparapiMatrixMultiplication  {
	/**
	 * Driver: multiplies two 1024x1024 random matrices with the Aparapi
	 * kernel, then sequentially on the CPU, and compares the two results.
	 */
	public static void main(String [] args) throws Exception
	{

		final int r = 1024;
		final int c1 = r;
		final int c2 = r;
		AparapiMatMul ap = new AparapiMatMul(r, c1, c2);

		try {
		long  time1 = System.currentTimeMillis();
                //ap.setExecutionMode(Kernel.EXECUTION_MODE.JTP);
                //ap.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
                //ap.setExecutionMode(Kernel.EXECUTION_MODE.CPU);
		// NOTE(review): execute(r, c2) launches a global range of r work items
		// for c2 passes (the kernel pairs getGlobalId() with getPassId()).
		// This is not equivalent to one 2-D NDRange launch; the corrected
		// version later in this post uses execute(r * c2) instead.
		ap.execute(r,c2);
		System.out.println("Time taken for kenel execution in "+ ap.getExecutionMode()+ " mode is :"+ (System.currentTimeMillis() - time1));
		}catch(NullPointerException ne){
			// Aparapi may surface device/driver setup failures as NPEs.
			ne.printStackTrace();
		}
		//ap.printResults();
		long time1 = System.currentTimeMillis();
		ap.normalMatMulCalc();
		System.out.println("Time taken for kenel execution in Sequential CPU mode is :"+ (System.currentTimeMillis() - time1));
		ap.compareResults();
		ap.dispose();
	}
}

class AparapiMatMul extends Kernel {

	// Flat row-major buffers: matA is rows x cols1, matB is cols1 x cols2,
	// matC holds the kernel result and C the sequential result (rows x cols2).
	float matA[];
	float matB[];
	float matC[];
	float C[];

	int rows ;
	int cols1;
	int cols2;

	// Kernel body: the output row comes from the global id and the column
	// from the pass id, so this relies on being launched as execute(rows, cols2).
	@Override
	public void run() {
		int i = getGlobalId();
		int j = getPassId();
		float value = 0;
		// Dot product of row i of A with column j of B.
		for(int k = 0; k < cols1; k++)
		{
			value += matA[k + i * cols1] * matB[k * cols2 + j];
		}
		// NOTE(review): the row stride of matC should be cols2; this only
		// works here because the matrices are square (cols1 == cols2).
		matC[i * cols1 + j] = value;
	}

	/**
	 * Allocates all four matrices and fills A and B with random values.
	 * r = rows of A, c1 = cols of A (= rows of B), c2 = cols of B.
	 * NOTE(review): the loop bounds assume square matrices (r == c1 == c2).
	 */
	public AparapiMatMul(int r, int c1, int c2)
	{

		rows = r;
		cols1 = c1;
		cols2 = c2;

		matA = new float [r * c1];
		matB = new float [c1 * c2];
		matC = new float [r * c2];
		C = new float[r * c2];
		//matC should be initialized with zeros
		// NOTE(review): redundant — Java zero-initializes float arrays.
		for(int i = 0; i < r; i++ )
		{
			for(int j = 0 ; j < c1; j++ )
			{
				matC[i * c1 + j ] = 0;

			}
		}

		//Here matrix A is initialized with random numbers
		// NOTE(review): a new Random is allocated per element; one shared
		// instance would be cheaper and is used in the corrected version.
		for(int i = 0; i < r; i++ )
		{
			for(int j = 0 ; j < c1; j++ )
			{
				matA[i * c1 +j] = new Random().nextFloat();
			}
		}

		// Here matrix B is initialized with random numbers

		for(int i = 0; i < r; i++ )
		{
			for(int j = 0 ; j < c1; j++ )
			{
				matB[i * c2 + j] = new Random().nextFloat();
			}
		}

	}

	/** Prints the kernel result matrix matC in row-major order. */
	public void printResults()
	{
		for(int i = 0; i < rows; i++ )
		{
			for(int j = 0 ; j < cols2; j++ )
			{
				System.out.print(matC[i * cols2 + j]+"    ");
			}
		}
	}

	/** Sequential reference multiplication on the CPU, written into C. */
	public void normalMatMulCalc()
	{
		System.out.println();
		System.out.println("Sequential Execution on CPU");
		 for(int i = 0;i < rows; i++)
			{
				for(int j = 0; j < cols2; j++)
				{
					float sum = 0;
					for(int k = 0; k < cols1; k++)
					{
						// NOTE(review): B's row stride is cols2, not rows;
						// correct here only because the matrices are square.
						sum += matA[i*cols1+k] * matB[k*rows+j];
					}
				    C[i * cols2 + j] = sum;
				}

			}
	}
	/**
	 * Compares the kernel result against the sequential result element by
	 * element (exact float equality — both paths accumulate in the same order).
	 */
	public void compareResults()
	{
		boolean equal = true;
		for(int i = 0; i < rows * cols2 ; i++)
		{
			if(matC[i] != C[i])
			{
				equal = false;
				break;
			}
		}
		if(!equal)
			System.out.println("Results are not equal");
		else
			System.out.println("Results are equal.. Tested thoroughly!!!");
	}

}

The code above simply performs the matrix multiplication operation. The overridden run method is the kernel code, which runs
on the GPU, in a Java thread pool (JTP), or on the CPU. The code is first compiled to Java bytecode, and that bytecode is then
translated by Aparapi into OpenCL code.

You can compare the ease of writing the code above with the equivalent OpenCL C code here, but we do need to compromise on some optimizations.

Results are as follows:
Output1:

Time taken for kenel execution in GPU mode is :8791
Sequential Execution on CPU
Time taken for kenel execution in Sequential CPU mode is :11580
Results are equal.. Tested thoroughly!!!

Output 2:

Time taken for kenel execution in JTP mode is :7765
Sequential Execution on CPU
Time taken for kenel execution in Sequential CPU mode is :12491
Results are equal.. Tested thoroughly!!!

 

Thanks to Gary Frost for your inputs on this program. Here I’m posting the changes I made to the program above. The ap.execute(r, c2) call runs the kernel in c2 passes, which is not the same as a single clEnqueueNDRangeKernel() launch. The corrected code is as follows.

 


import java.util.Random;
import com.amd.aparapi.Kernel;

/**
 * @author Vasanth Raja Chittampally
 */

public class AparapiMatrixMultiplication  {
	/**
	 * Entry point: multiplies two 1024x1024 random matrices with an Aparapi
	 * kernel (one work item per output element), repeats the multiplication
	 * sequentially on the CPU, and compares the two results.
	 *
	 * @param args unused
	 * @throws Exception propagated from Aparapi execution
	 */
	public static void main(String [] args) throws Exception
	{
		final int r = 1024;  // rows of A (and of the result)
		final int c1 = r;    // cols of A == rows of B
		final int c2 = r;    // cols of B (and of the result)
		AparapiMatMul ap = new AparapiMatMul(r, c1, c2);

		try {
			ap.setExecutionMode(Kernel.EXECUTION_MODE.GPU);
			long time1 = System.currentTimeMillis();
			// One-dimensional global range: one work item per element of C.
			ap.execute(r * c2);
			System.out.println("Time taken for kernel execution in "
					+ ap.getExecutionMode() + " mode is :"
					+ (System.currentTimeMillis() - time1));
		} catch (NullPointerException ne) {
			// Aparapi may surface device/driver setup failures as NPEs.
			ne.printStackTrace();
		}
		//ap.printResults();
		long time1 = System.currentTimeMillis();
		ap.normalMatMulCalc();
		System.out.println("Time taken for kernel execution in Sequential CPU mode is :"
				+ (System.currentTimeMillis() - time1));
		ap.compareResults();
		ap.dispose(); // release OpenCL/JTP resources held by the kernel
	}
}

class AparapiMatMul extends Kernel {

	// Flat row-major buffers: matA is rows x cols1, matB is cols1 x cols2,
	// matC holds the kernel result and C the sequential result (rows x cols2).
	float matA[];
	float matB[];
	float matC[];
	float C[];

	int rows ;
	int cols1;
	int cols2;

	/**
	 * Kernel body: each work item computes one element of the result.
	 * The 1-D global id is decomposed as id = i * cols2 + j.
	 */
	@Override
	public void run() {
		int i = getGlobalId() / cols2; // output row (cols2, not rows: correct for non-square too)
		int j = getGlobalId() % cols2; // output column
		float value = 0;
		// Dot product of row i of A with column j of B.
		for(int k = 0; k < cols1; k++)
		{
			value += matA[i * cols1 + k] * matB[k * cols2 + j];
		}
		// Row stride of the result is cols2 (cols1 only worked for square matrices).
		matC[i * cols2 + j] = value;
	}

	/**
	 * Allocates all four matrices and fills A and B with random values.
	 *
	 * @param r  rows of A (and of the result)
	 * @param c1 columns of A == rows of B
	 * @param c2 columns of B (and of the result)
	 */
	public AparapiMatMul(int r, int c1, int c2)
	{
		rows = r;
		cols1 = c1;
		cols2 = c2;

		matA = new float [r * c1];
		matB = new float [c1 * c2];
		// No explicit zero-fill needed: Java zero-initializes float arrays.
		matC = new float [r * c2];
		C = new float[r * c2];

		// One shared PRNG instead of allocating a new Random per element.
		Random rand = new Random();

		// Matrix A (r x c1) is initialized with random numbers.
		for(int i = 0; i < r; i++ )
		{
			for(int j = 0 ; j < c1; j++ )
			{
				matA[i * c1 + j] = rand.nextFloat();
			}
		}

		// Matrix B (c1 x c2) is initialized with random numbers.
		for(int i = 0; i < c1; i++ )
		{
			for(int j = 0 ; j < c2; j++ )
			{
				matB[i * c2 + j] = rand.nextFloat();
			}
		}
	}

	/** Prints the kernel result matrix matC in row-major order. */
	public void printResults()
	{
		for(int i = 0; i < rows; i++ )
		{
			for(int j = 0 ; j < cols2; j++ )
			{
				System.out.print(matC[i * cols2 + j]+"    ");
			}
		}
	}

	/** Sequential reference multiplication on the CPU, written into C. */
	public void normalMatMulCalc()
	{
		System.out.println();
		System.out.println("Sequential Execution on CPU");
		for(int i = 0; i < rows; i++)
		{
			for(int j = 0; j < cols2; j++)
			{
				float sum = 0;
				for(int k = 0; k < cols1; k++)
				{
					// B's row stride is cols2 (k*rows only worked for square matrices).
					sum += matA[i * cols1 + k] * matB[k * cols2 + j];
				}
				C[i * cols2 + j] = sum;
			}
		}
	}

	/**
	 * Compares the kernel result against the sequential result element by
	 * element. Exact float equality is used; both paths accumulate in the
	 * same k-order, so identical hardware arithmetic yields identical bits.
	 */
	public void compareResults()
	{
		boolean equal = true;
		for(int i = 0; i < rows * cols2 ; i++)
		{
			if(matC[i] != C[i])
			{
				equal = false;
				break;
			}
		}
		if(!equal)
			System.out.println("Results are not equal");
		else
			System.out.println("Results are equal.. Tested thoroughly!!!");
	}

}

The results I got are amazing.. I’m posting the results I got in my PC having AMD Radeon 5670 Graphics card.

Output:  GPU Mode

Time taken for kenel execution in GPU mode is :    838
Sequential Execution on CPU
Time taken for kenel execution in Sequential CPU mode is :   13335
Results are equal.. Tested thoroughly!!!

Output: JTP Mode

Time taken for kenel execution in JTP mode is :5671
Sequential Execution on CPU
Time taken for kenel execution in Sequential CPU mode is :13516
Results are equal.. Tested thoroughly!!!

Advertisements

Aparapi + Java + OpenCL


Since I’m working on OpenCL project, I was very curious to know, how to reduce the number of lines of code to be written for any program in OpenCL. While searching for the way to reduce the code I came to know the release of Aparapi.

It is an  API for expressing data parallel workloads in Java and a runtime component capable of converting the Java bytecode of compatible workloads into OpenCL so that it can be executed on a variety of GPU devices.

Some cool features of Aparapi are

* No need of thinking of explicit data transfers between host program and GPU kernel(If required you can do for the improvements purposes)

* No need of querying for the platform, devices

* No need of creating context and managing command queue

* No need of writing code for creating buffers

* No need of setting kernel arguments explicitly

* No need of using enqueueNDRangeKernel() and remembering so many parameters to set the global threads, local threads and dimensions

* If a GPU is not found, Aparapi still transforms the kernel code to use either JTP (Java Thread Pool) or the CPU. It is not required to check explicitly whether a GPU is present, and there is no need to create threads explicitly to manage data parallelism.

* No need to learn C99 OpenCL C to write Highly data parallel applications

* Fast learning curve

*Aparapi is an open source project

Limitations of Aparapi

Although Aparapi has a lot of advantages, there are some limitations:

* Cannot support local memory(Huge optimization loss here)

* May not reach the highly optimized openCL C wrapper performance for complex mathematical operations

Notwithstanding the lack of local-memory support, Aparapi is great to use, because most data-parallel applications written in Java can benefit hugely from it.

Aparapi Download link

Aparapi Getting Started

Installation Notes

Note: All the views presented here are just subjective and my personal perceptions. If there any thing wrong or I’m not updated with the future releases please notify me.

Thanks for reading!!

Sieve of Eratosthenes C++ source code


The Sieve of Eratosthenes is one of the most widely used prime-generation algorithms.
The algorithm is very simple and straightforward.

To find all the prime numbers less than or equal to a given integer n by Eratosthenes’ method:
1. Create a list of consecutive integers from 2 to n: (2, 3, 4, …, n).
2. Initially, let p equal 2, the first prime number.
3. Starting from p, count up in increments of p and mark each of these numbers greater than p itself in the list. These numbers will be 2p, 3p, 4p, etc.; note that some of them may have already been marked.
4. Find the first number greater than p in the list that is not marked; let p now equal this number (which is the next prime).
5. If there were no more unmarked numbers in the list, stop. Otherwise, repeat from step 3.
When the algorithm terminates, all the numbers in the list that are not marked are prime.

Improvements over the basic Sieve of Eratosthenes:
One improvement that substantially reduces the computation is skipping the multiples of 2 altogether (2 being the only even prime). By considering only the odd numbers, we halve the size of the candidate array and improve performance.

The following C++ code uses the above improvement and treats each element of the primes array as an odd number, i.e. primes[n] represents the number 2*n+1. At the end of the run, the primes array holds ones at exactly the indices corresponding to prime numbers.

The below code can be seen running using ideone online compiler

// Written by Vasanth Raja Chittampally
// Source Code: Sieve of Eratosthenes
// All rights are reserved
#include <iostream>
#include <ctime>

// Odd-only sieve: primes[i] == 1 means the odd number 2*i+1 is still a
// prime candidate. The caller supplies primes pre-filled with 1s (with
// primes[0] cleared, since 1 is not prime) and sized (start+range)/2 + 1.
// Crosses off odd composites up to start + range and prints how many
// primes lie in [start, start + range].
// NOTE(review): reconstructed — the published listing lost its loop
// conditions to HTML mangling; verify against the original source.
void seive(int start, int range, int * primes)
{
    // Sieving is done here
    long start1 = clock();
    int end = start + range;

    // Cross off odd multiples of every odd prime p = 2*i+1 with p*p <= end.
    for (int i = 1; (2L * i + 1) * (2L * i + 1) <= end; i++)
    {
        int iter = 2 * i + 1;
        if (primes[i] == 1)
        {
            // Start at iter*iter (smaller multiples were removed by smaller
            // primes) and step by 2*iter to stay on odd numbers only.
            for (long j = (long)iter * iter; j <= end; j += 2L * iter)
                primes[(j - 1) / 2] = 0;
        }
    }
    std::cout << "Sieving time: " << (clock() - start1) << std::endl;

    // Count (and optionally display) the primes in [start, end].
    int count = 0;
    for (int i = start / 2; i <= end / 2; i++)
        if (primes[i] != 0)
        {
            //std::cout << 2 * i + 1 << " ";
            count++;
        }
    std::cout << "Primes found in range: " << count << std::endl;
}
int main(void)
{
const int range = 5000000 ;

const int start = 1000000 ;
int * primes = new int[range / 2 + 1];
long time1 = clock();
for(int i = 0; i {
primes[i] = 1;
}
for(int i = 4; i {
primes[i] = 0;
}
for(int i = 7; i {
primes[i] = 0;
}
for(int i = 10; i {
primes[i] = 0;
}
for(int i = 16; i {
primes[i] = 0;
}
long totTime = clock() - time1;
std:: cout << std::endl< return 0;
}

The above code can be seen running using ideone online compiler