Cuda_preformance_analysis/arrayMul.cu

// Array multiplication: C = A * B:

// System includes
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <math.h>
#include <stdlib.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"


#ifndef THREADS_PER_BLOCK
#define THREADS_PER_BLOCK		128	// number of threads in each block
#endif

#ifndef DATASET_SIZE
#define DATASET_SIZE			( 8*1024*1024 )	// size of the array
#endif

float hA[ DATASET_SIZE ];
float hB[ DATASET_SIZE ];
float hC[ DATASET_SIZE ];

#ifndef TOL
#define TOL		0.00001f		// tolerance to relative error
#endif

void
CudaCheckError( )
{
	cudaError_t e = cudaGetLastError( );
	if( e != cudaSuccess )
	{
		fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));
	} 
}

// array multiplication on the device: C = A * B

__global__  void ArrayMul( float *A, float *B, float *C )
{
	int gid = blockIdx.x*blockDim.x + threadIdx.x;
	if( gid < DATASET_SIZE )
		C[gid] = A[gid] * B[gid];
}


// main program:

int
main( int argc, char* argv[ ] )
{
	//int dev = findCudaDevice(argc, (const char **)argv);

	// fill host memory:

	for( int i = 0; i < DATASET_SIZE; i++ )
	{
		hA[i] = hB[i] = sqrtf(  (float)i  );
	}

	// allocate device memory:

	float *dA, *dB, *dC;

	cudaMalloc( (void **)(&dA), sizeof(hA) );
	cudaMalloc( (void **)(&dB), sizeof(hB) );
	cudaMalloc( (void **)(&dC), sizeof(hC) );
	CudaCheckError( );

	// copy host memory to the device:

	cudaMemcpy( dA, hA, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
	cudaMemcpy( dB, hB, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

	// setup the execution parameters:

	dim3 grid( DATASET_SIZE / THREADS_PER_BLOCK, 1, 1 );
	dim3 threads( THREADS_PER_BLOCK, 1, 1 );

	// create and start the timer:

	cudaDeviceSynchronize( );

	// allocate the events that we'll use for timing:

	cudaEvent_t start, stop;
	cudaEventCreate( &start );
	cudaEventCreate( &stop );
	CudaCheckError( );

	// record the start event:

	cudaEventRecord( start, NULL );
	CudaCheckError( );

	// execute the kernel:

	ArrayMul<<< grid, threads >>>( dA, dB, dC );

	// record the stop event:

	cudaEventRecord( stop, NULL );
	CudaCheckError( );

	// wait for the stop event to complete:

	cudaEventSynchronize( stop );
	CudaCheckError( );

	float msecTotal;
	cudaEventElapsedTime( &msecTotal, start, stop );
	CudaCheckError( );

	// compute and print the performance

	double secondsTotal = 0.001 * (double)msecTotal;
	double multsPerSecond = (double)DATASET_SIZE / secondsTotal;
	double megaMultsPerSecond = multsPerSecond / 1000000.;
	fprintf( stderr, "%12d, %4d, %10.2lf\n", DATASET_SIZE, THREADS_PER_BLOCK, megaMultsPerSecond );

	// copy result from the device to the host:

	cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
	CudaCheckError( );

	// clean up:

	cudaFree( dA );
	cudaFree( dB );
	cudaFree( dC );
	CudaCheckError( );

	return 0;
}
migrated repo contents from old git server 2023-02-02 23:11:06 +00:00			`// Array multiplication: C = A * B:`

			`// System includes`
			`#include <stdio.h>`
			`#include <assert.h>`
			`#include <malloc.h>`
			`#include <math.h>`
			`#include <stdlib.h>`

			`// CUDA runtime`
			`#include <cuda_runtime.h>`

			`// Helper functions and utilities to work with CUDA`
			`#include "helper_functions.h"`
			`#include "helper_cuda.h"`


			`#ifndef THREADS_PER_BLOCK`
			`#define THREADS_PER_BLOCK 128 // number of threads in each block`
			`#endif`

			`#ifndef DATASET_SIZE`
			`#define DATASET_SIZE ( 810241024 ) // size of the array`
			`#endif`

			`float hA[ DATASET_SIZE ];`
			`float hB[ DATASET_SIZE ];`
			`float hC[ DATASET_SIZE ];`

			`#ifndef TOL`
			`#define TOL 0.00001f // tolerance to relative error`
			`#endif`

			`void`
			`CudaCheckError( )`
			`{`
			`cudaError_t e = cudaGetLastError( );`
			`if( e != cudaSuccess )`
			`{`
			`fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));`
			`}`
			`}`

			`// array multiplication on the device: C = A * B`

			`__global__ void ArrayMul( float A, float B, float *C )`
			`{`
			`int gid = blockIdx.x*blockDim.x + threadIdx.x;`
			`if( gid < DATASET_SIZE )`
			`C[gid] = A[gid] * B[gid];`
			`}`



			`// main program:`

			`int`
			`main( int argc, char* argv[ ] )`
			`{`
			`//int dev = findCudaDevice(argc, (const char **)argv);`

			`// fill host memory:`

			`for( int i = 0; i < DATASET_SIZE; i++ )`
			`{`
			`hA[i] = hB[i] = sqrtf( (float)i );`
			`}`

			`// allocate device memory:`

			`float dA, dB, *dC;`

			`cudaMalloc( (void **)(&dA), sizeof(hA) );`
			`cudaMalloc( (void **)(&dB), sizeof(hB) );`
			`cudaMalloc( (void **)(&dC), sizeof(hC) );`
			`CudaCheckError( );`

			`// copy host memory to the device:`

			`cudaMemcpy( dA, hA, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );`
			`cudaMemcpy( dB, hB, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );`
			`CudaCheckError( );`

			`// setup the execution parameters:`

			`dim3 grid( DATASET_SIZE / THREADS_PER_BLOCK, 1, 1 );`
			`dim3 threads( THREADS_PER_BLOCK, 1, 1 );`

			`// create and start the timer:`

			`cudaDeviceSynchronize( );`

			`// allocate the events that we'll use for timing:`

			`cudaEvent_t start, stop;`
			`cudaEventCreate( &start );`
			`cudaEventCreate( &stop );`
			`CudaCheckError( );`

			`// record the start event:`

			`cudaEventRecord( start, NULL );`
			`CudaCheckError( );`

			`// execute the kernel:`

			`ArrayMul<<< grid, threads >>>( dA, dB, dC );`

			`// record the stop event:`

			`cudaEventRecord( stop, NULL );`
			`CudaCheckError( );`

			`// wait for the stop event to complete:`

			`cudaEventSynchronize( stop );`
			`CudaCheckError( );`

			`float msecTotal;`
			`cudaEventElapsedTime( &msecTotal, start, stop );`
			`CudaCheckError( );`

			`// compute and print the performance`

			`double secondsTotal = 0.001 * (double)msecTotal;`
			`double multsPerSecond = (double)DATASET_SIZE / secondsTotal;`
			`double megaMultsPerSecond = multsPerSecond / 1000000.;`
			`fprintf( stderr, "%12d, %4d, %10.2lf\n", DATASET_SIZE, THREADS_PER_BLOCK, megaMultsPerSecond );`

			`// copy result from the device to the host:`

			`cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );`
			`CudaCheckError( );`

			`// clean up:`

			`cudaFree( dA );`
			`cudaFree( dB );`
			`cudaFree( dC );`
			`CudaCheckError( );`

			`return 0;`
			`}`