144 lines
3.1 KiB
Plaintext
144 lines
3.1 KiB
Plaintext
|
// Array multiplication: C = A * B:
|
||
|
|
||
|
// System includes
|
||
|
#include <stdio.h>
|
||
|
#include <assert.h>
|
||
|
#include <malloc.h>
|
||
|
#include <math.h>
|
||
|
#include <stdlib.h>
|
||
|
|
||
|
// CUDA runtime
|
||
|
#include <cuda_runtime.h>
|
||
|
|
||
|
// Helper functions and utilities to work with CUDA
|
||
|
#include "helper_functions.h"
|
||
|
#include "helper_cuda.h"
|
||
|
|
||
|
|
||
|
#ifndef THREADS_PER_BLOCK
|
||
|
#define THREADS_PER_BLOCK 128 // number of threads in each block
|
||
|
#endif
|
||
|
|
||
|
#ifndef DATASET_SIZE
|
||
|
#define DATASET_SIZE ( 8*1024*1024 ) // size of the array
|
||
|
#endif
|
||
|
|
||
|
float hA[ DATASET_SIZE ];
|
||
|
float hB[ DATASET_SIZE ];
|
||
|
float hC[ DATASET_SIZE ];
|
||
|
|
||
|
#ifndef TOL
|
||
|
#define TOL 0.00001f // tolerance to relative error
|
||
|
#endif
|
||
|
|
||
|
void
|
||
|
CudaCheckError( )
|
||
|
{
|
||
|
cudaError_t e = cudaGetLastError( );
|
||
|
if( e != cudaSuccess )
|
||
|
{
|
||
|
fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// array multiplication on the device: C = A * B
|
||
|
|
||
|
__global__ void ArrayMul( float *A, float *B, float *C )
|
||
|
{
|
||
|
int gid = blockIdx.x*blockDim.x + threadIdx.x;
|
||
|
if( gid < DATASET_SIZE )
|
||
|
C[gid] = A[gid] * B[gid];
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
// main program:
|
||
|
|
||
|
int
|
||
|
main( int argc, char* argv[ ] )
|
||
|
{
|
||
|
//int dev = findCudaDevice(argc, (const char **)argv);
|
||
|
|
||
|
// fill host memory:
|
||
|
|
||
|
for( int i = 0; i < DATASET_SIZE; i++ )
|
||
|
{
|
||
|
hA[i] = hB[i] = sqrtf( (float)i );
|
||
|
}
|
||
|
|
||
|
// allocate device memory:
|
||
|
|
||
|
float *dA, *dB, *dC;
|
||
|
|
||
|
cudaMalloc( (void **)(&dA), sizeof(hA) );
|
||
|
cudaMalloc( (void **)(&dB), sizeof(hB) );
|
||
|
cudaMalloc( (void **)(&dC), sizeof(hC) );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// copy host memory to the device:
|
||
|
|
||
|
cudaMemcpy( dA, hA, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
|
||
|
cudaMemcpy( dB, hB, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// setup the execution parameters:
|
||
|
|
||
|
dim3 grid( DATASET_SIZE / THREADS_PER_BLOCK, 1, 1 );
|
||
|
dim3 threads( THREADS_PER_BLOCK, 1, 1 );
|
||
|
|
||
|
// create and start the timer:
|
||
|
|
||
|
cudaDeviceSynchronize( );
|
||
|
|
||
|
// allocate the events that we'll use for timing:
|
||
|
|
||
|
cudaEvent_t start, stop;
|
||
|
cudaEventCreate( &start );
|
||
|
cudaEventCreate( &stop );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// record the start event:
|
||
|
|
||
|
cudaEventRecord( start, NULL );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// execute the kernel:
|
||
|
|
||
|
ArrayMul<<< grid, threads >>>( dA, dB, dC );
|
||
|
|
||
|
// record the stop event:
|
||
|
|
||
|
cudaEventRecord( stop, NULL );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// wait for the stop event to complete:
|
||
|
|
||
|
cudaEventSynchronize( stop );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
float msecTotal;
|
||
|
cudaEventElapsedTime( &msecTotal, start, stop );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// compute and print the performance
|
||
|
|
||
|
double secondsTotal = 0.001 * (double)msecTotal;
|
||
|
double multsPerSecond = (double)DATASET_SIZE / secondsTotal;
|
||
|
double megaMultsPerSecond = multsPerSecond / 1000000.;
|
||
|
fprintf( stderr, "%12d, %4d, %10.2lf\n", DATASET_SIZE, THREADS_PER_BLOCK, megaMultsPerSecond );
|
||
|
|
||
|
// copy result from the device to the host:
|
||
|
|
||
|
cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
// clean up:
|
||
|
|
||
|
cudaFree( dA );
|
||
|
cudaFree( dB );
|
||
|
cudaFree( dC );
|
||
|
CudaCheckError( );
|
||
|
|
||
|
return 0;
|
||
|
}
|