// Monte Carlo simulation of a snowball fight:

// system includes
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include <math.h>
#include <stdlib.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"

// setting the number of trials in the monte carlo simulation:
#ifndef NUMTRIALS
#define NUMTRIALS	1024
#endif

#ifndef BLOCKSIZE
#define BLOCKSIZE	8     // number of threads per block
#endif

#define NUMBLOCKS	( NUMTRIALS / BLOCKSIZE )



// ranges for the random numbers:
//#define PROJECT1

#ifdef PROJECT1
const float TXMIN =	-10.0;	// truck starting location in feet
const float TXMAX =	 10.0;	// truck starting location in feet
const float TYMIN  =	 45.0;	// depth distance to truck in feet
const float TYMAX  =	 55.0;	// depth distance to truck in feet
const float TXVMIN =	 10.0;	// truck x velocity in feet/sec
const float TXVMAX =	 30.0;	// truck x velocity in feet/sec
const float SVMIN  =	 10.0;	// snowball velocity in feet/sec
const float SVMAX  =	 30.0;	// snowball velocity in feet/sec
const float STHMIN = 	 10.0;	// snowball launch angle in degrees
const float STHMAX =	 90.0;	// snowball launch angle in degrees
const float HALFLENMIN = 20.;	// half length of the truck in feet
const float HALFLENMAX = 20.;	// half length of the truck in feet
#else
const float TXMIN =	-10.0;	// truck starting location in feet
const float TXMAX =	 10.0;	// truck starting location in feet
const float TXVMIN =	 15.0;	// truck x velocity in feet/sec
const float TXVMAX =	 35.0;	// truck x velocity in feet/sec
const float TYMIN  =	 40.0;	// depth distance to truck in feet
const float TYMAX  =	 50.0;	// depth distance to truck in feet
const float SVMIN  =	  5.0;	// snowball velocity in feet/sec
const float SVMAX  =	 30.0;	// snowball velocity in feet/sec
const float STHMIN = 	 10.0;	// snowball launch angle in degrees
const float STHMAX =	 70.0;	// snowball launch angle in degrees
const float HALFLENMIN = 15.;	// half length of the truck in feet
const float HALFLENMAX = 30.;	// half length of the truck in feet
#endif


// these are here just to be pretty labels, other than that, they do nothing:
#define IN
#define OUT



// function prototypes:
float		Ranf( float, float );
int		Ranf( int, int );
void		TimeOfDaySeed( );



void
CudaCheckError()
{
	cudaError_t e = cudaGetLastError();
	if(e != cudaSuccess)
	{
   		fprintf( stderr, "Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));
	}
}


// degrees-to-radians:
__device__
float
Radians( float d )
{
	return (M_PI/180.f) * d;
}


//Global means gpu and callable from cpu
__global__
void
MonteCarlo( IN float *dtxs, IN float *dtys, IN float *dtxvs, IN float *dsvs, IN float *dsths, IN float *dhalflens, OUT int *dhits )
{
	//Not sure about this, as it's differnt in the project video.
    //__shared__ int numHits[BLOCKSIZE];
   
    //unsigned int numItems = blockDim.x;
    //unsigned int wgNum    = blockIdx.x;
	//unsigned int tnum     = threadIdx.x;
	unsigned int gid      = blockIdx.x*blockDim.x + threadIdx.x;

	dhits[gid] = 0;

    //Get stuff based off the gid/thread.
	// randomize everything:
	float tx   = dtxs[gid];
    float ty = dtys[gid];
    float txv = dtxvs[gid];
    float sv = dsvs[gid];
    float sthd = dsths[gid];
    float sthr = Radians(sthd);
    float svx = sv * cos(sthr);
    float svy = sv * sin(sthr);

    //I don't see anythign of the half len so I put this here.
    //float halflen = dhalflens[gid];

   	// how long until the snowball reaches the y depth:
    float tstar = ty / svy;
    float truckx = tx + txv * tstar;
    float sbx = svx * tstar;

    //check snowball location.
    //Project video says HALF_LENGTH but I don't see that in the template file.
	if( fabs(truckx - sbx) < dhalflens[gid])
	{
	    dhits[gid] = 1;
	}
}


// main program:

int
main( int argc, char* argv[ ] )
{
	TimeOfDaySeed( );

	//int dev = findCudaDevice(argc, (const char **)argv);

	
	float *htxs  = new float [NUMTRIALS];
	float *htys  = new float [NUMTRIALS];
	float *htxvs = new float [NUMTRIALS];
	float *hsvs  = new float [NUMTRIALS];
	float *hsths = new float [NUMTRIALS];
	float *hhalflens = new float [NUMTRIALS];

	// fill the random-value arrays:
	for( int n = 0; n < NUMTRIALS; n++ )
	{
		htxs[n]  = Ranf(  TXMIN,  TXMAX );
		htys[n]  = Ranf(  TYMIN,  TYMAX );
 		htxvs[n] = Ranf(  TXVMIN, TXVMAX );
 		hsvs[n]  = Ranf(  SVMIN,  SVMAX );
 		hsths[n] = Ranf(  STHMIN, STHMAX );
		hhalflens[n] = Ranf( HALFLENMIN, HALFLENMAX );
	}

	int *hhits = new int [NUMTRIALS];

	// allocate device memory:

	float *dtxs, *dtys, *dtxvs, *dsvs, *dsths, *dhalflens;
	int   *dhits;


	//cudaError_t status;
	cudaMalloc( (void **)(&dtxs),   NUMTRIALS*sizeof(float) );
	CudaCheckError( );

    //Malloc the space for the devices ones.
	cudaMalloc((void **)(&dtys), NUMTRIALS*sizeof(float) );
    CudaCheckError( );

	cudaMalloc((void **)(&dtxvs), NUMTRIALS*sizeof(float) );
    CudaCheckError( );
	
    cudaMalloc((void **)(&dsvs), NUMTRIALS*sizeof(float) );
    CudaCheckError( );

	cudaMalloc((void **)(&dsths), NUMTRIALS*sizeof(float) );
    CudaCheckError( );

	cudaMalloc((void **)(&dhalflens), NUMTRIALS*sizeof(float) );
    CudaCheckError( );

	cudaMalloc((void **)(&dhits), NUMTRIALS*sizeof(int) );
    CudaCheckError( );



	// copy host memory to the device:

	cudaMemcpy( dtxs,  htxs,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );
    
	cudaMemcpy( dtys,  htys,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

	cudaMemcpy( dtxvs,  htxvs,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

	cudaMemcpy( dsvs,  hsvs,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

	cudaMemcpy( dsths,  hsths,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

	cudaMemcpy( dhalflens,  hhalflens,  NUMTRIALS*sizeof(float), cudaMemcpyHostToDevice );
	CudaCheckError( );

    //The project video doesn't show this being done.....but it's part of the function...
	cudaMemcpy( dhits, hhits, NUMTRIALS*sizeof(int), cudaMemcpyHostToDevice );
    CudaCheckError( );

    // setup the execution parameters:

	dim3 threads(BLOCKSIZE, 1, 1 );
	dim3 grid(NUMBLOCKS, 1, 1 );

	// create and start timer

	cudaDeviceSynchronize( );

	// allocate CUDA events that we'll use for timing:

	cudaEvent_t start, stop;
	cudaEventCreate( &start );
	CudaCheckError( );
	cudaEventCreate( &stop );
	CudaCheckError( );

	// record the start event:

	cudaEventRecord( start, NULL );
	CudaCheckError( );

	// execute the kernel:

	MonteCarlo<<< grid, threads >>>( dtxs, dtys, dtxvs, dsvs, dsths,  dhalflens, dhits );

	// record the stop event:

	cudaEventRecord( stop, NULL );
	CudaCheckError( );

	// wait for the stop event to complete:

	cudaEventSynchronize( stop );
	CudaCheckError( );

	float msecTotal = 0.0f;
	cudaEventElapsedTime( &msecTotal, start, stop );
	CudaCheckError( );

	// copy result from the device to the host:

	cudaMemcpy( hhits, dhits, NUMTRIALS *sizeof(int), cudaMemcpyDeviceToHost );
	CudaCheckError( );

	// compute the sum :

	int numHits = 0;
	//????
    for(int i = 0; i < NUMTRIALS; i++)
    {
        numHits += hhits[i];
        //fprintf(stderr, "hhits[%d] = %5d ; Total numHits = %5d\n", i, hhits[i], numHits);
    }

	float probability = 100.f * (float)numHits / (float)NUMTRIALS;

	// compute and printL

	double secondsTotal = 0.001 * (double)msecTotal;
	double trialsPerSecond = (float)NUMTRIALS / secondsTotal;
	double megaTrialsPerSecond = trialsPerSecond / 1000000.;
	fprintf( stderr, "Number of Trials = %10d, Blocksize = %8d, MegaTrials/Second = %10.4lf, Probability = %6.2f%%\n",
		NUMTRIALS, BLOCKSIZE, megaTrialsPerSecond, probability );

	// clean up memory:
	delete [ ] htxs;
	delete [ ] htys;
	delete [ ] htxvs;
	delete [ ] hsvs;
	delete [ ] hsths;
	delete [ ] hhits;

	cudaFree( dtxs );
	CudaCheckError( );
	cudaFree( dtys );
	CudaCheckError( );
	cudaFree( dtxvs );
	CudaCheckError( );
	cudaFree( dsvs );
	CudaCheckError( );
	cudaFree( dsths );
	CudaCheckError( );
	cudaFree( dhits );
	CudaCheckError( );


	return 0;
}

float
Ranf( float low, float high )
{
	float r = (float) rand();               // 0 - RAND_MAX
	float t = r  /  (float) RAND_MAX;       // 0. - 1.

	return   low  +  t * ( high - low );
}

int
Ranf( int ilow, int ihigh )
{
	float low = (float)ilow;
	float high = ceil( (float)ihigh );

	return (int) Ranf(low,high);
}

void
TimeOfDaySeed( )
{
	struct tm y2k = { 0 };
	y2k.tm_hour = 0;   y2k.tm_min = 0; y2k.tm_sec = 0;
	y2k.tm_year = 100; y2k.tm_mon = 0; y2k.tm_mday = 1;

	time_t  timer;
	time( &timer );
	double seconds = difftime( timer, mktime(&y2k) );
	unsigned int seed = (unsigned int)( 1000.*seconds );    // milliseconds
	srand( seed );
}