#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cstdio>
#include <ctime>
#include <iostream>
using
namespace
std::chrono;
using
namespace
std;
// Reports a failed CUDA runtime call and terminates the process.
//
// err  - status code returned by a CUDA runtime API call.
// file - source file of the call site (normally __FILE__).
// line - source line of the call site (normally __LINE__).
//
// Returns normally only when err is cudaSuccess; otherwise prints the
// runtime's description of the error together with the call site and exits
// with EXIT_FAILURE.
static void HandleError(cudaError_t err, const char* file, int line)
{
    // Fast path: the call succeeded, nothing to report.
    if (err == cudaSuccess) {
        return;
    }

    const char* description = cudaGetErrorString(err);
    printf("\n%s in %s at line %d\n", description, file, line);
    exit(EXIT_FAILURE);
}
// Wraps a CUDA runtime call so that a failure is reported with the file and
// line of the call site. The whole definition must stay on one logical line
// (or use backslash continuations): a preprocessor directive ends at the
// first unescaped newline.
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// One pass of a Hillis-Steele inclusive scan.
//
// For every i in [0, n):
//   dst[i] = src[i] + src[i - space]   when i >= space
//   dst[i] = src[i]                    otherwise
//
// Launch layout: 1D grid of 1D blocks, any size. The grid-stride loop covers
// all n elements regardless of how many threads were launched, and threads
// whose index is >= n do nothing.
// Shared memory: none.
// Preconditions: src and dst each hold at least n elements and must NOT
// overlap. Reading from one buffer and writing to the other is what keeps a
// pass free of read-after-write races between threads.
template <typename T>
__global__ void Hillis_Steele_Scan_Kernel(const T* __restrict__ src,
                                          T* __restrict__ dst,
                                          __int64 space,
                                          __int64 n)
{
    const __int64 stride = static_cast<__int64>(gridDim.x) * blockDim.x;
    for (__int64 i = static_cast<__int64>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        dst[i] = (i >= space) ? src[i] + src[i - space] : src[i];
    }
}

// Computes the inclusive prefix sum of `input` on the GPU using the
// Hillis-Steele algorithm.
//
// input - device-accessible pointer to N elements (it is the source of a
//         device-to-device cudaMemcpy); it is not modified.
// N     - number of elements; any positive value, not only powers of two.
//
// Returns a buffer of N elements allocated with cudaMallocManaged that holds
// the scan result; the caller releases it with cudaFree. Returns nullptr when
// N <= 0. Any CUDA failure is reported through HANDLE_ERROR, which terminates
// the process.
//
// The function blocks until the GPU has finished, so the result can be read
// on the host immediately after it returns.
template <typename T>
T* Hillis_Steele_Scan(T* input, __int64 N)
{
    if (N <= 0) {
        return nullptr;
    }

    const size_t bytes = sizeof(T) * static_cast<size_t>(N);

    // Two buffers used in ping-pong fashion: each pass reads the previous
    // pass's output and writes into the other buffer.
    T* src = nullptr;
    T* dst = nullptr;
    HANDLE_ERROR(cudaMallocManaged(&src, bytes));
    HANDLE_ERROR(cudaMallocManaged(&dst, bytes));
    HANDLE_ERROR(cudaMemcpy(src, input, bytes, cudaMemcpyDeviceToDevice));

    // Ceil-divide so a partial tail block is still launched; cap the grid and
    // let the kernel's grid-stride loop pick up whatever exceeds the cap.
    const unsigned int threadsPerBlock = 256;
    const __int64 maxBlocks = 65535;
    const __int64 wantedBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    const unsigned int blocks = static_cast<unsigned int>(
        wantedBlocks < maxBlocks ? wantedBlocks : maxBlocks);

    // ceil(log2(N)) passes: the offset doubles until it reaches N.
    for (__int64 space = 1; space < N; space *= 2) {
        Hillis_Steele_Scan_Kernel<<<blocks, threadsPerBlock>>>(src, dst, space, N);
        // A bad launch configuration is only visible through the last-error
        // state, not through a return value.
        HANDLE_ERROR(cudaGetLastError());

        // The buffer just written becomes the input of the next pass.
        T* justWritten = dst;
        dst = src;
        src = justWritten;
    }

    // Surfaces asynchronous execution errors and makes the managed result
    // safe to read from the host.
    HANDLE_ERROR(cudaDeviceSynchronize());

    // After the final swap `src` holds the result; `dst` is scratch.
    HANDLE_ERROR(cudaFree(dst));
    return src;
}
// Demo driver: fills a managed array with ones, runs the Hillis-Steele scan
// on it, prints the wall-clock time of the call and the first `arraysize`
// results.
int main()
{
    __int64* inputArr;
    const __int64 arraysize = 10;

    // Padded element count: the smallest power of two strictly greater than
    // arraysize. Computed with integer arithmetic so that large sizes are not
    // rounded by a float conversion.
    __int64 exponent = 0;
    while ((__int64(1) << exponent) <= arraysize) {
        ++exponent;
    }
    const __int64 N = __int64(1) << exponent;

    // Print the exponent after "2^" so the banner states the real size.
    cout << "\n\nELEMS --> 2^" << exponent << " >= " << arraysize;

    HANDLE_ERROR(cudaMallocManaged(&inputArr, (sizeof(__int64) * N)));
    HANDLE_ERROR(cudaDeviceSynchronize());

    // Managed memory is written directly from the host.
    for (__int64 i = 0; i < N; i++) {
        inputArr[i] = 1;
    }

    // The measured interval covers everything Hillis_Steele_Scan does,
    // including its allocations and copies.
    high_resolution_clock::time_point tg1 = high_resolution_clock::now();
    __int64* out = Hillis_Steele_Scan(inputArr, N);
    high_resolution_clock::time_point tg2 = high_resolution_clock::now();

    duration<double> time_span = duration_cast<duration<double> >(tg2 - tg1);
    cout << "\nTime Taken : " << time_span.count() * 1000 << " ms";
    cout << endl;

    // Only the first `arraysize` entries are meaningful to the user; the rest
    // is power-of-two padding.
    for (__int64 i = 0; i < arraysize; i++)
        std::cout << '\t' << out[i];
    std::cout << std::endl;

    HANDLE_ERROR(cudaFree(out));
    HANDLE_ERROR(cudaFree(inputArr));
    return 0;
}