Hillis Steele Scan (Parallel Prefix Scan Algorithm)

In this article, a scanning algorithm known as the Hillis-Steele Scan, also known as Parallel Prefix Scan Algorithm, is discussed. A scan operation in this context essentially means the calculation of prefix sums of an array. The Hillis-Steele scan is an algorithm for a scan operation that runs in a parallel fashion. Below is the approach of the algorithm for an array, x[] of size N:

Iterate in the range [1, log₂(N)] using the variable d and for all k in parallel, check if the value of k is at least 2^d or not. If found to be true, then add the value of x[k – 2^{d – 1}] to the value x[k].

Visual Representation:

When depth d reaches log₂N the calculation terminates and the result is calculated as the prefix sum of the array. All the individual additional operations run in parallel and each layer (d = 1, d = 2, …, ) proceeds linearly.

Below is the implementation of the algorithm in CUDA C++:

C++

// C++ program for the above approach
 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cstdio>
#include <ctime>
#include <iostream>

using namespace std::chrono;

using namespace std;
 
// Function to handle error

static void HandleError(cudaError_t err,

                        const char* file,

                        int line)
{

    // If the error occurs then print

    // that error

    if (err != cudaSuccess) {

        printf("\n%s in %s at line %d\n",

               cudaGetErrorString(err),

               file, line);
 
        // Exit

        exit(EXIT_FAILURE);

    }
}
 
#define HANDLE_ERROR(err) (
HandleError(err, __FILE__, __LINE__))
 
template <typename T>

__global__ void
Hillis_Steele_Scan_Kernel(T* arr,

 __int64 space,

                          __int64 step,

 __int64 steps)
{

    __int64 x = threadIdx.x

                + blockDim.x * blockIdx.x;
 
    __int64 y = threadIdx.y

                + blockDim.y * blockIdx.y;
 
    // 2D Kernel Launch parameters

    __int64 tid = x + (y * gridDim.x

                       * blockDim.x);
 
    // Kernel runs in the parallel

    // TID is the unique thread ID

    if (tid >= space)

        arr[tid] += arr[tid - space];
}
 
template <typename T>

T* Hillis_Steele_Scan(T* input, __int64 N)
{

    __int64* out;

    HANDLE_ERROR(

        cudaMallocManaged(&out,

                          (sizeof(__int64) * N)));
 
    // 2D Kernel Launch Parameters

    dim3 THREADS(1024, 1, 1);

    dim3 BLOCKS;

    if (N >= 65536)

        BLOCKS = dim3(64, N / 65536, 1);

    else if (N <= 1024)

        BLOCKS = dim3(1, 1, 1);

    else

        BLOCKS = dim3(N / 1024, 1, 1);
 
    __int64 space = 1;
 
    // Begin with a stride of 2^0

    __int64 steps = __int64(log2(float(N)));
 
    // Log2N depth dependency of scan

    HANDLE_ERROR(cudaMemcpy(

        out, input, sizeof(__int64) * N,

        cudaMemcpyDeviceToDevice));
 
    // Copy Input Array to Output Array

    for (size_t step = 0;

         step < steps; step++) {

        Hillis_Steele_Scan_Kernel<<<BLOCKS, THREADS> > >(

            out, space, step, steps);
 
        // Calls the parallel operation

        space *= 2;
 
        // A[i] += A[i - stride]

        // log N times where N

        // is array size

    }
 
    cudaDeviceSynchronize();
 
    return out;
}
 
// Driver Code

int main()
{

    __int64* inputArr;

    __int64 arraysize = 10;
 
    // Size of the input array

    __int64 N = __int64(1)

                << (__int64(log2(float(arraysize))) + 1);
 
    // N is the nearest power of 2

    // to the array size

    cout << "\n\nELEMS --> 2^" << N

         << " >= " << arraysize;
 
    // Allocate memory on the GPU

    HANDLE_ERROR(cudaMallocManaged(&inputArr,

                                   (sizeof(__int64) * N)));
 
    HANDLE_ERROR(cudaDeviceSynchronize());
 
    // INIT Test Data

    for (__int64 i = 0; i < N; i++) {

        inputArr[i] = 1;

    }
 
    // An array with only 1s was chosen

    // as test data so the result is

    // 1, 2, 3, 4, ..., N

    high_resolution_clock::time_point tg1

        = high_resolution_clock::now();
 
    __int64* out = Hillis_Steele_Scan(

        inputArr, N);
 
    // Function Call

    high_resolution_clock::time_point tg2

        = high_resolution_clock::now();
 
    duration<double> time_span

        = duration_cast<duration<double> >(tg2 - tg1);
 
    cout << "\nTime Taken : "

         << time_span.count() * 1000

         << " ms";
 
    cout << endl;

    for (__int64 i = 0; i < arraysize; i++)

        std::cout << '\t' << out[i];

    std::cout << std::endl;
 
    cudaFree(out);
 
    // Free allocated memory from GPU

    cudaFree(inputArr);
 
    return 0;
}

Complexity Analysis: O(log N) time and O(N) processors

Article Tags :

C++

C++ Programs

Misc