## CUDA Parallel Merge Sort

This work was part of a final project for Programming Languages. The CUDA API was investigated and used to write a parallel merge sort algorithm that executes on the Graphics Processing Unit (GPU), through a technique called GPGPU. GPGPU stands for General-Purpose computing on a GPU; it allows the GPU to be used as a co-processor for computationally intense tasks. To learn more about this project, please take a look at the PDFs and source code below.

[...] CUDA Parallel Merge Sort [...]

Only one block in the grid? isn’t that pretty inefficient? Should have at least 64 blocks if not 100 and at least 256 threads per block otherwise its kinda pointless and could probably run faster on the cpu.

Good observation Jake. This was done as an introduction to programming in CUDA. I am sure there is a lot of optimization that can be done.

Roger.

Cool. this code would be useful to me. maybe I’ll optimize it and send it back to you later

Sounds great! If you are able to optimize I would be willing to post the code here.

Hi, your link of the code is lost. Can you have a check so that we can study your code. Thanks.

It appears that my web server does not like to serve up .cu files. I put the kernel and main program in a zip file. The link should work now. Thanks!

Hi, probably I don’t understand much the code, but it doesn’t look like a parallel code, but more like the iterative one running on gpu.

If you edit the main loop in this way:

if(tid==0)

{

k=1;

while(k<NUM)

{

i = 0;

while(i+k < NUM)

{

u = i + k*2;

if (u > NUM)

{

u = NUM+1;

}

Merge(shared, results, i, i+k, u);

i=i + k * 2;

}

k=k*2;

}

}

__syncthreads();

}

the result will be still correct and the performance will be improved.

If you look at the reports the methodology is explained. We did performance tests and the parallel version ran much faster than the iterative version (not sure I have those numbers though).

Wow! It was almost two years ago! I've completed that task; it turned out that some algorithms just don't belong to the parallel-processing world! However, I changed it a bit and then parallelized it (or whatever the word is!), and here is the code:

(i used bcc5.5 with cuda sdk 3)

cls

nvcc -run --use_fast_math -arch=compute_11 mergec.cu --output-file fileName.exe

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

//———————————— Kernel CODE ————————————–

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

#ifndef _MERGEK_CU_

#define _MERGEK_CU_

//FIX: the published listing had typographic (curly) quotes here, which do not compile.
#include "device_functions.h"

//Apparently unused in the visible code; likely a leftover tuning constant.
#define BLIND_UNROLL_COUNT 255

//The kernels pack several small per-thread fields into two 32-bit locals
//("Shared_Ptrs" and "composite") and read/write the pieces through the
//aliasing macros below: T and M are 16-bit shared-memory indices,
//F, SIDE, J1 and J2 are single bytes ("composite = 0" clears all four at once).
//NOTE(review): this punning assumes a little-endian byte order and relies on
//the compiler tolerating the aliasing; it is compact but fragile.
#define T (*((unsigned short*) (&Shared_Ptrs)))

#define M (*(((unsigned short*) (&Shared_Ptrs)+1)))

#define F (*((unsigned char*) (&composite)))

#define SIDE (*(((unsigned char*) (&composite)+1)))

#define J1 (*(((unsigned char*) (&composite)+2)))

#define J2 (*(((unsigned char*) (&composite)+3)))

//A: input array from host

//B: (shared memory) calculation area

//C: output array to host

// I didn't use binary search for the searching part, because the algorithm has been changed:

// the "binary search" part of it has been replaced by a "pairwise swap sort" (MergeSortK2Pairs)

// and an improved version of "sequential search" written specially for CUDA (MergeSortK2Seqns),

// because finding the exact location of each item is easier once the "pairwise swap sort" has already been applied.

// The algorithm could still be improved by adding a CUDA-friendly search method that mixes

// binary search (to be faster) with sequential search (to preserve the coalescing of reads and writes).

//Output of this function will be the full array, but sorted only within independent 512-element subarrays.

//There must be an optimal value for this size, but I just used 512.

//This function has almost no conditional branches: the "for" loops are fully unrolled, and most

//of the "if"s don't count either, because each warp runs through only one of them.

//After that, the second and the third kernels run one after another in a loop

//until all the array elements are sorted.

//---------------------------------------------------------------------------
//NOTE(review): the published listing of this kernel was mangled by the blog
//engine (every "<...>" span was stripped as a fake HTML tag), which destroyed
//the L=1..4 passes and several comparisons. The passes below are reconstructed
//from the intact L=8..256 passes, which all follow the same template; confirm
//the exact comparison directions against the original source.
//
//One merge pass over shared array B for runs of length Lv (a power of two).
//Every thread owns the element pair (T, T+Lv): T is the thread's slot in a
//left run, M = T+Lv the matching slot in the neighbouring right run.
//  1. cross-swap the pair so B[T] <= B[M];
//  2. linearly scan for insertion offsets J1 (for B[T], scanning down from M)
//     and J2 (for B[M], scanning up from T);
//  3. write both values to their final positions for this pass.
//T, M, F, J1 and J2 are the aliasing macros defined above; "composite = 0"
//clears F, SIDE, J1 and J2 in one store.
//The early "break" (the original added it only for Lv >= 64, noting it cut
//the iteration count from 255 to 219) never changes the result — once J1 and
//J2 are both non-zero, later iterations are no-ops — it only saves time.
#define MSORT_PASS(Lv)                                                        \
    composite = 0;                                                            \
    T = (threadIdx.x & -(Lv)) + threadIdx.x;                                  \
    M = T + (Lv);                                                             \
    F = T & ((Lv) - 1);                                                       \
    if (B[T] > B[M])                                                          \
    {                                                                         \
        val1 = B[T];                                                          \
        B[T] = B[M];                                                          \
        B[M] = val1;                                                          \
    }                                                                         \
    __syncthreads();                                                          \
    val1 = B[T];                                                              \
    val2 = B[M];                                                              \
    for (int j = 1; j < (Lv); j++)                                            \
    {                                                                         \
        if (J1 == 0 && j <= F && B[M - j] <= B[T]) J1 = j;                    \
        if (J2 == 0 && j <= ((Lv) - 1) - F && B[T + j] >= B[M]) J2 = j;       \
        if (J1 != 0 && J2 != 0) break;                                        \
    }                                                                         \
    __syncthreads();                                                          \
    if (J1 > 0) B[T + F - J1 + 1] = val1;                                     \
    if (J2 > 0) B[T + F + J2] = val2;                                         \
    __syncthreads();

//Kernel 1: each block sorts one 2*blockSize-element segment of A entirely in
//shared memory using log2(2*blockSize) merge passes (9 passes for the 512-int
//segments used here). The pass list is hard-coded for blockSize == 256.
//Launch requirements: blockDim.x == blockSize and dynamic shared memory of
//2*blockSize*sizeof(int) bytes per block; one block per segment.
//A: device array, sorted in place segment by segment.
__global__ static void MergeSortK1(int* A, unsigned int blockSize)
{
    extern __shared__ int B[];  //calculation area: 2*blockSize ints (dynamic)

    unsigned int Shared_Ptrs;   //backing store for the T and M halfword macros
    unsigned int composite = 0; //backing store for the F, SIDE, J1, J2 byte macros
    int val1, val2;             //temporaries (also reused as the global index below)

    //Fill B from A. The two halves are loaded exactly blockSize apart so that
    //each of the two loads is coalesced.
    val1 = (blockIdx.x + blockIdx.y * gridDim.x) * (blockSize << 1) + threadIdx.x;
    B[threadIdx.x] = A[val1];
    __syncthreads();
    B[threadIdx.x + blockSize] = A[val1 + blockSize];
    __syncthreads();

    //Merge passes: the sorted-run length doubles each time until the whole
    //512-element segment is sorted. For Lv == 1 the scan loop is empty and
    //the pass degenerates to a pairwise swap of (2*tid, 2*tid+1).
    MSORT_PASS(1)
    MSORT_PASS(2)
    MSORT_PASS(4)
    MSORT_PASS(8)
    MSORT_PASS(16)
    MSORT_PASS(32)
    MSORT_PASS(64)
    MSORT_PASS(128)
    MSORT_PASS(256)

    //Update A from B (same coalesced two-half layout as the load).
    val1 = (blockIdx.x + blockIdx.y * gridDim.x) * (blockSize << 1) + threadIdx.x;
    A[val1] = B[threadIdx.x];
    __syncthreads();
    A[val1 + blockSize] = B[threadIdx.x + blockSize];
    __syncthreads();
}

#undef MSORT_PASS

//A and C cannot be a single array because due to the parallel limit of 3584 (256 here) threads,

//original values of A is needed for next 256 threads (512 elements) and sortings should not

//be made directly to A, and therefore A is treated as read-only until all threads have completed one level

//this kernel performs one level of pairwise Sort (or you should name it Dual Swap sort)

__global__ static void MergeSortK2Pairs(int* A, unsigned int blockSize, unsigned int L)
{
    //One thread per PAIR of elements that sit L apart: thread gid handles the
    //indices (left, left+L). Consecutive gids map to consecutive offsets
    //inside a run, so this kernel needs only half as many threads as there
    //are elements.
    const unsigned int gid = threadIdx.x + blockSize * (blockIdx.x + blockIdx.y * gridDim.x);
    //Double the run-aligned part of gid and keep the in-run offset, which
    //skips over the right-hand run (L is a power of two, so gid & -L clears
    //the in-run bits).
    const unsigned int left  = ((gid & (-L)) << 1) + (gid & (L - 1));
    const unsigned int right = left + L;

    const int lo = A[left];
    const int hi = A[right];
    if (hi < lo)
    {
        //Out of order: write the pair back swapped.
        A[left]  = hi;
        A[right] = lo;
    }

    //This alternative is much slower:
    //atomicMin(&A[left], atomicMax(&A[right], A[left]));
}

//this kernel performs one level of Sequential Sort (Loop search through the memory) one sided

// Merge kernel: each thread takes one element of A ("cur") and computes its
// final position in the merge of its own length-L sorted subarray with the
// neighbouring one, then writes the element into output array C. A is treated
// as read-only here; C receives the merged result.
//
// NOTE(review): the published listing was mangled by the blog engine, which
// stripped every "<...>" span as a fake HTML tag. Several comparisons and
// loop headers below are therefore incomplete and do not parse. The surviving
// tokens are kept byte-for-byte; each damaged line is flagged so it can be
// restored from the original source before compiling.
__global__ static void MergeSortK2Seqns(int* A, int* C, unsigned int blockSize, unsigned int L)
{
// Flat global thread index over the 2-D grid; one thread per array element.
unsigned int gid = (threadIdx.x+blockSize*(blockIdx.x+blockIdx.y*gridDim.x));
int cur = A[gid];//current value
unsigned int t = gid & (L-1);//index in current subarray
unsigned int s;//start of other subarray
unsigned int h;//start of block (for left side) or end of block (for right side) which have to be searched
unsigned int f;//number of blocks to subarray's starting (ending) point from current block starting (ending) point
unsigned int j;//simple counter
int J;//correct offset relative to gid
// Left-side element: its pair subarray lies ahead of it in memory.
if ((gid&L)==0)
{
s = (gid+L)&(-L);
h = (gid+L)&(-blockSize);
f = (h-s)/blockSize;
//sort
// NOTE(review): garbled — this was originally an "if (A[h] ...)" test fused
// with a "for (...; j >= h; j--)" loop header; the middle was stripped.
if (A[h]=h; j–)
{
//search current block from t to start
// NOTE(review): garbled — the comparison operator between A[j] and cur was stripped.
if (A[j]cur)
{
//target is less than every element in the other side
J = -1;
}
else
{
//target is in range (s..h)
for (j=0; j<f; j++)
{
//quickly search previous blocks in the subarray
h-=blockSize;
if (A[h]<=cur) break;
}
//target is in the block with starting point 'h'
J = h+blockSize-1;
for (j=0; j<blockSize; j++)
{
// NOTE(review): garbled comparison between A[J] and cur.
if(A[J]=cur)
{
//target is in current block
J = L-t;
// NOTE(review): garbled loop header/body; the decrement below also lost a
// minus sign to typographic mangling (should be "J--").
for (j=gid-L; j=cur) break;
J–;
}
}
else
{
//don't search current block, instead search one other block entirely
if (A[s]<cur)
{
//target is more than every element in the other side
J = 0;
}
else
{
//target is in range (h..s)
// NOTE(review): garbled loop header — the condition and an inner "if" were stripped.
for (j=0; j=cur) break;
}
//target is in the block with ending point 'h'
J = h-blockSize+1;
// NOTE(review): garbled loop header — the condition and an inner "if" were stripped.
for (j=0; j=cur) break;
J++;
}
J = s-J+1;
}
}
//place the cur value in the correct index of C
C[gid-J] = cur;
}
// NOTE(review): this barrier is the last statement of the kernel; nothing
// follows it, so it has no visible effect here.
__syncthreads();
}

#endif

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

//———————————— CPP CODE —————————————–

//———————————————————————————————-

//———————————————————————————————-

//———————————————————————————————-

//Host-side program.
//NOTE(review): the original angle-bracket include targets were stripped by
//the blog engine; the headers below are reconstructed from the calls used in
//this file (printf, rand, clock, log2, getchar, cutilSafeCall).
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "mergeK.cu"
#include <cutil_inline.h>

//Forward declarations
void SetDimensions();
void DeclareDimensions();
void StartKernel();
bool VerifyResults();
void MakeArraysReady();
void InitializeProg();
void FinalizeProg(bool passed);
int main ();

//CUDA-enabled device which is in use (filled by cudaGetDeviceProperties)
cudaDeviceProp prop;

//Grid and block sizes for the three kernels' execution configurations.
//halfGrid appears to cover one thread per PAIR of elements (grid.x is
//halfGrid.x*2) — confirm against the garbled launch lines in StartKernel.
dim3 halfGrid, grid, block;

//Number of elements to sort (set in SetDimensions as block.x*grid.x*grid.y)
unsigned int NUM;

//Arrays
int *host_input_arr;   //host-side random input

int *dev_arr1;         //device buffer; the merge rounds alternate arr1/arr2

int *dev_arr2;         //second device buffer of the alternating pair

int *host_output_arr;  //host-side buffer that receives the sorted result

//Reporting
clock_t start, end;    //timing markers taken around the sort in StartKernel

cudaError_t err;       //status of the last recorded CUDA API call

void SetDimensions()
{
    //Launch geometry shared by all three kernels' execution configurations.
    //block: 256 threads, 1-D. halfGrid covers half as many threads as there
    //are elements; grid covers one thread per element.
    block = dim3(256, 1, 1);
    halfGrid = dim3(16384, 4, 1);//lower halfGrid.x if the GPU terminated the program
    grid = dim3(halfGrid.x * 2, halfGrid.y, 1);

    //Total number of elements to sort.
    NUM = block.x * grid.x * grid.y;
}

void MakeArraysReady()
{
    //Input data is loaded into host_input_arr, then copied to dev_arr1.
    //After kernel execution the sorted result is copied back to host_output_arr.
    host_input_arr = new int[NUM];
    host_output_arr = new int[NUM];

    //Fill host_input_arr with random data in [0, 9].
    //NOTE(review): rand() is never seeded, so every run sorts the same data —
    //fine for benchmarking; call srand() for varied inputs. Also note the
    //banner elsewhere claims "Random 1..10" while the real range is 0..9.
    //FIX: loop index made unsigned to match NUM (was int vs unsigned compare).
    for (unsigned int i = 0; i < NUM; i++)
    {
        host_input_arr[i] = rand() % 10;
    }

    //Allocate both device buffers (global memory); the sort alternates
    //between them, so both must hold the full array.
    cutilSafeCall(cudaMalloc((void**)&dev_arr1, sizeof(int)*NUM));
    cutilSafeCall(cudaMalloc((void**)&dev_arr2, sizeof(int)*NUM));

    //Copy the input to the first device buffer.
    cutilSafeCall(cudaMemcpy(dev_arr1, host_input_arr, sizeof(int)*NUM, cudaMemcpyHostToDevice));
}

void StartKernel()
{
    //Runs the full sort: kernel 1 pre-sorts 512-element segments in shared
    //memory, then the Pairs/Seqns kernel pair merges doubling subarrays,
    //alternating between dev_arr1 and dev_arr2, until the array is sorted.
    //NOTE(review): every "<<<...>>>" execution configuration was stripped
    //from the published listing; the configurations below are reconstructed
    //from SetDimensions and the kernels' indexing — confirm against the
    //original source.
    printf("Working...\n");
    start = clock();

    //Kernel 1 sorts independent 512(=2*block.x)-element subarrays: one block
    //per subarray (halfGrid blocks), 2*block.x ints of shared memory each.
    MergeSortK1<<<halfGrid, block, (block.x << 1) * sizeof(int)>>>(dev_arr1, block.x);
    cudaThreadSynchronize();//wait until the kernel is done

    unsigned int SubArrayHalfLength = block.x << 1;//current sorted-run length
    int counter = 0;                               //merge-round counter
    while (SubArrayHalfLength < NUM)
    {
        //Alternate source/destination buffers each round (Seqns must not
        //write into the array it is still reading).
        int* src = (counter % 2 == 0) ? dev_arr1 : dev_arr2;
        int* dst = (counter % 2 == 0) ? dev_arr2 : dev_arr1;

        //Swap-sort element pairs that sit SubArrayHalfLength apart
        //(one thread per pair -> halfGrid; no shared memory needed).
        MergeSortK2Pairs<<<halfGrid, block>>>(src, block.x, SubArrayHalfLength);
        cudaThreadSynchronize();//wait until the kernel is done

        //Each thread reads one element and finds its final position
        //(one thread per element -> grid).
        MergeSortK2Seqns<<<grid, block>>>(src, dst, block.x, SubArrayHalfLength);
        cudaThreadSynchronize();//wait until the kernel is done

        SubArrayHalfLength <<= 1;
        counter++;
    }
    end = clock();

    //Copy the sorted data back to the host. After an odd number of rounds the
    //result sits in dev_arr2; after an even number (including zero) it is in
    //dev_arr1. NOTE(review): the copy-back was lost from the published
    //listing; reconstructed here so VerifyResults has data to check.
    int* final_dev = (counter % 2 == 1) ? dev_arr2 : dev_arr1;
    cutilSafeCall(cudaMemcpy(host_output_arr, final_dev, sizeof(int) * NUM, cudaMemcpyDeviceToHost));
}

void InitializeProg()
{
    //Select the first CUDA device and print the run configuration.
    //NOTE(review): the opening lines of this function were lost when the
    //listing was published; only the device-count check onward survived.
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices > 1)
    {
        printf("\n%i CUDA-Enabled devices found.\nFirst device is used.\n", num_devices);
    }

    //Set active device to dev#0 and read its features and properties.
    cudaSetDevice(0);
    err = cudaGetDeviceProperties(&prop, 0);

    //Show the run parameters on the screen.
    printf("-----------------------------------------------------------------\n");
    printf("> GPU Basic Info GPU Memory Sizes Hierarchy Sizes Input Data <\n");
    printf("-----------------------------------------------------------------\n");
    printf("Number of input data: %i\n", NUM);
    printf("Input data type: int\n");
    printf("Input data pattern: Random 1..10\n");
    printf("Grid Size: %i\n", grid.x);
    printf("Block Size: %i x %i\n", block.x, block.y);
    printf("-----------------------------------------------------------------\n");
}

bool VerifyResults()
{
    //The array is sorted iff every element is >= its predecessor.
    //(Reconstruction of the garbled "for (i=0; i<NUM; i++) if (i>0) ..."
    //loop, started at i=1 so the redundant guard is unnecessary.)
    for (unsigned int i = 1; i < NUM; i++)
    {
        if (host_output_arr[i] < host_output_arr[i - 1])
        {
            return false;
        }
    }
    return true;
}

void FinalizeProg(bool passed)
{
    //Print the verification outcome plus run statistics, wait for the user,
    //then exit with a status that reflects pass/fail.
    printf("\n-----------------------------------------------------------------\n");
    printf("%s\n", passed ? "Pass!" : "Fail!");
    printf("NUM:\t\t2^%i =\t%i\nGridSize:\t2^%i x\t2^%i\n",
           (int)log2((double)NUM), NUM, (int)log2((double)grid.x), (int)log2((double)grid.y));
    //FIX: start/end are clock_t ticks, not milliseconds — convert via
    //CLOCKS_PER_SEC before labelling the value "ms".
    printf("Time:\t\t%ld ms\n", (long)((end - start) * 1000 / CLOCKS_PER_SEC));
    printf("-----------------------------------------------------------------\n");
    printf("Press ENTER to exit...\n");
    fflush(stdout);
    fflush(stderr);
    getchar();
    //FIX: previously always exited EXIT_SUCCESS, even when verification failed.
    exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}

Note: wherever the text above turns bold, the blog's HTML formatting swallowed an invisible "less than" (<) sign — which is why several comparisons and loop headers in the listing look incomplete.