Picking up from the previous post, we are now going to implement an algorithm that builds a KD-tree on the GPU.

First, let me shamelessly sing the praises of Kun Zhou and his colleagues for a moment: it is only thanks to the research of these heavyweights that rookies like us get a chance to play with parallel programming at all.

The method used in this post comes from this paper [1].


Overall idea:

The pseudocode in the paper explains the overall idea of the algorithm very clearly: the construction is divided into three parts. The first part partitions the large nodes, the second part subdivides those large nodes further, and the third part builds the KD-tree from the earlier partitioning and optimizes the storage layout. The first and second parts use different parallelization strategies.
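
Since the pseudocode figure itself is not reproduced here, the following is only my own rough sketch of the three stages (a summary of [1], not the paper's actual listing):

// Rough sketch of the overall build (my summary of [1]):
// BuildKDTree(triangles):
//   nodelist = { root };  smalllist = {}
//   // Stage 1: large node stage, parallelized over primitives / chunks
//   while nodelist still contains "large" nodes:
//     split each large node (spatial median / empty-space cutting)
//     move nodes that have become small into smalllist
//   // Stage 2: small node stage, parallelized over nodes
//   for each small node, in breadth-first order:
//     pick the best split (SAH) among the candidates and recurse until leaves are reached
//   // Stage 3: kd-tree output stage
//   compute node addresses with a scan and emit the tree in its final compact layout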

Looks fairly simple after reading it, right? Heh. Before we start reproducing the code, though, we first need to implement a few parallel primitives.


Primitives:

The implementations used here mainly follow [2].

1) Scan (exclusive)

Given a sequence $A=[a_0,a_1,\ldots,a_{n-1}]$, a left identity $\varepsilon$, and a binary operator $\oplus$, this primitive outputs the sequence $B=[\varepsilon,\ \varepsilon\oplus a_0,\ \varepsilon\oplus a_0\oplus a_1,\ \ldots,\ \varepsilon\oplus a_0\oplus a_1\oplus\cdots\oplus a_{n-2}]$. Note that each element of $B$ does not include the element of $A$ at the same position, which is what makes it exclusive.

2) Scan (inclusive)

Given the same sequence $A=[a_0,a_1,\ldots,a_{n-1}]$, left identity $\varepsilon$, and operator $\oplus$, this primitive outputs $B=[\varepsilon\oplus a_0,\ \varepsilon\oplus a_0\oplus a_1,\ \ldots,\ \varepsilon\oplus a_0\oplus a_1\oplus\cdots\oplus a_{n-1}]$. Here each element of $B$ does include the element of $A$ at the same position, hence inclusive.
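
A tiny example of my own, with $\oplus=+$ and $\varepsilon=0$, makes the difference concrete:

// input A             = [3, 1, 7, 0, 4, 1, 6, 3]
// exclusive scan of A = [0, 3, 4, 11, 11, 15, 16, 22]
// inclusive scan of A = [3, 4, 11, 11, 15, 16, 22, 25]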

3) SegmentScan

Given a sequence $A=[a_0,a_1,\ldots,a_{n-1}]$, a flag sequence $B=[\mathit{flag}_0,\mathit{flag}_1,\ldots,\mathit{flag}_{n-1}]$ marking where each segment starts, a left identity $\varepsilon$, and an operator $\oplus$, this primitive outputs a sequence $C$ in which the scan is applied to each segment independently.
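
Again a small example of my own, with $\oplus=+$, $\varepsilon=0$, and a flag of 1 marking the first element of each segment:

// input A                    = [3, 1, 7, 0, 4, 1, 6, 3]
// flags B                    = [1, 0, 0, 1, 0, 1, 0, 0]
// exclusive segmented scan C = [0, 3, 4, 0, 0, 0, 1, 7]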


Primitive implementation:

Referring to the usual up-sweep/down-sweep diagram (see [2]): we first traverse a conceptual tree bottom-up, applying the binary operator $\oplus$ along the way, and then go back down from the top, pushing the partial results to the positions that need them. For the concrete details, see [2]; the paper explains them very thoroughly.
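
To make the reduce / down-sweep structure of the kernels below easier to follow, here is a minimal single-threaded sketch of the same two passes (my own illustration; each thread block below does essentially this in shared memory):

// CPU sketch of the up-sweep / down-sweep exclusive scan; n must be a power of two.
template<typename T>
void BlellochScanCPU(T* a, int n, T prim, T(*fp)(T, T)) {
    // up-sweep (reduce): build partial results bottom-up
    for (int t = 1; t < n; t <<= 1)
        for (int i = 0; i < n; i += 2 * t)
            a[i + 2 * t - 1] = fp(a[i + t - 1], a[i + 2 * t - 1]);
    // down-sweep: seed the root with the identity and push partial results back down
    a[n - 1] = prim;
    for (int t = n >> 1; t >= 1; t >>= 1)
        for (int i = 0; i < n; i += 2 * t) {
            T tmp = a[i + t - 1];
            a[i + t - 1] = a[i + 2 * t - 1];
            a[i + 2 * t - 1] = fp(tmp, a[i + 2 * t - 1]);
        }
}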

Below is the implementation. Since what I actually use is mainly the segmented scan, and it can stand in for the plain scan, I got a bit lazy: I only tested the segmented scan and did not verify that the plain Scan implementation is correct...

Scan:

template<typename T>
__global__ void LScan_1_(const T* input, T* output, T* middle, const int size, const T prim, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d, t;
extern __shared__ T block_mem_value[];

// init shared memory
{
block_mem_value[threadIdx.x] = idx < size ? input[idx] : prim;
__syncthreads();
}

//reduce L1
for (d = 0, t = 1; t * 2 <= blockDim.x; d++)
{
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
block_mem_value[p + t] = fp(block_mem_value[p], block_mem_value[p + t]);
}
__syncthreads();
t <<= 1;
}

//write to L2
{
#ifndef WRITE_TYPE_0
if (threadIdx.x == 0)
#endif
middle[blockIdx.x] = block_mem_value[blockDim.x - 1];
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
}
}

template<typename T>
__global__ void LScan_2_(const T* input, T* output, const T prim, T(*fp)(T, T)) {
int idx = threadIdx.x;
int d, t;
extern __shared__ T block_mem_value[];

// init shared memory
{
block_mem_value[idx] = input[idx];
__syncthreads();
}

//reduce
for (d = 0, t = 1; t * 4 <= blockDim.x; d++)
{
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
block_mem_value[p + t] = fp(block_mem_value[p], block_mem_value[p + t]);
}
__syncthreads();
t <<= 1;
}

// set last to prim
{
#ifndef WRITE_TYPE_0
if (idx == 0)
#endif
block_mem_value[blockDim.x - 1] = prim;
__syncthreads();
}

//map
for (; d >= 0; d--) {
if (idx << (31 - d) == 0) {
int p = idx + t - 1;
T tmp = block_mem_value[p];
block_mem_value[p] = block_mem_value[p + t];
block_mem_value[p + t] = fp(tmp, block_mem_value[p + t]);
}
__syncthreads();
t >>= 1;
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
}
}

template<typename T>
__global__ void LScan_3_(const T* middle, T* output, int saved_d, const T prim, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d, t;
extern __shared__ T block_mem_value[];

// init shared memory
{
block_mem_value[threadIdx.x] = output[idx];
__syncthreads();
}

//write back to L1
{
#ifndef WRITE_TYPE_0
if (threadIdx.x == 0)
#endif
block_mem_value[blockDim.x - 1] = middle[blockIdx.x];
__syncthreads();
}

//map L1
{
for (d = saved_d, t = blockDim.x >> 1; d >= 0; d--) {
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
T tmp = block_mem_value[p];
block_mem_value[p] = block_mem_value[p + t];
block_mem_value[p + t] = fp(tmp, block_mem_value[p + t]);
}
__syncthreads();
t >>= 1;
}
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
}
}

template<typename T>
void LScan(BufferPool<T>& TPool, int size, CUDABuffer<T>& input, CUDABuffer<T>& output, T prim, T(*fp)(T, T), bool inclusive = false) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(pow_of_2, 128);
int2 disp2 = DispachNum(disp1.x, 128);

CUDABuffer<T>& middle1= TPool.Get(disp1.x);
CUDABuffer<T>& middle2= TPool.Get(disp2.x);

output.Resize(pow_of_2);

LScan_1_<T><<<disp1.x, disp1.y, disp1.y * sizeof(T)>>>(input, output, middle1, size, prim, fp);

LScan_1_<T><<<disp2.x, disp2.y, disp2.y * sizeof(T)>>>(middle1, middle1, middle2, disp1.x, prim, fp);

LScan_2_<T><<<1, disp2.x, disp2.x * sizeof(T)>>>(middle2, middle2, prim, fp);

LScan_3_<T><<<disp2.x, disp2.y, disp2.y * sizeof(T)>>>(middle2, middle1, MinNoneZero(disp2.y) - 1, prim, fp);

LScan_3_<T><<<disp1.x, disp1.y, disp1.y * sizeof(T)>>>(middle1, output, MinNoneZero(disp1.y) - 1, prim, fp);
TPool.Release(middle1);
TPool.Release(middle2);
}
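
One CUDA detail worth spelling out: a __global__ kernel cannot call a host function through the fp parameter, so the operator has to be a __device__ function whose address is first copied back to the host. The int_add used later in this post is not shown, so the following setup is only an assumed sketch:

// Device-side operator plus a device symbol holding its address (assumed helper, not from the post).
__device__ int int_add_dev(int a, int b) { return a + b; }
__device__ int (*int_add_dev_ptr)(int, int) = int_add_dev;

// Host-side: fetch a pointer that can be passed as the fp argument of LScan / LSScan.
int (*GetIntAdd())(int, int) {
    int (*host_copy)(int, int) = nullptr;
    cudaMemcpyFromSymbol(&host_copy, int_add_dev_ptr, sizeof(host_copy));
    return host_copy;
}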

SegmentScan:

template<typename T>
__global__ void LSScan_1_(const T* input, const int* segment, T* output, int* segtemp, T* middle_v, int* middle_s, const int size, const T prim, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d, t;
extern __shared__ T block_mem[];
T *block_mem_value = block_mem;
int *block_mem_segment = (int*)&block_mem[blockDim.x];

// init shared memory
{
block_mem_value[threadIdx.x] = idx < size ? input[idx] : prim;
block_mem_segment[threadIdx.x] = idx < size ? segment[idx] : 0;
__syncthreads();
}

//reduce L1
for (d = 0, t = 1; t * 2 <= blockDim.x; d++)
{
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
if (!block_mem_segment[p + t])
block_mem_value[p + t] = fp(block_mem_value[p], block_mem_value[p + t]);
block_mem_segment[p + t] = block_mem_segment[p] | block_mem_segment[p + t];
}
__syncthreads();
t <<= 1;
}

//write to L2
{
#ifndef WRITE_TYPE_0
if (threadIdx.x == 0) {
#endif
middle_v[blockIdx.x] = block_mem_value[blockDim.x - 1];
middle_s[blockIdx.x] = block_mem_segment[blockDim.x - 1];
#ifndef WRITE_TYPE_0
}
#endif
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
segtemp[idx] = block_mem_segment[threadIdx.x];
}
}

template<typename T>
__global__ void LSScan_2_(const T* input, const int* original_segment, T* output, int* segment, const int stride, const T prim, T(*fp)(T, T)) {
int idx = threadIdx.x;
int d, t;
extern __shared__ T block_mem[];
T *block_mem_value = block_mem;
int *block_mem_segment = (int*)&block_mem[blockDim.x];

// init shared memory
{
block_mem_value[idx] = input[idx];
block_mem_segment[idx] = segment[idx];
__syncthreads();
}

//reduce
for (d = 0, t = 1; t * 4 <= blockDim.x; d++)
{
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
if (!block_mem_segment[p + t])
block_mem_value[p + t] = fp(block_mem_value[p], block_mem_value[p + t]);
block_mem_segment[p + t] = block_mem_segment[p] | block_mem_segment[p + t];
}
__syncthreads();
t <<= 1;
}

// set last to prim
{
#ifndef WRITE_TYPE_0
if (idx == 0)
#endif
block_mem_value[blockDim.x - 1] = prim;
__syncthreads();
}

//map
for (; d >= 0; d--) {
if (idx << (31 - d) == 0) {
int p = idx + t - 1;
T tmp = block_mem_value[p];
block_mem_value[p] = block_mem_value[p + t];
int if1 = original_segment[(idx + t)*stride];
if (if1)
block_mem_value[p + t] = prim;
else if (block_mem_segment[p])
block_mem_value[p + t] = tmp;
else
block_mem_value[p + t] = fp(tmp, block_mem_value[p + t]);
block_mem_segment[p] = 0;
}
__syncthreads();
t >>= 1;
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
segment[idx] = block_mem_segment[idx];
}
}

template<typename T>
__global__ void LSScan_3_(const T* middle_v, const int* middle_s, const int* original_seg, int*segment, T* output, T* outseg, const int stride, const int saved_d, const T prim, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d, t;
extern __shared__ T block_mem[];
T *block_mem_value = block_mem;
int *block_mem_segment = (int*)&block_mem[blockDim.x];

// init shared memory
{
block_mem_value[threadIdx.x] = output[idx];
block_mem_segment[threadIdx.x] = segment[idx];
__syncthreads();
}

//write back to L1
{
#ifndef WRITE_TYPE_0
if (threadIdx.x == 0) {
#endif
block_mem_value[blockDim.x - 1] = middle_v[blockIdx.x];
block_mem_segment[blockDim.x - 1] = middle_s[blockIdx.x];
#ifndef WRITE_TYPE_0
}
#endif
__syncthreads();
}

//map L1
{
for (d = saved_d, t = blockDim.x >> 1; d >= 0; d--) {
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
T tmp = block_mem_value[p];
block_mem_value[p] = block_mem_value[p + t];
int if1 = original_seg[(idx + t)*stride];
if (if1)
block_mem_value[p + t] = prim;
else if (block_mem_segment[p])
block_mem_value[p + t] = tmp;
else
block_mem_value[p + t] = fp(tmp, block_mem_value[p + t]);
block_mem_segment[p] = 0;
}
__syncthreads();
t >>= 1;
}
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
outseg[idx] = block_mem_segment[threadIdx.x];
}
}

template<typename T>
__global__ void LSScan_4_(const T* middle_v, const int* middle_s, const int* original_segment, T* output, int* segtemp, const int size, const int saved_d, const T prim, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d, t;
extern __shared__ T block_mem[];
T *block_mem_value = block_mem;
int *block_mem_segment = (int*)&block_mem[blockDim.x];

// init shared memory
{
block_mem_value[threadIdx.x] = output[idx];
block_mem_segment[threadIdx.x] = segtemp[idx];
__syncthreads();
}

//write back to L1
{
#ifndef WRITE_TYPE_0
if (threadIdx.x == 0) {
#endif
block_mem_value[blockDim.x - 1] = middle_v[blockIdx.x];
block_mem_segment[blockDim.x - 1] = middle_s[blockIdx.x];
#ifndef WRITE_TYPE_0
}
#endif
__syncthreads();
}

//map L1
{
for (d = saved_d, t = blockDim.x >> 1; d >= 0; d--) {
if (threadIdx.x << (31 - d) == 0) {
int p = threadIdx.x + t - 1;
T tmp = block_mem_value[p];
block_mem_value[p] = block_mem_value[p + t];
if (original_segment[idx + t])
block_mem_value[p + t] = prim;
else if (block_mem_segment[p])
block_mem_value[p + t] = tmp;
else
block_mem_value[p + t] = fp(tmp, block_mem_value[p + t]);
block_mem_segment[p] = 0;
}
__syncthreads();
t >>= 1;
}
}

//write result
{
output[idx] = block_mem_value[threadIdx.x];
}
}

template<typename T>
__global__ void Add_(const T* A, const T* B, T* C, const int size, T(*fp)(T, T)) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < size)
C[idx] = fp(A[idx], B[idx]);
}

template<typename T>
void LSScan(BufferPool<T>& TPool, BufferPool<int>& intPool, int size, CUDABuffer<T>& input, CUDABuffer<int>& segment, CUDABuffer<T>& output, T prim, T(*fp)(T, T), bool inclusive = false) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(pow_of_2, 128);
int2 disp2 = DispachNum(disp1.x, 128);

CUDABuffer<T>& segtemp = TPool.Get(pow_of_2);
CUDABuffer<T>& middlev1 = TPool.Get(disp1.x);
CUDABuffer<T>& middlev2 = TPool.Get(disp2.x);
CUDABuffer<int>& middles1 = intPool.Get(disp1.x);
CUDABuffer<int>& middles1tmp = intPool.Get(disp1.x);
CUDABuffer<int>& middles2 = intPool.Get(disp2.x);

output.Resize(pow_of_2);

LSScan_1_<T><<<disp1.x,disp1.y,disp1.y*(sizeof(T)+sizeof(int))>>>(input, segment, output, segtemp, middlev1, middles1, size, prim, fp);

LSScan_1_<T><<<disp2.x,disp2.y,disp2.y*(sizeof(T)+sizeof(int))>>>(middlev1, middles1, middlev1, middles1tmp, middlev2, middles2, disp1.x, prim, fp);

LSScan_2_<T><<<1,disp2.x,disp2.x*(sizeof(T)+sizeof(int))>>>(middlev2, segment, middlev2, middles2, disp1.y * disp2.y, prim, fp);

LSScan_3_<T><<<disp2.x,disp2.y,disp2.y*(sizeof(T)+sizeof(int))>>>(middlev2, middles2, segment, middles1tmp, middlev1, middles1, disp1.y, MinNoneZero(disp2.y) - 1, prim, fp);

LSScan_4_<T><<<disp1.x,disp1.y,disp1.y*(sizeof(T)+sizeof(int))>>>(middlev1, middles1, segment, output, segtemp, size, MinNoneZero(disp1.y) - 1, prim, fp);

TPool.Release(segtemp);
TPool.Release(middlev1);
TPool.Release(middlev2);
intPool.Release(middles1);
intPool.Release(middles1tmp);
intPool.Release(middles2);

if (inclusive)
Add_<T><<<disp1.x, disp1.y>>>(input, output, output, size, fp);
}
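
For reference, a hypothetical call site (BufferPool, CUDABuffer and int_add are this project's own names; the exact setup here is an assumption, not code from the post):

// Exclusive and inclusive segmented sums of `data`, where flags[i] == 1 marks a segment head:
// BufferPool<int> pool;
// CUDABuffer<int> data, flags, result;
// LSScan(pool, pool, n, data, flags, result, 0, int_add);        // exclusive
// LSScan(pool, pool, n, data, flags, result, 0, int_add, true);  // inclusive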

SegmentScanInverse:

template<typename T>
__global__ void Inv_(const T* A, T* B, const int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < size)
B[size - 1 - idx] = A[idx];
}

template<typename T>
__global__ void Inv_(T* A, const int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_1 = size - 1;

if (idx < size / 2) {
T tmp = A[idx];
A[idx] = A[s_1 - idx];
A[s_1 - idx] = tmp;
}
}

template<typename T>
__global__ void InvSeg_(const T* A, T* B, const int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < size) {
unsigned int t = size - idx; // note: the flags end up reversed and shifted by one position, think about why
t = t % ((unsigned int)size);
B[idx] = A[t];
}
}
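
// To answer the "think about why": after the data is reversed, the head flag of each
// segment has to move to the element that is now first in that segment, i.e. the flags
// are reversed AND shifted by one position, which is exactly what the modulo indexing does.
// A small illustration (my own):
//   data          = [a, b, c, d]   flags = [1, 0, 1, 0]   -> segments [a,b] [c,d]
//   reversed data = [d, c, b, a]                          -> segments [d,c] [b,a]
//   InvSeg_ gives B[i] = A[(size - i) % size] = [1, 0, 1, 0], which marks exactly those new heads.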

template<typename T>
void LSScanInv(BufferPool<T>& TPool, BufferPool<int>& intPool, int size, CUDABuffer<T>& input, CUDABuffer<int>& segment, CUDABuffer<T>& output, T prim, T(*fp)(T, T), bool inclusive = false) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(size, 64);

CUDABuffer<T>& input_inv = TPool.Get(pow_of_2);
CUDABuffer<int>& segment_inv = intPool.Get(pow_of_2);

Inv_<T><<<disp1.x, disp1.y>>>(input, input_inv, size);

InvSeg_<int><<<disp1.x, disp1.y>>>(segment, segment_inv, size);

LSScan(TPool, intPool, size, input_inv, segment_inv, output, prim, fp, inclusive);

TPool.Release(input_inv);
intPool.Release(segment_inv);

Inv_<T><<<disp1.x, disp1.y/2>>>(output, size);
}

With these primitives in place, we can piece together the first operation we need: SegmentSplit.


Segment Split:

The figure in the paper gives a rough idea of what this operation is: within each segment, the elements are stably partitioned by a boolean flag, so the flagged elements get packed to the front of the segment and the unflagged ones follow, each group keeping its original order. For the full details it is still best to read the paper; it is hard to explain it all here, so let's just go straight to the code...

__global__ void GetNF_(int size, const int* e, const int* f, const int* segment, int* NF) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx == size - 1 || segment[idx + 1] == 1) {
NF[idx] = e[idx] + f[idx];
}
else
{
NF[idx] = 0;
}
}

void GetNF(int size, CUDABuffer<int>& e, CUDABuffer<int>& f, CUDABuffer<int>& segment, CUDABuffer<int>& NF) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(size, 64);

GetNF_<<<disp1.x, disp1.y>>>(size, e, f, segment, NF);
}

__global__ void Getidb_(const int* segment, int* idb) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (segment[idx] == 1) {
idb[idx] = idx;
}
else
{
idb[idx] = 0;
}
}

void Getidb(int size, CUDABuffer<int>& segment, CUDABuffer<int>& idb) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(size, 64);

Getidb_<<<disp1.x, disp1.y>>>(segment, idb);
}

__global__ void GetIndex_(const int* e, const int* f, const int* idS, const int* NFS, int* index) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;

index[idx] = e[idx] ? (f[idx] + idS[idx]) : (idx - f[idx] + NFS[idx]);
}

void GetIndex(int size, CUDABuffer<int>& e, CUDABuffer<int>& f, CUDABuffer<int>& idS, CUDABuffer<int>& NFS, CUDABuffer<int>& index) {
int pow_of_2 = Min2Pow(size);
int2 disp1 = DispachNum(size, 64);

GetIndex_<<<disp1.x, disp1.y>>>(e, f, idS, NFS, index);
}

void SegmentSplitIndex(int size, CUDABuffer<int>& boolen, CUDABuffer<int>& segment, CUDABuffer<int>& output) {
int pow_of_2 = Min2Pow(size);

//f
auto& f = pool.Get(pow_of_2);
LSScan(pool, pool, size, boolen, segment, f, 0, int_add);

//NFS
auto& NF = pool.Get(pow_of_2);
GetNF(size, boolen, f, segment, NF);
auto& NFS = pool.Get(pow_of_2);
LSScanInv(pool, pool, size, NF, segment, NFS, 0, int_add, true);

//idS
auto& idb = pool.Get(pow_of_2);
Getidb(size, segment, idb);
auto& idS = pool.Get(pow_of_2);
LSScan(pool, pool, size, idb, segment, idS, 0, int_add, true);

// combine result
output.Resize(pow_of_2);
GetIndex(size, boolen, f, idS, NFS, output);

pool.Release(f);
pool.Release(idb);
pool.Release(idS);
pool.Release(NF);
pool.Release(NFS);
}
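
A small worked example (my own) of what SegmentSplitIndex computes, for a single segment and int_add as the operator:

// boolen e                                  = [1, 0, 1, 1, 0]
// segment (head flags)                      = [1, 0, 0, 0, 0]
// f   = exclusive seg-scan of e             = [0, 1, 1, 2, 3]
// NF  (e + f at each segment's last slot)   = [0, 0, 0, 0, 3]   // 3 = #flagged in the segment
// NFS = backward inclusive seg-scan of NF   = [3, 3, 3, 3, 3]
// idb (idx at segment heads, else 0)        = [0, 0, 0, 0, 0]
// idS = inclusive seg-scan of idb           = [0, 0, 0, 0, 0]   // start index of the segment
// index = e ? f + idS : idx - f + NFS       = [0, 3, 1, 2, 4]
// so the flagged elements land at the front of the segment and the unflagged ones after them,
// both groups keeping their original relative order, i.e. a stable split.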

template<typename T>
void SegmentSplit(int size, CUDABuffer<T>& input, CUDABuffer<int>& boolen, CUDABuffer<int>& segment, CUDABuffer<T>& output) {
int pow_of_2 = Min2Pow(size);

//get new index
auto& new_index = pool.Get(pow_of_2);
SegmentSplitIndex(size, boolen, segment, new_index);

//switch result
Switch(size, input, new_index, output);

pool.Release(new_index);
}
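
The Switch helper used above is not shown in the post; presumably it just scatters every element to its new index. A minimal sketch of what such a kernel might look like (an assumption, not the post's actual code):

// Hypothetical scatter kernel: move input[idx] to the position chosen by SegmentSplitIndex.
template<typename T>
__global__ void Switch_(const T* input, const int* new_index, T* output, const int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
        output[new_index[idx]] = input[idx];
}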

I have not run exhaustive tests; the small-scale tests seem to give correct results, so let's just call it correct for now, haha. Timing-wise, the current implementation is actually quite good: at a scale of 4M elements it only takes about 0.1 ms, which should be a solid foundation for the real-time GPU KD-tree construction to come.

[1] Kun Zhou, Qiming Hou, Rui Wang, Baining Guo. Real-Time KD-Tree Construction on Graphics Hardware. Zhejiang University / Tsinghua University / Microsoft Research Asia.

[2] Shubhabrata Sengupta, Mark Harris, Yao Zhang, John D. Owens. Scan Primitives for GPU Computing. University of California, Davis / NVIDIA Corporation.

