光子映射渲染器-2-GPUKDTree-1LNS

题图来自网路，侵删。

好多人说上一篇文章太硬核，而且距离上一篇文章的时间也有些久远。于是我们先来复习一下上一篇我们到底干了什么。

复习环节：

1）Scan(exclusive)

该原语表示对一给定序列 $A=[a_0,a_1,...,a_{n-1}]$ ，一个左单元，和一个操作，输出一个序列 $B=[varepsilon,varepsilon{oplus}a_0,varepsilon{oplus}a_0{oplus}a_1,...,varepsilon{oplus}a_0{oplus}a_1{oplus}...{oplus}a_{n-2}]$ 。需要注意，该原语的B序列中每一元素不包含A中对应位置元素，因此是exclusive的。

为方便理解具体举例来说，若，

， = [](int a,int b){return a+b; }。则输出序列

2) Scan(inclusive)

该原语表示对一给定序列 $A=[a_0,a_1,...,a_{n-1}]$ ，一个左单元，和一个操作，输出一个序列 $B=[varepsilon{oplus}a_0,varepsilon{oplus}a_0{oplus}a_1,...,varepsilon{oplus}a_0{oplus}a_1{oplus}...{oplus}a_{n-1}]$ 。需要注意，该原语的B序列中每一元素包含A中对应位置元素，因此是inclusive的。

为方便理解具体举例来说，若，

， = [](int a,int b){return a+b; }。则输出序列

3) SegmentScan(exclusive)

该原语表示对一给定序列 $A=[a_0,a_1,...,a_{n-1}]$ ，一个标识切片的序列 $B=[flag_0,flag_1,...,flag_{n-1}]$ ，一个左单元，和一个操作，输出一个序列，该序列中每一分段相互独立的进行Scan操作。

为方便理解具体举例来说，若，

， = [](int a,int b){return a+b; }。则输出序列

4) 扩展

通过赋予不同的左单元和操作，我们可以获得很多有意思的结果。

比如当， = [](int a,int b){return a<b?a:b; }，我们可以使用Scan(inclusive)获得一个最小值序列：

若，则输出序列

好的复习到这里就结束啦，详细实现细节请翻阅上一篇文章：

头像是狐狸吗：光子映射渲染器-2-GPUKDTree-0原语?

zhuanlan.zhihu.com

Stack On GPU

为了更好的支持后续的工作，我们首先需要实现一个GPU上的Stack，来确保能够有序的组织数据。

template<typename T> struct MemoryManager { int max_size; int* flag; T* buffer;

__device__ T* Allocate_back(int& index) {
int ptr = atomicAdd(flag, 1);
index = ptr;
#if CheckError > 1
if (ptr < max_size) return buffer + ptr;
else {
printf("(ERROR allocate: %d / %d)",ptr, max_size);
index = -1;
flag[1] = 1;
return NULL;
}
#else
return buffer + ptr;
#endif

}
__device__ T* Allocate_back() {
int ptr = atomicAdd(flag, 1);
#if CheckError > 1
if (ptr < max_size) return buffer + ptr;
else {
printf("(ERROR allocate: %d / %d)", ptr, max_size);
flag[1] = 1;
return NULL;
}
#else
return buffer + ptr;
#endif
}

MemoryManager() {}
MemoryManager(CUDABuffer<T>& buffer, CUDABuffer<int>& flag_buffer) {
flag = flag_buffer;
this->buffer = buffer;
max_size = buffer.Size();
}
int flag_[2];
void clear() {
flag_[0] = 0; flag_[1] = 0;
cudaMemcpy(flag, flag_, CheckError * sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice);
}

int size() {
cudaMemcpy(flag_, flag, sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost);
return flag_[0];
}

#if CheckError > 1
bool stackOverflow(bool throwException = false) {
flag_[0] = 0;
cudaMemcpy(flag_, flag + 1, sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
if (throwException && flag_[0] != 0) throw exception("stack overflow!");
return flag_[0] != 0;
}
#endif
};

如上所示，我们需要实现一个Allocate_back()函数，该函数在栈顶分配一个新的存储单元并返回指针，该函数有两个重载，其中一个可以同时返回栈顶序号。此外，还定义了如果宏CheckError大于1(实际上等于2)，将会对stack Overflow做检查报错与记录，这会为Debug带来方便。由于该结构只包含指针与常量值max_size，因而可以放心在Device与Host之间传递而不必有所顾虑。此外，由于演算法实现上不会出现一次dispatch同时需要入栈和出栈的情况。因而可以大大简化设计。

正文

进入正题，这篇文章讲的是实现KDTree Builder的Large Node Stage。

由于一开始时，KDTree的节点数少而元素数量大，因而节点级的并行建树太慢，我们需要用特殊的方式以元素作为并行单元。这也是需要实现前述的各种原语的原因。

接下来是一段伪代码：

template<typename T> int BuildKdTree(int data_size, CUDABuffer<T>& input) { int size = Min2Pow(data_size); //2的整数次幂的Buffer大小可以简化原语实现 // ----------------------- // ---initialization stage // ----------------------- //申请3个将在后续阶段被使用的Buffer CUDABuffer<int> segment(size); segemnt.SetValue(0) // 将内容清零 CUDABuffer<int> split_axil_buffer(size); split_axil_buffer.SetValue(0) CUDABuffer<float> split_value_buffer(size); split_value_buffer.SetValue(0)

//申请三个stack备用
MemoryManager<Node> result(STACK_SIZE); // 这个stack存放构建的Node

MemoryManager<int> activelist(STACK_SIZE); // 这两个stack存放指向result
MemoryManager<int> smalllist(STACK_SIZE); // 中某Node的Index

// 初始化Root节点
// 计算root.AABB，将root节点压入activelist
InitRootNode(input, segments, result, activelist, data_size);

// -------------------
// ---large node stage
// -------------------

int activelist_size;
while ((activelist_size = activelist.size()) != 0) //如果activelist不为空则进行节点划分
{

// 对activelist中存放的所有节点(实际存放的是指向result中Node的Index)进行划分
// 结果存放在smalllist和nextlist中
// 其中smalllist将在下一个Small Node Stage被处理，而nextlist中的节点将不
// 断被划分直到全部都转化为Small Node
ProcessLargeNodes(size, activelist_size, input, result,
activelist, smalllist, nextlist,
segments, split_axil_buffer, split_value_buffer);

// 交换nextlist与activelist
auto tmp = nextlist;
nextlist = activelist;
nextlist.clear(); //清空stack
activelist = tmp;
}

// -------------------
// ---small node stage
// -------------------

// -----------------------
// ---kd-tree output stage
// -----------------------

// --------------------
// ---release resources
// --------------------

return result.size(); //返回总节点数目
}

已经详细注释，相信大家都能看得懂了。

随后我们关注一下这段代码里的两个具体函数的实现：

Init Root Node:

void InitRootNode(CUDABuffer<T>& input, CUDABuffer<int>& segments, MemoryManager<Node>& result, MemoryManager<int>& active, int size) { int pow_of_2 = Min2Pow(size);

// 从池中请求一个大小为pow_of_2的临时Buffer
CUDABuffer<T>& output = Tpool.Get(pow_of_2);

// 这个scan获得了input中元素的最小值
// 别再问为什么了=_=，前文讲的很详细了
LSScan<T>(Tpool, pool, size, input, segments, output, Unit_max_value, Unit_min, true);
T min = output.GetValue(pow_of_2 - 1);

// 这个scan获得了input中元素的最大值
LSScan<T>(Tpool, pool, size, input, segments, output, Unit_min_value, Unit_max, true);
T max = output.GetValue(pow_of_2 - 1);

int flags[2] = { 0, 1 };

//填充Root Node的信息
Node root;
root.aabb.min[0] = min.x;
root.aabb.min[1] = min.y;
root.aabb.min[2] = min.z;
root.aabb.max[0] = max.x;
root.aabb.max[1] = max.y;
root.aabb.max[2] = max.z;
root.l = root.r = root.split_axil = -1; // 用split_axil = -1来标志某一Node是叶节点
root.prims.ptr = 0; // 指向包含元素的起始
root.prims.size = size; // 包含元素的个数

//将数据送到该送去的buffer
cudaMemcpy(result.flag, flags + 1, sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice);
cudaMemcpy(result.buffer, &root, sizeof(Node), cudaMemcpyKind::cudaMemcpyHostToDevice);

cudaMemcpy(active.flag, flags + 1, sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice);
cudaMemcpy(active.buffer, flags, sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice);

//释放临时Buffer
Tpool.Release(output);
}

Process Large Nodes:

void ProcessLargeNodes(int size, int task_num, CUDABuffer<T>& input, MemoryManager<Node> result, MemoryManager<int>& activelist, MemoryManager<int>& smalllist, MemoryManager<int>& nextlist, CUDABuffer<int>& segments, CUDABuffer<int>& split_axil_buffer, CUDABuffer<float>& split_value_buffer) {

//申请临时Buffer
auto& split_axil_buffer_scan = pool.Get(size); // 记录split_axil按segment扩散后的结果

auto& split_value_buffer_scan = Fpool.Get(size);// 记录split_value按segment扩散后的结果

auto& flags = pool.Get(size); // 记录元素与split_axil、split_value比较后的归属结果

auto& output = Tpool.Get(size); // 按归属重新排序后的元素Buffer

auto& counter = pool.Get(size); // 计数Buffer，记录分属不同子节点的元素个数

// 计算activelist中所有节点的split_axil和split_value值，
// 实际上因这一阶段的Node数目不多，可以由Host完成，然而我
// 的实现还是用了Device，这样的好处是可以节省一些数据在
// Device与Host之间的拷贝，然而到底孰优孰劣，懒得去试了
Split_Large_Nodes(task_num, result, activelist, split_axil_buffer, split_value_buffer);

// 通过SegmentScan来扩散split_axil、split_value到每个节点所囊括的元素分段
LSScan<int>(pool, pool, size, split_axil_buffer, segments, split_axil_buffer_scan, 0, int_add, true);
LSScan<float>(Fpool, pool, size, split_value_buffer, segments, split_value_buffer_scan, 0, float_add, true);

// 将元素与split_axil、split_value比较，比较结果存入flags
Compare_To_Split(size, input, split_axil_buffer_scan, split_value_buffer_scan, flags);

// 对flags应用int_add的SegmentScan，获得每一分段的分属左右子节点的元素个数
LSScan<int>(pool, pool, size, flags, segments, counter, 0, int_add, true);

// 对元素按flags中bool值进行SegmentSplit重新排序元素
SegmentSplit(size, input, flags, segments, output);

// 填充新生成的子节点的信息，并按节点判别标准压入nextlist或smallist
Complete_Node(task_num, result, activelist, counter, segments, nextlist, smalllist);

// 将重排序的元素Buffer拷贝到原元素Buffer中
Copy(size, output, input);

// 释放临时Buffer
pool.Release(split_axil_buffer_scan);
Fpool.Release(split_value_buffer_scan);
pool.Release(flags);
Tpool.Release(output);
pool.Release(counter);
}