cuda实践5---cuda kdtree、octree

cuda kdtree
前言：将kdtree 查询部分移植到GPU端，在很多应用中对提高算法的执行效率很有帮助，本文使用英伟达GPU语言cuda，完成了kdtree GPU端的移植。
步骤比较简单：1、cpu端创建kdtree； 2、迁移kdtree node 节点到GPU端；3、GPU端实现临近检索（注：里面会有很多处理小技巧，望相互学习）
核心代码：
cuda_kdtree.cu
//main.cu
#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>

#include "cuda_kdtree.h"

void CheckCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.
", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__device__ float distance3d(float* pt1, float* pt2)
{
    float temp = sqrtf(
        (pt1[0] - pt2[0])*(pt1[0] - pt2[0]) +
        (pt1[1] - pt2[1])*(pt1[1] - pt2[1]) +
        (pt1[2] - pt2[2])*(pt1[2] - pt2[2]));
    return temp;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__device__ int findLeafNode_GPU(float* searchPoint,int current_node ,int node_num, CUDA_KDNode* kdnode_vector) {

    if (node_num == 0)
    {
        //std::cout << "CUDA_KDNode is empty ..." << std::endl;
        return -1;
    }

    while (true)
    {
        if (kdnode_vector[current_node].left == -1 && kdnode_vector[current_node].right == -1)   //leaf node
        {
            break;
        }

        int slipnum0 = 1;
        slipnum0 = kdnode_vector[current_node].dim;
        switch (slipnum0)
        {
        case 1:
            if (searchPoint[0] < kdnode_vector[current_node].split_value)
            {
                current_node = kdnode_vector[current_node].left;
            }
            else
            {
                current_node = kdnode_vector[current_node].right;
            }
            break;

        case 2:
            if (searchPoint[1] < kdnode_vector[current_node].split_value)
            {
                current_node = kdnode_vector[current_node].left;
            }
            else
            {
                current_node = kdnode_vector[current_node].right;
            }
            break;

        case 3:
            if (searchPoint[2] < kdnode_vector[current_node].split_value)
            {
                current_node = kdnode_vector[current_node].left;
            }
            else
            {
                current_node = kdnode_vector[current_node].right;
            }
            break;
        }
    }

    return current_node;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__device__ int findNode_GPU(float* searchPoint, float searchRadius, int current_node, int node_num, CUDA_KDNode* kdnode_vector) {

    if (node_num == 0)
    {
        //std::cout << "CUDA_KDNode is empty ..." << std::endl;
        return -1;
    }

    while (true)
    {
        if (kdnode_vector[current_node].left == -1 || kdnode_vector[current_node].right == -1)   //leaf node
        {
            break;
        }

        int slipnum0 = 1;
        slipnum0 = kdnode_vector[current_node].dim;
        float dis_temp = fabs(searchPoint[slipnum0 - 1] - kdnode_vector[current_node].split_value);  
        if (dis_temp < searchRadius)
        {
            break;
        }

        if (searchPoint[slipnum0-1] < kdnode_vector[current_node].split_value)
        {
            current_node = kdnode_vector[current_node].left;
        }
        else
        {
            current_node = kdnode_vector[current_node].right;
        }        
    }

    return current_node;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__device__ int searchRadius_GPU(
    float* pointCloud,int* pt_indexs, int node_num, CUDA_KDNode* kdnode_vector, 
    float* searchPoint,float searchRadius,
    int* searchIndex, float* searchDistance) 
{
    if (node_num == NULL)
    {
        //std::cout << "CUDA_KDNode is empty ..." << std::endl;
        return -1;
    }
    int current_node = 0;   //根节点
    int templeafnode = 0;// findNode_GPU(searchPoint, searchRadius, current_node, node_num, kdnode_vector);

    float distanceTemp = 0.0;
    int temp = 0;
    for (int j = 0; j < kdnode_vector[templeafnode].numOfData; j++)
    {
        float pt2[3];

        pt2[0] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j]];
        pt2[1] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j] + 1];
        pt2[2] = pointCloud[3 * pt_indexs[kdnode_vector[templeafnode].start_index + j] + 2];

        distanceTemp = distance3d(searchPoint, pt2);

        if (searchRadius > distanceTemp && temp < maxN)
        {
            searchIndex[temp] = pt_indexs[kdnode_vector[templeafnode].start_index + j];
            searchDistance[temp] = distanceTemp;
            temp++;
        }
    }

    while (true)
    {
        int indexTemp = -1;
        float tempdis;
        int changetimes = 0;

        if (temp == 0)
        {
            break;
        }

        for (int i = 0; i < temp-1; i++)
        {
            if (searchDistance[i] > searchDistance[i + 1])  
            {
                tempdis = searchDistance[i];
                searchDistance[i] = searchDistance[i + 1];
                searchDistance[i + 1] = tempdis;

                indexTemp = searchIndex[i];
                searchIndex[i] = searchIndex[i + 1];
                searchIndex[i + 1] = indexTemp;

                changetimes++;
            }
        }

        if (changetimes == 0)
        {
            break;   
        }
    }

    return temp;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <int BLOCK_SIZE> __global__ void runKNNSearchRadius_GPU(float* pointCloud, int* pt_indexs, int node_num, CUDA_KDNode* kdnode_vector,float searchRadius, 
    int* searchIndex_d, float* searchDistance_d)
{
    int searchIndex[maxN];
    float searchDistance[maxN];
    for (int i = 0; i < maxN; i++)
    {
        searchIndex[i] = -999;
        searchDistance[i] = 999.;
    }

    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    
    float searchPoint[2];
    searchPoint[0] = Col * 1.0 / imgWidth_d;
    searchPoint[1] = (imgHeight_d - Row) * 1.0 / imgHeight_d;

    int searchNum = searchRadius_GPU(
        pointCloud, pt_indexs, node_num, kdnode_vector,
        searchPoint, searchRadius,
        searchIndex, searchDistance);

    int threadId = Row * imgWidth_d + Col;
    for (int i = 0; i < maxN; i++)
    {
        searchIndex_d[maxN*threadId + i] = searchIndex[i];
        searchDistance_d[maxN*threadId + i] = searchDistance[i];
    }

}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
CUDA_KDTREE::~CUDA_KDTREE()
{
    cudaFree(m_gpu_nodes);
    cudaFree(m_gpu_indexes);
    cudaFree(m_gpu_points);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CUDA_KDTREE::createKDTree(std::vector<Point3d>& pointCloud, vector<CUDA_KDNode>& cpu_nodes, vector<int>& indexes)
{
    kd::KDTree<Point3d> tree;
    tree.setInputPointCloud(pointCloud);
    tree.setNumOfLeafData(100);
    tree.buildKDTree();

    std::queue<kd::kdnode*> kdnodePtrQue;
    kdnodePtrQue.push(tree.ptree->root);

    std::vector<kd::kdnode*> kdnode_vect;

    //go through kdtree node
    int node_t = 0;
    while (!kdnodePtrQue.empty())
    {
        kd::kdnode* tempkdnode = kdnodePtrQue.front();

        if (tempkdnode==NULL)
        {
            kdnodePtrQue.pop();
            continue;
        }

        tempkdnode->node_index = node_t;
        kdnode_vect.push_back(tempkdnode);

        CUDA_KDNode gpu_node_temp;
        gpu_node_temp.dim = tempkdnode->dim;
        gpu_node_temp.nodelr = tempkdnode->nodelr;
        gpu_node_temp.numOfData = tempkdnode->numOfData;
        gpu_node_temp.split_value = tempkdnode->split_value;
        cpu_nodes.push_back(gpu_node_temp);

        kdnodePtrQue.pop();
        if (tempkdnode->left!=NULL)
        {
            kdnodePtrQue.push(tempkdnode->left);
        }
        if (tempkdnode->right != NULL)
        {
            kdnodePtrQue.push(tempkdnode->right);
        }

        node_t++;
    }
    
    int index_t = 0;
    for (int i = 0; i < node_t; i++)
    {
        if (kdnode_vect[i]->father==NULL)
            cpu_nodes[i].father = -1;
        else
            cpu_nodes[i].father = kdnode_vect[i]->father->node_index;

        if (kdnode_vect[i]->left == NULL)
            cpu_nodes[i].left = -1;
        else
            cpu_nodes[i].left = kdnode_vect[i]->left->node_index;

        if (kdnode_vect[i]->right == NULL)
            cpu_nodes[i].right = -1;
        else
            cpu_nodes[i].right = kdnode_vect[i]->right->node_index;

        cpu_nodes[i].start_index = index_t;
        index_t += cpu_nodes[i].numOfData;

        for (int j = 0; j < cpu_nodes[i].numOfData; j++)
        {
            indexes.push_back(kdnode_vect[i]->data[j]);
        }
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int CUDA_KDTREE::search(std::vector<Point3d>& pointCloud, vector<CUDA_KDNode>& cpu_nodes, vector<int>& indexes, int* queries_indexes, float *queries_dists)
{
    // Create the nodes again on the CPU, laid out nicely for the GPU transfer
    int m_num_index = indexes.size();
    int num_nodes = cpu_nodes.size();
    int point_num = pointCloud.size();

    //int* queries_indexes = NULL;
    //float *queries_dists = NULL;
    //queries_indexes = (int*)malloc(sizeof(int) * imgHeight_d * imgWidth_d* maxN);
    //queries_dists = (float*)malloc(sizeof(int) * imgHeight_d * imgWidth_d* maxN);

    int* searchIndex_d = NULL;
    float* searchDistance_d = NULL;

    cudaMalloc((void**)&m_gpu_nodes, sizeof(CUDA_KDNode)*num_nodes);
    cudaMalloc((void**)&m_gpu_indexes, sizeof(int)*m_num_index);
    cudaMalloc((void**)&m_gpu_points, sizeof(float) * 3 * point_num);

    cudaMalloc((void**)&searchIndex_d, sizeof(int) * imgHeight_d * imgWidth_d* maxN);
    cudaMalloc((void**)&searchDistance_d, sizeof(float) * imgHeight_d * imgWidth_d* maxN);

    cudaMemcpy(m_gpu_nodes, &cpu_nodes[0], sizeof(CUDA_KDNode) * num_nodes, cudaMemcpyHostToDevice);
    cudaMemcpy(m_gpu_indexes, &indexes[0], sizeof(int) * m_num_index, cudaMemcpyHostToDevice);
    cudaMemcpy(m_gpu_points, &pointCloud[0], sizeof(float) * 3 * point_num, cudaMemcpyHostToDevice);

    int BLOCK_SIZE = 16;
    // Setup execution parameters
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);   //16*16 或者 32*32  block大小
    dim3 grid(imgWidth_d / block.x, imgHeight_d / block.y);   //计算grid 大小

    runKNNSearchRadius_GPU<16><<<grid, block>>> (m_gpu_points, m_gpu_indexes, num_nodes, m_gpu_nodes, 0.02, searchIndex_d, searchDistance_d);

    cudaError_t cudaStatus;
    cudaStatus = cudaGetLastError();
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();

    cudaMemcpy(&queries_indexes[0], searchIndex_d, sizeof(int)*imgHeight_d * imgWidth_d* maxN, cudaMemcpyDeviceToHost);
    cudaMemcpy(&queries_dists[0], searchDistance_d, sizeof(float)*imgHeight_d * imgWidth_d* maxN, cudaMemcpyDeviceToHost);

    CheckCUDAError("CreateKDTree");

    // free 
    cudaFree(searchIndex_d);
    cudaFree(searchDistance_d);

    return 0;
}
参考：
https://github.com/vincentfpgarcia/kNN-CUDA
http://raytracingdiary.blogspot.com/2011/04/cuda-kd-tree-implementations.html
http://nghiaho.com/?p=437