cuda实践4

这里对 上一篇 ---cuda实践3 中的代码进行修改，在block中对share memory 进行迭代。
template <int BLOCK_SIZE> __global__ void caculateShelter_cuda(
    float *uv_triangulation_0,
    float *uv_triangulation,
    float *w_triangulation,
    float *w_triangulation_center,
    float *position_panorama_vect,
    int triangule_num_d, int panorama_num_d, int imgHeight_d, int imgWidth_d, float dis_threshold, int*inside_ptr_d,  int* result)
{
    int distance_threshold = dis_threshold;
    int times = triangule_num_d;
    int num = imgHeight_d * imgWidth_d;

    int x = threadIdx.x;
    int y = threadIdx.y;
    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    //int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    //int threadId = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;
    float searchPoint[2];
    searchPoint[0] = Col * 1.0 / imgWidth_d;
    searchPoint[1] = (imgHeight_d - Row) * 1.0 / imgHeight_d;

    int threadId = Row * imgWidth_d + Col;

    int inside = -1;
    int index_temp = -1;
    int step = BLOCK_SIZE * BLOCK_SIZE;
    int grid_size = (times / step) + 1;

    int threadid_0 = BLOCK_SIZE * y + x;
    int tempx = threadid_0 % 32;
    int tempy = threadid_0 / 32;
    for (int t = 0; t < grid_size; t++)
    {        
        ////////////////////////////////////////share memory
        __shared__ float tile_uv[BLOCK_SIZE*BLOCK_SIZE * 6];
        if ((t*step + BLOCK_SIZE * y + x) < times)
        {

            tile_uv[32 * tempy * 6 + tempx] = uv_triangulation_0[t*step + BLOCK_SIZE * y + x];    //注意越域
            tile_uv[32 * tempy * 6 + 1 * 32 + tempx] = uv_triangulation_0[1 * times + t * step + BLOCK_SIZE * y + x];
            tile_uv[32 * tempy * 6 + 2 * 32 + tempx] = uv_triangulation_0[2 * times + t * step + BLOCK_SIZE * y + x];
            tile_uv[32 * tempy * 6 + 3 * 32 + tempx] = uv_triangulation_0[3 * times + t * step + BLOCK_SIZE * y + x];
            tile_uv[32 * tempy * 6 + 4 * 32 + tempx] = uv_triangulation_0[4 * times + t * step + BLOCK_SIZE * y + x];
            tile_uv[32 * tempy * 6 + 5 * 32 + tempx] = uv_triangulation_0[5 * times + t * step + BLOCK_SIZE * y + x];
        }
        else
        {
            tile_uv[32 * tempy * 6 + tempx] = 0.0;    //注意越域
            tile_uv[32 * tempy * 6 + 1 * 32 + tempx] = 0.0;
            tile_uv[32 * tempy * 6 + 2 * 32 + tempx] = 0.0;
            tile_uv[32 * tempy * 6 + 3 * 32 + tempx] = 0.0;
            tile_uv[32 * tempy * 6 + 4 * 32 + tempx] = 0.0;
            tile_uv[32 * tempy * 6 + 5 * 32 + tempx] = 0.0;
        }
        __syncthreads();

        int v = 0;
        while (v < BLOCK_SIZE*BLOCK_SIZE)
        {
            float A[2], B[2], C[2];

            int threadid_1 = (threadid_0 + v) % (BLOCK_SIZE*BLOCK_SIZE);
            int tempx2 = threadid_1 % 32;
            int tempy2 = threadid_1 / 32;

            A[0] = tile_uv[32 * tempy2 * 6 + tempx2];
            A[1] = tile_uv[32 * tempy2 * 6 + 1 * 32 + tempx2];
            B[0] = tile_uv[32 * tempy2 * 6 + 2 * 32 + tempx2];
            B[1] = tile_uv[32 * tempy2 * 6 + 3 * 32 + tempx2];
            C[0] = tile_uv[32 * tempy2 * 6 + 4 * 32 + tempx2];
            C[1] = tile_uv[32 * tempy2 * 6 + 5 * 32 + tempx2];

            if ((t*step + tempy2*32 + tempx2) >= times)
            {
                break;
            }

            bool inornot = pointInTriangle_cuda(A, B, C, searchPoint);
            if (inornot && inside == -1 && inside == -1)   //inside /on 
            {
                index_temp = t * step + tempy2 * 32 + tempx2;
                inside = 0;
                break;
            }

            v++;
        }
        __syncthreads();

        //////////////////////////////////////////////global memory
        //float A[2], B[2], C[2];
        //A[0] = uv_triangulation_0[t];
        //A[1] = uv_triangulation_0[1 * times + t];
        //B[0] = uv_triangulation_0[2 * times + t];
        //B[1] = uv_triangulation_0[3 * times + t];
        //C[0] = uv_triangulation_0[4 * times + t];
        //C[1] = uv_triangulation_0[5 * times + t];
        //if (pointInTriangle_cuda(A, B, C, searchPoint))   //inside /on 
        //{
        //    index_temp = t;
        //    inside = 0;
        //    break;
        //}
    }

    inside_ptr_d[2 * threadId] = inside;
    inside_ptr_d[2 * threadId + 1] = index_temp;

    if (inside == 0 && threadId < num)
    {
        int tr_index = index_temp;
        float pt3d[3];
        float uv_triangulation_temp[6];

        uv_triangulation_temp[0] = uv_triangulation[6 * tr_index];
        uv_triangulation_temp[1] = uv_triangulation[6 * tr_index + 1];
        uv_triangulation_temp[2] = uv_triangulation[6 * tr_index + 2];
        uv_triangulation_temp[3] = uv_triangulation[6 * tr_index + 3];
        uv_triangulation_temp[4] = uv_triangulation[6 * tr_index + 4];
        uv_triangulation_temp[5] = uv_triangulation[6 * tr_index + 5];

        float w_triangulation_temp[9];
        w_triangulation_temp[0] = w_triangulation[9 * tr_index];
        w_triangulation_temp[1] = w_triangulation[9 * tr_index + 1];
        w_triangulation_temp[2] = w_triangulation[9 * tr_index + 2];
        w_triangulation_temp[3] = w_triangulation[9 * tr_index + 3];
        w_triangulation_temp[4] = w_triangulation[9 * tr_index + 4];
        w_triangulation_temp[5] = w_triangulation[9 * tr_index + 5];
        w_triangulation_temp[6] = w_triangulation[9 * tr_index + 6];
        w_triangulation_temp[7] = w_triangulation[9 * tr_index + 7];
        w_triangulation_temp[8] = w_triangulation[9 * tr_index + 8];

        caculateMappingTriangle2dTo3d(uv_triangulation_temp, searchPoint, w_triangulation_temp, pt3d); // rewrite in cuda

        int not_in_shelter_num = 0;
        //Shelter check
        for (int m = 0; m < panorama_num_d; m++)
        {
            if (not_in_shelter_num >= PanoramaNUM)
            {
                break;
            }
            // add threshold ditance (pt3d to optical_center)
            //...
            // add threshold  angle of triangle face normal and vector(pt3d to optical_center)
            //...
            float optical_center[3];
            optical_center[0] = position_panorama_vect[m * 3];
            optical_center[1] = position_panorama_vect[m * 3 +1];
            optical_center[2] = position_panorama_vect[m * 3 +2];

            //caculate shelter
            //caculate near triangles of the ray
            int ret0 = 0;
            float redius_ = distance3d(optical_center, pt3d);

            float rayoptical[3];
            rayoptical[0] = optical_center[0] - pt3d[0];
            rayoptical[1] = optical_center[1] - pt3d[1];
            rayoptical[2] = optical_center[2] - pt3d[2];
            float angle = angleOfNormalOfTriangleAndSunline(w_triangulation_temp, rayoptical);
            float temp0 = fabs(angle - 3.1415926 / 2.0);

            if (redius_ > distance_threshold || temp0 < 3.1415926 / 10.0)  //
            {
                continue;
            }

            float mid_pt[3];
            mid_pt[0] = (optical_center[0] + pt3d[0])*0.5;
            mid_pt[1] = (optical_center[1] + pt3d[1])*0.5;
            mid_pt[2] = (optical_center[2] + pt3d[2])*0.5;
            for (int i = 0; i < times; i++)
            {
                float triangle[9];
                triangle[0] = w_triangulation[9 * i];
                triangle[1] = w_triangulation[9 * i + 1];
                triangle[2] = w_triangulation[9 * i + 2];
                triangle[3] = w_triangulation[9 * i + 3];
                triangle[4] = w_triangulation[9 * i + 4];
                triangle[5] = w_triangulation[9 * i + 5];
                triangle[6] = w_triangulation[9 * i + 6];
                triangle[7] = w_triangulation[9 * i + 7];
                triangle[8] = w_triangulation[9 * i + 8];
                
                float pt_temp[3];
                pt_temp[0] = w_triangulation_center[3 * i];
                pt_temp[1] = w_triangulation_center[3 * i + 1];
                pt_temp[2] = w_triangulation_center[3 * i + 2];

                float temp1 = distance3d(pt_temp, mid_pt);
                if (temp1 > 0.36*redius_ || i == tr_index)
                {
                    continue;
                }

                
                //int ret1 = rayTracingShelterCaculate_cuda2(pt3d, optical_center, triangle);
                //if (ret1 == 0)  // in shelter
                //{
                //    ret0 = -1;
                //    break;
                //}
                float ret_dis = pt3dToLine3d(pt_temp, optical_center, pt3d);
                if (ret_dis < 0.10)  // in shelter
                {
                    ret0 = -1;
                    break;
                }
            }
     
            if (ret0 == 0)
            {
                result[threadId*PanoramaNUM + not_in_shelter_num] = m+1;    //not in shelter
                not_in_shelter_num++;
            }
        }
    }

}