cudaThreadSynchronize()

// Launching a CUDA kernel is non-blocking: the statements after the launch do not wait for the kernel to finish; they execute immediately. So calling m5op.dump inside call_kernel (see kernel.cu) is wrong!!!

// REF: https://www.cs.virginia.edu/~csadmin/wiki/index.php/CUDA_Support/Measuring_kernel_runtime  

// cudaThreadSynchronize() blocks the calling host thread until all previously issued stream operations have completed. (It is deprecated in favor of cudaDeviceSynchronize(), which does the same thing; see the reference below.)

// REF: https://stackoverflow.com/questions/13485018/cudastreamsynchronize-vs-cudadevicesynchronize-vs-cudathreadsynchronize
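
A minimal sketch of the launch/dump ordering issue (dummy_kernel and the main() scaffold are illustrative assumptions, not the benchmark's code; the m5_dump_stats prototype follows gem5's m5ops.h):

#include <cstdint>
#include <cuda_runtime.h>

extern "C" void m5_dump_stats(uint64_t ns_delay, uint64_t ns_period); // gem5 m5op

__global__ void dummy_kernel(int *out) { out[threadIdx.x] = threadIdx.x; }

int main() {
    int *d_out;
    cudaMalloc(&d_out, 32 * sizeof(int));

    dummy_kernel<<<1, 32>>>(d_out);
    // WRONG here: the launch returns immediately, so the kernel may still be
    // running and a dump at this point would miss (part of) its execution.

    cudaThreadSynchronize();   // block until all preceding device work is done
    m5_dump_stats(0, 0);       // correct: the stats now cover the whole kernel

    cudaFree(d_out);
    return 0;
}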

// The C++ thread join question: kernel.cpp joins its worker threads too, so should the dump go in kernel.cpp, or in main.cpp after the join there?

// REF: http://en.cppreference.com/w/cpp/algorithm/for_each

// If the GPU finishes first, dumping after the join in main.cpp seems reasonable; but if the CPU finishes first, wouldn't the program just sit blocked at cudaThreadSynchronize?

// For now, dump in kernel.cpp!
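
For what it's worth, blocking at cudaThreadSynchronize() while the CPU side is already done is harmless for correctness: a dump placed after both the sync and the join covers both sides, whichever finishes first. A sketch of that ordering (gpu_work/cpu_work are placeholder names, not the benchmark's code):

#include <cstdint>
#include <thread>
#include <cuda_runtime.h>

extern "C" void m5_dump_stats(uint64_t ns_delay, uint64_t ns_period); // gem5 m5op

__global__ void gpu_work() {}
void cpu_work() {}

int main() {
    gpu_work<<<1, 1>>>();          // asynchronous launch, returns at once
    std::thread worker(cpu_work);  // CPU side runs concurrently

    cudaThreadSynchronize();       // returns once the GPU side is done
    worker.join();                 // returns once the CPU side is done

    m5_dump_stats(0, 0);           // both sides have finished here
    return 0;
}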

kernel.cpp

// CPU threads--------------------------------------------------------------------------------------
void run_cpu_threads(T *matrix_out, T *matrix, std::atomic_int *flags, int n, int m, int pad, int n_threads, int ldim, int n_tasks, float alpha
#ifdef CUDA_8_0
    , std::atomic_int *worklist
#endif
    ) {
        std::cout<<"run_cpu_threads start."<<std::endl;

    const int                REGS_CPU = REGS * ldim;
    std::vector<std::thread> cpu_threads;
    for(int i = 0; i < n_threads; i++) {
    
        cpu_threads.push_back(std::thread([=]() {

#ifdef CUDA_8_0
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads, worklist);
#else
            Partitioner p = partitioner_create(n_tasks, alpha, i, n_threads);
#endif

            const int matrix_size       = m * (n + pad);
            // Round matrix_size up to the next multiple of ldim * REGS
            const int matrix_size_align = (matrix_size + ldim * REGS - 1) / (ldim * REGS) * (ldim * REGS);

            for(int my_s = cpu_first(&p); cpu_more(&p); my_s = cpu_next(&p)) {

                // Declare on-chip memory
                T   reg[REGS_CPU];
                int pos      = matrix_size_align - 1 - (my_s * REGS_CPU);
                int my_s_row = pos / (n + pad);
                int my_x     = pos % (n + pad);
                int pos2     = my_s_row * n + my_x;
                // Load in on-chip memory
#pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos2 >= 0 && my_x < n && pos2 < matrix_size)
                        reg[j] = matrix[pos2];
                    else
                        reg[j] = 0;
                    pos--;
                    my_s_row = pos / (n + pad);
                    my_x     = pos % (n + pad);
                    pos2     = my_s_row * n + my_x;
                }

                // Set global synch: spin until this chunk has been released,
                // then release the next chunk so chunks complete in order
                while((&flags[my_s])->load() == 0) {
                }
                (&flags[my_s + 1])->fetch_add(1);

                // Store to global memory
                pos = matrix_size_align - 1 - (my_s * REGS_CPU);
#pragma unroll
                for(int j = 0; j < REGS_CPU; j++) {
                    if(pos >= 0 && pos < matrix_size)
                        matrix_out[pos] = reg[j];
                    pos--;
                }
            }
        }));
    }
    std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
    std::cout<<"dump.. after run_cpu_threads end."<<std::endl;
    m5_dump_stats(0,0);
}
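
The flags/fetch_add pair above is what serializes the chunks: each worker spins until its predecessor has signalled, then releases its successor. A stripped-down model of just that handshake (names such as n_chunks are mine, and the real load/store phases are elided):

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

int main() {
    const int n_chunks = 8;
    std::vector<std::atomic_int> flags(n_chunks + 1);
    for (auto &f : flags) f.store(0);
    flags[0].store(1); // chunk 0 may proceed immediately

    std::vector<std::thread> workers;
    for (int c = 0; c < n_chunks; c++) {
        workers.emplace_back([&flags, c]() {
            while (flags[c].load() == 0) { }  // spin until the predecessor signals
            std::cout << "chunk " << c << " running\n";
            flags[c + 1].fetch_add(1);        // release the successor
        });
    }
    for (auto &t : workers) t.join();
    return 0;
}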

kernel.cu

cudaError_t call_Padding_kernel(int blocks, int threads, int n, int m, int pad, int n_tasks, float alpha, 
    T *matrix_out, T *matrix, int *flags
#ifdef CUDA_8_0
    , int l_mem_size, int *worklist
#endif
    ){
        std::cout<<"call_pad start."<<std::endl;
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock
#ifdef CUDA_8_0
        , l_mem_size
#endif
        >>>(n, m, pad, n_tasks, alpha, 
        matrix_out, matrix, flags
#ifdef CUDA_8_0
        , worklist
#endif
        );
    cudaError_t err = cudaGetLastError();
    // NOTE: the launch above is asynchronous, so this dump can fire while the
    // kernel is still running -- the placement the comment at the top calls wrong.
    std::cout<<"dump.. after call_pad end."<<std::endl;
    m5_dump_stats(0,0);
    return err;
}
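
If the dump really had to stay in kernel.cu, the launch would need an explicit synchronization first. A sketch of that variant (it assumes the same declarations as the file above; the CUDA_8_0 branches are omitted for brevity). Note that this still says nothing about the CPU threads, which is why the dump ends up in kernel.cpp instead:

cudaError_t call_Padding_kernel_and_dump(int blocks, int threads, int n, int m, int pad,
    int n_tasks, float alpha, T *matrix_out, T *matrix, int *flags) {
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    Padding_kernel<<<dimGrid, dimBlock>>>(n, m, pad, n_tasks, alpha,
        matrix_out, matrix, flags);
    cudaError_t err = cudaGetLastError(); // reports launch failures only
    cudaThreadSynchronize();              // block until the kernel completes
    m5_dump_stats(0, 0);                  // the stats now cover the whole kernel
    return err;
}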

main.cpp

(Note: this main.cpp excerpt is from the histogram benchmark rather than the padding one above, but the launch/sync/join pattern is the same.)

    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
#ifdef CUDA_8_0
        for(int i = 0; i < p.n_bins; i++) {
            h_histo[i].store(0);
        }
#else
        memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
        cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
        cudaThreadSynchronize();
        CUDA_ERR();
#endif

        std::cout<<"m5 work begin."<<std::endl;

        // Launch GPU threads
        // Kernel launch
        if(p.n_gpu_blocks > 0) {
            std::cout<<"launch gpu."<<std::endl;
            cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins, 
                d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
            CUDA_ERR();
        }

        // Launch CPU threads
        std::cout<<"launch cpu."<<std::endl;
        std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
            p.n_gpu_threads, n_cpu_bins);
            std::cout<<"cuda sync."<<std::endl;

        cudaThreadSynchronize();
        std::cout<<"cpu join after cuda sync."<<std::endl;
        main_thread.join();

        //m5_work_end(0, 0);
        std::cout<<"m5 work end."<<std::endl;
    }
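
Putting the pieces together: a variant of the loop body above with the ROI annotations restored and the dump moved after the join (a sketch; m5_work_begin/m5_work_end are gem5's m5op ROI hooks, and everything else is copied from the loop above):

        m5_work_begin(0, 0); // ROI opens before either side launches
        if(p.n_gpu_blocks > 0) {
            cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_cpu_bins,
                d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int));
            CUDA_ERR();
        }
        std::thread main_thread(run_cpu_threads, (unsigned int *)h_histo, h_in, p.in_size, p.n_bins, p.n_threads,
            p.n_gpu_threads, n_cpu_bins);

        cudaThreadSynchronize(); // GPU side done
        main_thread.join();      // CPU side done

        m5_work_end(0, 0);   // ROI closes after both sides finish
        m5_dump_stats(0, 0); // a dump here covers both CPU and GPU work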
Original post: https://www.cnblogs.com/chenhuanBlogs/p/7788721.html