【CUDA开发】CUDA开发琐碎知识

//实现一个一维1*16的小矩阵的加法。

//矩阵大小:1*16
//分配一个block，共有16个线程并发。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cutil.h>

#define VEC_SIZE 16

// Element-wise vector addition kernel: d_C[i] = d_A[i] + d_B[i].
//
// Launch layout: one 1-D block. Each thread handles the element whose index
// equals its threadIdx.x; blockIdx is not used, so additional blocks would
// only repeat the same work.
// Preconditions: d_A, d_B and d_C must each reference at least VEC_SIZE
// floats that are accessible from device code.
__global__ void vecAdd(float* d_A,float* d_B,float* d_C)
{
    int index=threadIdx.x;
    // Guard the tail so that a launch with more than VEC_SIZE threads per
    // block cannot read or write past the end of the buffers.
    if (index<VEC_SIZE)
    {
        d_C[index]=d_A[index]+d_B[index];
    }
}

// Aborts the program with a readable message when a CUDA runtime call fails.
static void vecAddCheck(cudaError_t status,const char* what)
{
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the 1x16 vector addition example.
//
// Fills two host vectors with 1.0 and 2.0, copies them to the device, runs
// vecAdd with one block of VEC_SIZE threads, copies the sums back and prints
// them separated by tabs. Returns 0 on success; any failed allocation or CUDA
// call terminates the program with a non-zero exit status.
int main()
{
    // Size in bytes of each vector.
    const size_t size=VEC_SIZE*sizeof(float);

    // Host buffers.
    float* h_A=(float*)malloc(size);
    float* h_B=(float*)malloc(size);
    float* h_C=(float*)malloc(size);
    if (h_A==NULL||h_B==NULL||h_C==NULL)
    {
        fprintf(stderr,"host allocation of %zu bytes failed\n",size);
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize the inputs (float literals avoid a double -> float conversion).
    for (int i=0;i<VEC_SIZE;++i)
    {
        h_A[i]=1.0f;
        h_B[i]=2.0f;
    }

    // Copy the host inputs into freshly allocated device buffers.
    float* d_A=NULL;
    vecAddCheck(cudaMalloc((void**)&d_A,size),"cudaMalloc(d_A)");
    vecAddCheck(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice),"cudaMemcpy(d_A)");

    float* d_B=NULL;
    vecAddCheck(cudaMalloc((void**)&d_B,size),"cudaMalloc(d_B)");
    vecAddCheck(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice),"cudaMemcpy(d_B)");

    // Device buffer that receives the result.
    float* d_C=NULL;
    vecAddCheck(cudaMalloc((void**)&d_C,size),"cudaMalloc(d_C)");

    // One block with one thread per vector element.
    dim3 dimblock(VEC_SIZE);
    vecAdd<<<1,dimblock>>>(d_A,d_B,d_C);
    // A kernel launch returns no status itself; query it explicitly.
    vecAddCheck(cudaGetLastError(),"vecAdd launch");

    // Copy the result back to host memory. This blocking copy also reports
    // errors raised while the kernel was executing.
    vecAddCheck(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost),"cudaMemcpy(h_C)");

    // Print the result, tab separated.
    for (int j=0;j<VEC_SIZE;++j)
    {
        printf("%f\t",h_C[j]);
    }

    // Release device and host memory.
    vecAddCheck(cudaFree(d_A),"cudaFree(d_A)");
    vecAddCheck(cudaFree(d_B),"cudaFree(d_B)");
    vecAddCheck(cudaFree(d_C),"cudaFree(d_C)");

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

## cudaMallocPitch()的使用

名称 cudaMallocPitch – 向GPU分配存储器

概要 cudaError_t cudaMallocPitch( void** devPtr, size_t* pitch, size_t widthInBytes, size_t height )

说明：向设备分配至少widthInBytes*height字节的线性存储器，并以*devPtr的形式返回指向所分配存储器的指针。该函数可以填充所分配的存储器，以确保在地址从一行更新到另一行时，给定行的对应指针依然满足对齐要求。cudaMallocPitch()以*pitch的形式返回间距，即所分配存储器的宽度，以字节为单位。间距用作存储器分配的一个独立参数，用于在2D数组内计算地址。如果给定一个T类型数组元素的行和列，可按如下方法计算地址：

T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

对于2D数组的分配，建议程序员考虑使用cudaMallocPitch()来执行间距分配。由于硬件中存在间距对齐限制，如果应用程序将在设备存储器的不同区域之间执行2D存储器复制（无论是线性存储器还是CUDA数组），这种方法将非常有用。

例子：为EmuDebug
原来《CUDA编程指南》上给出的pitch的类型为int，在实际运行时与cudaMallocPitch()类型不匹配。

/************************************************************************/
/* This is an example of a CUDA program.                                */
/************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

/************************************************************************/
/* myKernel                                                           */
/************************************************************************/
// Prints every element of a pitched 2-D float buffer, one value per line.
//
// devPtr : base address of the buffer.
// height : number of rows to visit.
// width  : number of float elements to read in each row.
// pitch  : distance in bytes between the starts of consecutive rows.
//
// The loops do not depend on threadIdx/blockIdx, so every launched thread
// walks the whole buffer; launch with a single thread to print each element
// once.
__global__ void myKernel(float* devPtr,int height,int width,int pitch)
{
    for (int r=0;r<height;++r)
    {
        // Rows are pitch bytes apart, so advance in bytes before casting back
        // to float*; size_t keeps the byte offset from overflowing int.
        float* row=(float*)((char*)devPtr+(size_t)r*pitch);
        for (int c=0;c<width;++c)
        {
            float element=row[c];
            printf("%f\n",element);// originally run in device emulation (EmuDebug) mode
        }
    }
}

/************************************************************************/
/* Main CUDA                                                            */
/************************************************************************/
// Aborts the program with a readable message when a CUDA runtime call fails.
static void pitchDemoCheck(cudaError_t status,const char* what)
{
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the cudaMallocPitch example.
//
// Allocates a pitched 10x10 float buffer, zeroes it, lets a single thread of
// myKernel print its contents, frees the buffer and finally prints the pitch
// (in bytes) chosen by the runtime. argc/argv are unused.
int main(int argc, char* argv[])
{
    const size_t width=10;
    const size_t height=10;

    float* decPtr=NULL;
    // pitch must be a size_t: cudaMallocPitch takes a size_t*, so an int
    // variable does not match the parameter type.
    size_t pitch=0;
    pitchDemoCheck(cudaMallocPitch((void**)&decPtr,&pitch,width*sizeof(float),height),
                   "cudaMallocPitch");

    // Give the buffer defined contents so the kernel does not print
    // uninitialized memory.
    pitchDemoCheck(cudaMemset2D(decPtr,pitch,0,width*sizeof(float),height),
                   "cudaMemset2D");

    // myKernel takes (devPtr, height, width, pitch) as int values.
    myKernel<<<1,1>>>(decPtr,(int)height,(int)width,(int)pitch);
    pitchDemoCheck(cudaGetLastError(),"myKernel launch");
    // Wait for the kernel so that execution errors are reported and its
    // device-side printf output is flushed before the buffer is released.
    pitchDemoCheck(cudaDeviceSynchronize(),"myKernel execution");
    pitchDemoCheck(cudaFree(decPtr),"cudaFree");

    // %zu is the printf conversion for size_t.
    printf("%zu\n",pitch);

    //CUT_EXIT(argc, argv);

    return 0;
}

## cudaMallocArray()与cudaMemcpyToArray()的使用

名称:
cudaMemcpyToArray – 在主机和设备间复制数据

概要:
cudaError_t cudaMemcpyToArray(struct cudaArray* dstArray, size_t dstX, size_t dstY, const void* src, size_t count, enum cudaMemcpyKind kind)
cudaError_t cudaMemcpyToArrayAsync(struct cudaArray* dstArray, size_t dstX, size_t dstY, const void* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)

说明
从src指向的存储器区域内将count个字节复制到一个CUDA数组dstArray，该数组的左上角从(dstX，dstY)开始，其中kind是cudaMemcpyHostToHost、cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost或cudaMemcpyDeviceToDevice之一，用于指定复制的方向。
cudaMemcpyToArrayAsync()是异步的，可选择传入非零流参数，从而将其关联到一个流。它仅对分页锁定的主存储器有效，如果传入指向可分页存储器的指针，那么将返回一个错误。

返回值
相关返回值：
cudaSuccess
cudaErrorInvalidValue
cudaErrorInvalidDevicePointer
cudaErrorInvalidMemcpyDirection
注意，如果之前是异步启动，该函数可能返回错误码。

注：
在《CUDA编程指导》中，对cudaMemcpyToArray()函数的使用，个人觉得有错误。
enum cudaMemcpyKind kind，应该是cudaMemcpyHostToHost、cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost或cudaMemcpyDeviceToDevice之一。
在指导中使用的是cudaMemcpyToArray(cuArray,0,0,h_data,&channelDesc)，channelDesc为cudaChannelFormatDesc类型，不是cudaMemcpyKind。

/*********************************************************************/
/* This is an example of a CUDA program.*/
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

/************************************************************************/
/* myKernel                                                           */
/************************************************************************/

/************************************************************************/
/* Main CUDA                                                            */
/************************************************************************/
// Aborts the program with a readable message when a CUDA runtime call fails.
static void arrayDemoCheck(cudaError_t status,const char* what)
{
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the CUDA array example.
//
// Fills a 10x10 host matrix, allocates a CUDA array with one 32-bit channel,
// copies the matrix into it starting at offset (0,0) and frees the array.
// argc/argv are unused.
int main(int argc, char* argv[])
{
    const int width=10;
    const int height=10;

    // Initialize h_array.
    int h_array[width][height];
    for (int i=0;i<width;++i)
    {
        for (int j=0;j<height;++j)
        {
            h_array[i][j]=j+i*64;
        }
    }

    // The channelDesc structure describes the number of components and the
    // data type of the CUDA array: here a single 32-bit component.
    // NOTE(review): h_array holds int values while the channel kind is
    // Unsigned (kept from the original) - confirm which is intended.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindUnsigned);
    cudaArray* cuArray=NULL;
    arrayDemoCheck(cudaMallocArray(&cuArray,&channelDesc,width,height),"cudaMallocArray");

    const size_t sizeMem=width*height*sizeof(int);
    const size_t potX=0;
    const size_t potY=0;
    // The source is the host array h_array and the destination is the CUDA
    // array, so the copy direction is host-to-device.
    // NOTE: cudaMemcpyToArray is marked deprecated in recent CUDA toolkits;
    // it is kept here because this section documents that function.
    arrayDemoCheck(cudaMemcpyToArray(cuArray,potX,potY,h_array,sizeMem,cudaMemcpyHostToDevice),
                   "cudaMemcpyToArray");

    arrayDemoCheck(cudaFreeArray(cuArray),"cudaFreeArray");

    return 0;
}

## CUDA统计时间

在CUDA中统计运算时间，大致有三种方法：

<1>使用cutil.h中的函数
unsigned int timer=0;
// Create the timer.
cutCreateTimer(&timer);
// Start timing.
cutStartTimer(timer);
{
// Code section to be timed.
…………
}
// Stop timing.
cutStopTimer(timer);
// Read the time measured between the start and the stop call
// (the returned value is discarded in this fragment).
cutGetTimerValue( timer);
// Delete the timer.
cutDeleteTimer( timer);

目前不清楚这种方式的计时精度。

<2>time.h中的clock函数
clock_t start, finish;
float costtime;
// Take the first clock() reading before the timed section.
start = clock();
{
// Code section to be timed.
…………
}
// Take the second reading after it.
finish = clock();
// Difference between the two readings, converted from clock ticks to seconds.
costtime = (float)(finish - start) / CLOCKS_PER_SEC;
时钟计时单元的长度由CLOCKS_PER_SEC决定：若CLOCKS_PER_SEC为1000，即计时单元的长度为1毫秒，那么计时的精度也为1毫秒。

<3>事件event
// Time a code section with CUDA events; costtime receives the elapsed time
// in milliseconds.
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on stream 0.
cudaEventRecord(start,0);
{
// Code section to be timed.
…………
}
cudaEventRecord(stop,0);
// cudaEventRecord only enqueues the event; wait until stop has actually been
// reached, otherwise cudaEventElapsedTime fails with cudaErrorNotReady.
cudaEventSynchronize(stop);
float costtime;
cudaEventElapsedTime(&costtime,start,stop);
// Release the event objects once the measurement has been read.
cudaEventDestroy(start);
cudaEventDestroy(stop);

cudaError_t cudaEventCreate( cudaEvent_t* event )---创建事件对象；
cudaError_t cudaEventRecord( cudaEvent_t event, cudaStream_t stream )---记录事件；
cudaError_t cudaEventElapsedTime( float* time, cudaEvent_t start, cudaEvent_t end )---计算两次事件之间相差的时间；
cudaError_t cudaEventDestroy( cudaEvent_t event )---销毁事件对象。
计算两次事件之间相差的时间（以毫秒为单位，精度为0.5微秒）。如果尚未记录其中任何一个事件，此函数将返回cudaErrorInvalidValue。如果记录其中任何一个事件使用了非零流，则结果不确定。

## CUDA代码常用编写技巧

1. 声明 __shared__ 变量或数组:

__shared__ float sh_farr[ 256];
__shared__ int a;
2.结构体指针成员的分配设备内存:

// Example structure with two pointer members that are meant to receive
// device memory allocated with cudaMalloc.
typedef struct Teacher_t
{
    int a;
    unsigned int    *g_mem1;   // buffer of unsigned int elements
    float            *g_mem2;  // buffer of float elements
}Teacher;
// Allocates the device buffers referenced by the pointer members of t.
//
// t        : structure whose g_mem1 / g_mem2 members receive the new device
//            pointers.
// mat_size : number of elements to allocate for each buffer.
void initMem( Teacher& t, const unsigned int mat_size)
{
    // Byte counts are held in size_t so that large element counts do not
    // wrap around an unsigned int.
    const size_t mat_size_ui = sizeof(unsigned int) * mat_size;
    const size_t mat_size_f = sizeof(float) * mat_size;
    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem1, mat_size_ui) );
    // The float buffer belongs to g_mem2; allocating into g_mem1 a second
    // time would leak the first buffer and leave g_mem2 unset.
    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem2, mat_size_f) );
    // ... (further initialization was elided in the original listing)
}
3.计时:

// Time a kernel with the cutil host timer and print the result.
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
{
    // ... kernel launch(es) to be timed go here

    // Kernel launches return before the kernel has finished; wait for the
    // device so that the host timer covers the kernel's execution.
    CUDA_SAFE_CALL( cudaDeviceSynchronize());
}
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Total time: %f ms ", cutGetTimerValue( timer) );
CUT_SAFE_CALL( cutDeleteTimer( timer));
4. 获取输入命令行中包含的文件名:

////////////////////////////////////////////////////////////////////////////////
//! Check if a particular filename has to be used for the file where the result
//! is stored
//! @param argc number of command line arguments (from main(argc, argv)
//! @param argv pointers to command line arguments (from main(argc, argv)
//! @param filename filename of result file, updated if user specified
//! filename; when updated it points to a malloc'ed copy of the
//! "filename-result" command line value
////////////////////////////////////////////////////////////////////////////////
void
getResultFilename( int argc, char** argv, char*& filename)
{

    char* temp = NULL;
    cutGetCmdLineArgumentstr( argc, (const char**) argv, "filename-result", &temp);
    if( NULL != temp)
    {
        // strlen() does not count the terminating NUL, so reserve one extra
        // byte; otherwise strcpy() writes past the end of the buffer.
        char* copy = (char*) malloc( sizeof(char) * (strlen( temp) + 1));
        if( NULL != copy)
        {
            strcpy( copy, temp);
            filename = copy;
        }
        // On allocation failure filename keeps the value passed in by the caller.
        cutFree( temp);
    }
    printf( "Result filename: '%s' ", filename);
}
类似的:

////////////////////////////////////////////////////////////////////////////////
//! Check if a specific precision of the eigenvalue has to be obtained
//! @param argc number of command line arguments (from main(argc, argv)
//! @param argv pointers to command line arguments (from main(argc, argv)
//! @param precision precision to use, updated only if a value greater than
//!                  zero is read for the "precision" command line argument
////////////////////////////////////////////////////////////////////////////////
void
getPrecision( int argc, char** argv, float& precision)
{
    // Negative sentinel: stays negative when no value is written into temp.
    float temp = -1.0f;
    cutGetCmdLineArgumentf( argc, (const char**) argv, "precision", &temp);
    if( temp > 0.0f)
    {
        precision = temp;
    }
    printf( "Precision: %f ", precision);
}
5.Host调用完kernel函数后需要进行线程同步，而在kernel（__global__函数）内部只需要在必要的地方调用__syncthreads()即可：

// Block the host until all previously issued device work has finished.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
CUDA_SAFE_CALL( cudaDeviceSynchronize());

本文来自CSDN博客，转载请标明出处：http://blog.csdn.net/dvchn/archive/2008/02/25/2119590.aspx