CUDA Course Project: Image Feature Matching, V1.0

Problem 4: Image Feature Matching Algorithm

The following MATLAB code implements an image feature matching algorithm. BTR.txt, BMP.txt, and T.txt are the inputs and hold the image feature representations of three target classes; the variable test holds the feature vector of the target to be matched, and the output type_string gives its class. The task is to rewrite the algorithm in C, optimize it on the GPU, and compare and analyze its performance against the CPU version.

        BTR_train=load('BTR.txt');
        BMP_train=load('BMP.txt');
        T_train=load('T.txt');

        [M N]=size(BTR_train);
        [M1 N1]=size(BMP_train);
        [M2 N2]=size(T_train);

        test=[131.0000 5.2004 0.5679 1.5263 2.1928 2.7875 ...
              5.2777 4.2365 5.2146 3.3337 2.9507 1.7726 3.1895];

        n=1;
        dis_BTR=inf*ones(20,2);
        for m=1:M
            if(abs(test(1)-BTR_train(m,1))<=10)
                dis_BTR(n,1)=sqrt(sum((test(2:13)-BTR_train(m,2:13)).^2));
                dis_BTR(n,2)=BTR_train(m,14);
                n=n+1;
            end
        end
        dis=min(dis_BTR(:,1));
        BTR_tag=dis_BTR(dis==dis_BTR,2);
        dis_BTR(:,1)=sort(dis_BTR(:,1));
        dis_BTR1=sum(dis_BTR(1:3,1))/3;

        n1=1;
        dis_BMP=inf*ones(20,2);
        for m1=1:M1
            if(abs(test(1)-BMP_train(m1,1))<=10)
                dis_BMP(n1,1)=sqrt(sum((test(2:13)-BMP_train(m1,2:13)).^2));
                dis_BMP(n1,2)=BMP_train(m1,14);
                n1=n1+1;
            end
        end
        dis=min(dis_BMP(:,1));
        BMP_tag=dis_BMP(dis==dis_BMP,2);
        dis_BMP(:,1)=sort(dis_BMP(:,1));
        dis_BMP1=sum(dis_BMP(1:3,1))/3;

        n2=1;
        dis_T=inf*ones(20,2);
        for m2=1:M2
            if(abs(test(1)-T_train(m2,1))<=10)
                dis_T(n2,1)=sqrt(sum((test(2:13)-T_train(m2,2:13)).^2));
                dis_T(n2,2)=T_train(m2,14);
                n2=n2+1;
            end
        end
        dis=min(dis_T(:,1));
        T_tag=dis_T(dis==dis_T,2);
        dis_T(:,1)=sort(dis_T(:,1));
        dis_T1=sum(dis_T(1:3,1))/3;

        dis=[dis_BTR1 dis_BMP1 dis_T1];

        if(min(dis)==dis_BTR1)
            type_string='BTR-70';
        elseif(min(dis)==dis_BMP1)
            type_string='BMP-2';
        else
            type_string='T-72';
        end
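In C terms, the score each class receives above is the mean of the three smallest Euclidean distances among the training rows whose first feature lies within 10 of the test vector's first feature. A minimal CPU sketch of that rule is given below; the helper name class_score and the row-major layout with 14 floats per row are only illustrative assumptions, not part of the original code.

#include <math.h>

/* Sketch only: per-class score = mean of the 3 smallest gated Euclidean distances. */
float class_score(const float *train, int rows, const float *test)
{
    float best[3] = { 1e30f, 1e30f, 1e30f };   /* three smallest distances found so far */
    for (int r = 0; r < rows; ++r) {
        const float *row = train + r * 14;     /* 14 columns: gate feature, 12 features, label */
        if (fabsf(test[0] - row[0]) > 10.0f)   /* same gate as abs(test(1)-train(m,1))<=10 */
            continue;
        float d = 0.0f;
        for (int k = 1; k < 13; ++k) {         /* columns 2..13 in MATLAB indexing */
            float diff = test[k] - row[k];
            d += diff * diff;
        }
        d = sqrtf(d);
        for (int k = 0; k < 3; ++k) {          /* insert d into the sorted best[] array */
            if (d < best[k]) { float t = best[k]; best[k] = d; d = t; }
        }
    }
    return (best[0] + best[1] + best[2]) / 3.0f;
}

The final decision then simply picks the class with the smallest score, exactly as the if/elseif chain above does; this is what both the CPU and GPU versions below reproduce.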

--------------------------------------------------------------------------------

Below is our implementation.

Data augmentation part:

#! /usr/bin/env python
############################################################################
#
# Copyright (c) 2015 UCAS.com, Inc. All Rights Reserved
#
###########################################################################
"""
Brief:
Authors: raoqiang(raoqiangwill@163.com)
Date: 2015/05/19
File: create_data.py
"""
import sys

def create_data(fileName, newdataName, row, mul):
    file = open(fileName, 'r')
    if not file:
        print "can not open the file"
    line = file.readline()
    cre_data = open(newdataName, 'w')
    line_num = row
    multiply = mul
    while line:
        lineA = line.split()
        for j in range(0, multiply):
            cre_data.write(lineA[0] + ' ')
            for i in range(1, 13):
                b = float(lineA[i]) + 1.0 / (j + 1)   # small per-copy perturbation of the 12 features
                cre_data.write(str(b) + ' ')
            # offset the label for each copy and end the record with a newline
            cre_data.write(str(float(lineA[13]) + j * line_num) + '\n')
        line = file.readline()
    cre_data.close()

def main():
    filename = sys.argv[1]
    newdataname = sys.argv[2]
    row = int(sys.argv[3])
    mul = int(sys.argv[4])
    create_data(filename, newdataname, row, mul)
    print "create the data successfully"

if __name__ == '__main__':
    main()
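The augmented files referenced below (data/BTR10000.txt and so on) can be produced with invocations along these lines; the row counts 181/168 and the factor 10000 are taken from the constants M, M1, M2 and MULNUM in the CUDA source, while the exact file paths are only an assumption:

python create_data.py BTR.txt data/BTR10000.txt 181 10000
python create_data.py BMP.txt data/BMP10000.txt 168 10000
python create_data.py T.txt data/T10000.txt 168 10000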

CUDA code, V1.0:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <math.h>
#include<vector>
#include<algorithm>
#include<time.h>

using namespace std;
template<typename T>
void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix);
float CalDist(float*mat, int row, int col);
float * MatrixInver(float A[], int m, int n);
void matrixTranspose(float* matrix, int M, int N);
int getFileColumns(const char *fileName);
int getFileRows(const char *fileName);

cudaStream_t stream[3];
int gpu[3] = { 5, 5, 5 }; // CUDA device index used for each of the three classes; all on device 5 here

#define inf 1000000
#define CUDA_CHECK_RETURN(value) { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\n", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(1); \
} }

void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s. ", msg, cudaGetErrorString(err));
exit(-1);
}
}


const int M = 181, N = 14, M1 = 168, N1 = 14, M2 = 168, N2 = 14;
__constant__ float test[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };
const float testcpu[13] = { 131.0000, 5.2004, 0.5679, 1.5263, 2.1928, 2.78, 5.2777, 4.2365, 5.2146, 3.3337, 2.9507, 1.7726, 3.1895 };

#define SHARE 256
__global__ void global_sort(float * a/*first three*blockNum1 element find by CalDis_Sort */, float*first_thr/*first three minimum */,int num)//improved rank sort
{
__shared__ float temp[SHARE];
float temp_rank;
int thdI = threadIdx.x;
int blockDx=blockDim.x;
int till;
int i,j,k;
__shared__ float tosort[3]; // unused
if (thdI < 3)
first_thr[blockIdx.x * 3 + thdI] = inf; // pre-fill the output slots in case this block holds fewer than 3 values
if ((blockIdx.x + 1)*SHARE > num) // last block is only partially filled
{
till = num % SHARE;
}
else
{
till = SHARE;
}
int icycle=(till+blockDx-1)/blockDx;
for(i=0;i<icycle;i++)
{
if (blockIdx.x*SHARE+thdI+i*blockDx < num)
temp[thdI+i*blockDx]=a[blockIdx.x*SHARE+thdI+i*blockDx];
// else
// temp[thdI+i*blockDx]=inf;
}
__syncthreads();
for(i=0;i<(till+blockDx-1)/blockDx;i++)
{

if(thdI + i*blockDx >= till) // positions at or beyond till were not loaded
break;
temp_rank = temp[thdI+i*blockDx];

k = 0;
for (j= 0; j < till; j++)
{
if (temp_rank > temp[j])
{
++k;
}
else if (temp_rank == temp[j] && thdI+i*blockDx > j)
{
++k;
}
}
if (k<3){
first_thr[blockIdx.x * 3 + k] = temp_rank;//temp_rank;
}
}

}
__global__ void CalDis_Sort(float *totest, float *b/*matrix to cal Euclidean dis*/, float *tosort, int R/*Matrix row number*/, int C/*colum number*/)
{
/********************** CalDis ****************************/

__shared__ float temp[256];
__shared__ float first_three[3]; // the block's three smallest squared distances
// __shared__ float share_test[13];
int blockIdxx=blockIdx.x;
int blockDimx=blockDim.x;
int thdI = threadIdx.x;
int i,j,k;
float addup;
int index = thdI + blockIdx.x*blockDimx;
float temp_add = 0;
float temp_rank;
float tosort1[3];
temp[thdI] = inf;
// if (thdI < 13) share_test[thdI] = totest[thdI];

__syncthreads();
// for(int i=0;i<13;i++)
// share_test[i] = test[i];

if (index<R)
{
if (fabsf(test[0] - b[index]) <= 10) // gate on the first feature, as in the MATLAB code
{
for (i = 1; i < 13; i++)
{
addup = test[i] - b[i*R + index];
temp_add += addup*addup;
}
temp[thdI] = temp_add;

}
}
__syncthreads();
/********************** Sort ****************************/

temp_rank = temp[thdI];

k = 0;
for (j = 0; j < blockDimx; j++)
{

if (temp_rank > temp[j])
{
++k;
}
else if (temp_rank == temp[j] && thdI>j)
{
++k;
}
}
if (k<3){
first_three[k] = temp_rank; // this thread owns one of the block's three smallest values
}
__syncthreads();
if (thdI < 3)
tosort[blockIdxx * 3 + thdI] = first_three[thdI];

}

//__global__ void appendINF(float *a)
//{
// a[threadIdx.x+blockIdx.x*blockDim.x]=inf;__syncthreads();
//}
class CRunGPU{
public:
cudaEvent_t start_gpu, end_gpu;
float consume_gpu;
int row;
float *BTR_train;

float *dev_first_thr;
float *dev_BTR_train;
float *dev_tosort;
float *dev_test;
float *tosort_host;
float *sort_block;
int temp_gpu;
cudaStream_t temp_stream;


int threadsPerBlock ;
int blocksPerGrid ;

int threadsPerBlock2;
int blocksPerGrid2 ;

int flag;
int flag2;//sort
int R;
int C;
const char*fileName;
CRunGPU(const char*fileName, int R/*row*/, int C/*column*/,int flag)//construc function
{
this->R=R;
this->C=C;
this->fileName=fileName;
this->flag=flag;
row = 256 * ((R + 255) / 256); // pad the row count up to a multiple of 256 (one full thread block)
BTR_train = (float*)malloc(row*C*sizeof(float));

threadsPerBlock = 256;
blocksPerGrid = (R + threadsPerBlock - 1) / threadsPerBlock;

threadsPerBlock2 = 256;
blocksPerGrid2 = (blocksPerGrid * 3 + threadsPerBlock2 - 1) / threadsPerBlock2;

cout << "BTR_train" << row << "C" << C << endl;
readMatrixfromTXT<float>(fileName, C, R, BTR_train);
cout << "read matrix successful!" << endl;
//BTR_train = MatrixInver(BTR_train, row, C);
cout << "Transpose matrix success!" << endl;
tosort_host = (float *)malloc(3 * sizeof(float));
switch (flag)
{
case(0) : temp_stream = stream[0]; temp_gpu = gpu[0]; break;

case(1) : temp_stream = stream[1]; temp_gpu = gpu[1]; break;

case(2) : temp_stream = stream[2]; temp_gpu = gpu[2]; break;
}

cout << "malloc and copy " << endl;
cudaSetDevice(temp_gpu);
cudaStreamCreate(&temp_stream);
CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_test, 13 * sizeof(float)));

CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_first_thr, 3 * blocksPerGrid2 * sizeof(float)));

CUDA_CHECK_RETURN(cudaMalloc((void**)&dev_BTR_train, C*row*sizeof(float)));
CUDA_CHECK_RETURN(cudaMemcpyAsync(dev_test, testcpu, 13 * sizeof(float), cudaMemcpyHostToDevice, temp_stream)); // copy from the host array; the kernel itself reads the __constant__ copy
CUDA_CHECK_RETURN(cudaMemcpyAsync(dev_BTR_train, BTR_train, C*row*sizeof(float), cudaMemcpyHostToDevice, temp_stream));
//CUDA_CHECK_RETURN(cudaMemcpy(dev_BTR_train, BTR_train, C*row*sizeof(float), cudaMemcpyHostToDevice));
cout << "strat calculate" << endl;
/*********************** calculate BTR ******************************/
cudaMalloc((void**)&dev_tosort, blocksPerGrid * 3 * sizeof(float));
sort_block = (float *)malloc(blocksPerGrid * 3 * sizeof(float));

}
void compute()
{
cudaSetDevice(temp_gpu);
// cudaEventCreate(&start_gpu);
// cudaEventCreate(&end_gpu);

// cudaEventRecord(start_gpu, temp_stream);
CalDis_Sort << <blocksPerGrid, threadsPerBlock, 400 * sizeof(float), temp_stream >> >(dev_test, dev_BTR_train, dev_tosort, row, C);
// cudaEventRecord(end_gpu, temp_stream);
// cudaEventSynchronize(end_gpu);
// cudaEventElapsedTime(&consume_gpu, start_gpu, end_gpu);
// cout<<"caldis_sort:"<<consume_gpu<<endl;
// cout<<"sort done"<<endl;
// checkCUDAError("kernel invocation1");
int num=blocksPerGrid *3;
// threadsPerBlock2 = 512;
// blocksPerGrid2 =(num + threadsPerBlock2 -1)/ threadsPerBlock2;
flag2 =1;
//appendINF<<<3 ,threadsPerBlock2 >>>(dev_first_thr);
//for(int i=0;i<num;i++)
// cout<<"num::"<<num<<endl;
while(num>3)
{
//threadsPerBlock2 = 512;
blocksPerGrid2 = (num + threadsPerBlock2 -1)/threadsPerBlock2;
if(flag2 == 1)
{
//cout<<"flag2==1"<<endl;


global_sort << <blocksPerGrid2, threadsPerBlock2,SHARE*4 , temp_stream >> >(dev_tosort, dev_first_thr,num);//cout<<"flag2==1"<<endl;

flag2=0;
}
else
{
global_sort << <blocksPerGrid2, threadsPerBlock2,SHARE*4 , temp_stream >> >(dev_first_thr,dev_tosort,num);
flag2=1;
}
num = blocksPerGrid2*3;
// cout <<"num-- "<<num;
// cout<<endl<<"blocksPerGrid2::"<<blocksPerGrid2<<endl;
}



// cudaEventRecord(bmp.end_gpu, bmp.temp_stream);
//cudaEventRecord(end_gpu, temp_stream);

}
float getResult()
{
if(flag2==1)
cudaMemcpyAsync(tosort_host, dev_tosort, 3 * sizeof(float), cudaMemcpyDeviceToHost, temp_stream);
else
cudaMemcpyAsync(tosort_host, dev_first_thr, 3 * sizeof(float), cudaMemcpyDeviceToHost, temp_stream);
cudaStreamSynchronize(temp_stream); // make sure the async copy has finished before reading tosort_host
//CUDA_CHECK_RETURN(cudaMemcpyAsync(tosort_host, dev_first_thr, sizeof(float) * 3, cudaMemcpyDeviceToHost, temp_stream));
//cout << "tosort_host[0]:=" << tosort_host[0] << " ,tosort_host[1]:=" << tosort_host[1] << ", tosort_host[2]:=" << tosort_host[2] << endl;
tosort_host[0] = (sqrt(tosort_host[0]) + sqrt(tosort_host[1]) + sqrt(tosort_host[2]));


cout << fileName << "distance is" << endl;
printf("%f ", tosort_host[0]/3);
return tosort_host[0];
}

virtual ~CRunGPU()
{

switch (flag)
{
case(0) : temp_stream = stream[0]; temp_gpu = gpu[0]; break;

case(1) : temp_stream = stream[1]; temp_gpu = gpu[1]; break;

case(2) : temp_stream = stream[2]; temp_gpu = gpu[2]; break;
}
cudaSetDevice(temp_gpu);
CUDA_CHECK_RETURN(cudaFree(dev_BTR_train));
CUDA_CHECK_RETURN(cudaFree(dev_test));
free(BTR_train);
free(tosort_host);
CUDA_CHECK_RETURN(cudaFree(dev_tosort));
CUDA_CHECK_RETURN(cudaFree(dev_first_thr));
free(sort_block);
}
};


//void runCPU(const char*fileName){}

#define MULNUM 10000
#define GPUCYCLE 10
#define CPUCYCLE 10
const char *fileNameBTR = "data/BTR10000.txt";const char *fileNameBMP = "data/BMP10000.txt";const char *fileNameT = "data/T10000.txt";

int main()
{
/********************************************
* GPU go first
********************************************/
printf("***********************START OF GPU***********************\n");
int row_M = getFileRows(fileNameBTR);
int row_M1 = getFileRows(fileNameBMP);
int row_M2 = getFileRows(fileNameT);
cout << row_M << endl;
cout << "creating array of stream and gpu success" << endl;
//double consume_gpu=10;

CRunGPU btr(fileNameBTR, row_M, N,0) ;
CRunGPU bmp(fileNameBMP, row_M1, N1,1) ;
CRunGPU t(fileNameT, row_M2, N2,2) ;

clock_t gpuParelledTime_b,gpuParelledTime_e;
gpuParelledTime_b=clock();
for(int i=0;i<GPUCYCLE;i++)
{
btr.compute();
bmp.compute();
t.compute();
cudaStreamSynchronize(btr.temp_stream);
cudaStreamSynchronize(bmp.temp_stream);
cudaStreamSynchronize(t.temp_stream);
// cudaEventSynchronize(btr.end_gpu); // ratio = 870
// cudaEventSynchronize(bmp.end_gpu); // ratio = 860
// cudaEventSynchronize(t.end_gpu); //ratio = 860
}
gpuParelledTime_e=clock();
double consume_gpu=(double)( gpuParelledTime_e - gpuParelledTime_b ) / CLOCKS_PER_SEC * 1000 /GPUCYCLE;
printf("gpu_time=%f ", consume_gpu);
btr.getResult();
bmp.getResult();
t.getResult();

//float consume_gpu = (gpu_t1 + gpu_t2 + gpu_t3) / 3;
printf("***********************END OF GPU************************* ");
printf(" ");
printf(" ");
/********************************************
* CPU go second
********************************************/

printf("***********************START OF CPU***********************\n");
int col, row;
float distBMP, distBTR, distT;
float*dataMatrix1, *dataMatrix2, *dataMatrix3;
//vector<float> DisBTR;
//vector<float> DisBMP;
//vector<float> DisT;

//BTR

col = getFileColumns(fileNameBTR);
row = getFileRows(fileNameBTR);
int r_btr = 256 * int((row + 255) / 256);
//printf("row:%d,col:%d ",row,col);
dataMatrix1 = (float*)malloc(col*r_btr*sizeof(float));
if (dataMatrix1){
//printf("malloc matrix successful! ");
readMatrixfromTXT<float>(fileNameBTR, col, row, dataMatrix1);
}
cout << "col" << col << endl;

// BMP

col = getFileColumns(fileNameBMP);
row = getFileRows(fileNameBMP);
int r_bmp = 256 * int((row + 255) / 256);

//printf("row:%d,col:%d ",row,col);
dataMatrix2 = (float*)malloc(col*r_bmp*sizeof(float));
if (dataMatrix2){
//printf("malloc matrix successful! ");
readMatrixfromTXT<float>(fileNameBMP, col, row, dataMatrix2);
}


// T

col = getFileColumns(fileNameT);
row = getFileRows(fileNameT);
int r_t = 256 * int((row + 255) / 256);
//printf("row:%d,col:%d ",row,col);
dataMatrix3 = (float*)malloc(col*r_t*sizeof(float));
if (dataMatrix3){
//printf("malloc matrix successful! ");
readMatrixfromTXT<float>(fileNameT, col, row, dataMatrix3);
}



clock_t start_cpu = clock();
//printf("start_cpu: %f ", start_cpu);
for (int cnt = 0; cnt<CPUCYCLE; ++cnt){
distBTR = CalDist(dataMatrix1, r_btr, col);
distBMP = CalDist(dataMatrix2, r_bmp, col);
distT = CalDist(dataMatrix3, r_t, col);
}
clock_t end_cpu = clock();
//printf("end_cpu: %f ", end_cpu);
clock_t consume_cpu = (end_cpu - start_cpu);


//DisBTR.push_back(distBTR);
printf("The distance of BTR is %f ", distBTR);
//DisBMP.push_back(distBMP);
printf("The distance of BMP is %f ", distBMP);
//DisT.push_back(distT);
printf("The distance of T is %f ", distT);
free(dataMatrix1);
free(dataMatrix2);
free(dataMatrix3);

if (distBMP < distBTR)
{
if (distBMP < distT)
{
printf("The type of picture is BMP\n");
string type = "BMP";
}
else
{
printf("The type of picture is T\n");
string type = "T";
}
}
else
{
if (distBTR < distT)
{
printf("The type of picture is BTR\n");
string type = "BTR";
}
else
{
printf("The type of picture is T\n");
string type = "T";
}
}
printf("cpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / CPUCYCLE);
printf("***********************END OF CPU************************* ");
printf("cpu_time/gpu_time=%f ", (double)consume_cpu / CLOCKS_PER_SEC * 1000 / consume_gpu / CPUCYCLE);
//printf("cpu_time/gpu_time=%f ", (double)totaltime/CLOCKS_PER_SEC*1000 / consume_gpu);

return 0;
}


// Partial bubble sort: three passes from the tail are enough to move the three smallest values to the front.
void BubbleSort(float* pData/*array*/, int count/*the dimension of array*/)
{
float temp;
for (int i = 1; i < 4; i++)
{
for (int j = count - 1; j >= i; j--)
{
if (pData[j] < pData[j - 1])
{
temp = pData[j - 1];
pData[j - 1] = pData[j];
pData[j] = temp;
}
}
}
}

template<typename T>
void readMatrixfromTXT(const char *fileName, const int numColumn, const int numRow, T *matrix)
{
// std::ifstream fin(fileName,std::ifstream::in);
ifstream fin(fileName);
// ifstream fin(fileName.c_str(),ios::in);
if (!fin)
{
cerr << "cannot open file" << endl;
exit(1);
}
string line;
float tmp;
int j = 0;
int i = 0;
int numRow2 = 256 * int((numRow + 255) / 256);
for (i = 0; i<numRow - 1; i++){
getline(fin, line);
j = 0;
//for(int j=0;j<numColumn;j++){
istringstream istr(line);
while (istr >> tmp){
//matrix[i*numColumn + j] = tmp;
matrix[j*numRow2 + i]=tmp;
++j;
//cout<<tmp<<endl;
}
istr.clear();
line.clear();
}
// cout<<"to add to num%256==0"<<endl;
getline(fin, line);
fin.close();
j = 0;
int rownum2 = numRow - 1;


do
{
j = 0;
istringstream istr(line);
while (istr >> tmp){
matrix[j*numRow2 + rownum2]=tmp;
++j;
}
istr.clear();
++rownum2;
} while (rownum2 % 256>0);

// mark the first feature of the padded rows so the |test[0]-b[i]| <= 10 gate rejects them
for(int i=numRow;i<numRow2;i++)
matrix[i]=1000;
}

int getFileColumns(const char *fileName){
return 14; // every data file has 14 columns: gating feature, 12-D feature vector, class label
}

int getFileRows(const char *fileName){
ifstream fileStream(fileName, ios::in);
string tmp;
int count = 0;
if (fileStream){
while (getline(fileStream, tmp)){ // one record per line
count++;
}
}
fileStream.close();
return count;
}

float CalDist(float*mat, int row, int col){
//vector<float>tmp_dist;
float * a= (float*)malloc(sizeof(float)*MULNUM*181); // large enough for the biggest augmented file (181 rows * MULNUM copies)
int count=0;
for (int i = 0; i<row; i++){
if (fabsf(testcpu[0] - mat[i]) <= 10){
float sum = 0;
for (int k = 1; k<13; k++){
sum += (testcpu[k] - mat[i + k*row])*(testcpu[k] - mat[i + k*row]);
}
sum = sqrt(sum);
//tmp_dist.push_back(sum);
a[count++]=sum;
}
}
//cout<<"cpu sorting..."<<endl;
BubbleSort(a, count);
float dist1 = (a[0] + a[1] + a[2])/3;
free(a);
return dist1;
}
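For completeness, one possible way to build and run the program is shown below; the source file name match.cu and the compiler flags are assumptions and do not come from the original post.

nvcc -O2 -o match match.cu
./match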

Original post: https://www.cnblogs.com/Erdos001/p/4544770.html