Deploying a TVM model with C++ and running it on the GPU

Official TVM C++ deployment tutorial:

https://github.com/apache/tvm/tree/main/apps/howto_deploy

According to the official docs, running the run_example.sh script completes the deployment.

C++ deployment code:

https://github.com/apache/tvm/blob/main/apps/howto_deploy/cpp_deploy.cc

Makefile:

https://github.com/apache/tvm/blob/main/apps/howto_deploy/Makefile

Read the Makefile together with the run_example.sh script.

The script first creates a lib directory and then runs sudo make; what make actually does is defined in the Makefile.

make first compiles a bundled TVM runtime object file named libtvm_runtime_pack.o into the lib folder (an all-in-one static pack of the runtime).

It then runs prepare_test_lib.py to compile the model into three libraries, test_addone_dll.so, test_addone_sys.o, and test_relay_add.so, which cpp_deploy.cc consumes to produce two executables, cpp_deploy_pack and cpp_deploy_normal.

My goal is to take a deep learning network written in another framework, convert it into a .so file with TVM, deploy it with C++, and run it on the GPU. Below is the deployment code written for the CPU (shown here with the context already switched to the GPU, which is what triggers the problem described afterwards).


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \brief Example code on load and run TVM modules.
 * \file cpp_deploy.cc
 */
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <cstdio>

#include <fstream>
#include <sstream>
#include <string>
using namespace std;

template <class Type>
Type stringToNum(const string& str)
{
  istringstream iss(str);
  Type num;
  iss >> num;
  return num;
}

 
void DeployGraphRuntime() {

  ifstream in("/home/aiteam/tiwang/tvm-tfs-gpu-bkp/data.txt");
  //int image[784];
  string s;
  int image_index=0;
  /*
  while(getline(in,s))
  {
    image[i]=stringToNum<int>(s);
    ++i;
  }*/
  LOG(INFO) << "Running graph runtime...";
  // load in the library
  DLContext ctx{kDLGPU, 0};
  tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("/home/aiteam/tiwang/tvm-tfs-gpu-bkp/model.so");
  // create the graph runtime module
  tvm::runtime::Module gmod = mod_factory.GetFunction("default")(ctx);
  tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = gmod.GetFunction("run");

  // Use the C++ API
  tvm::runtime::NDArray x = tvm::runtime::NDArray::Empty({1,784}, DLDataType{kDLFloat, 32, 1}, ctx);
  tvm::runtime::NDArray y = tvm::runtime::NDArray::Empty({1, 10}, DLDataType{kDLFloat, 32, 1}, ctx);

  
  while(getline(in,s))
  {
    static_cast<float*>(x->data)[image_index]=((float)stringToNum<int>(s))/255;
    image_index++;
  }
  // set the right input
  set_input("x", x);
  // run the code
  run();
  // get the output
  get_output(0, y);
  for(int i=0;i<10;++i)
  {
    LOG(INFO)<<static_cast<float*>(y->data)[i];
  }
  /*
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      ICHECK_EQ(static_cast<float*>(y->data)[i * 2 + j], i * 2 + j + 1);
    }
  }*/
}

int main(void) {
  //DeploySingleOp();
  DeployGraphRuntime();
  return 0;
}

The idea is simple: read the data in, call set_input, run, then get_output. However, after changing the target to cuda, the program does not run successfully on the GPU; it crashes with a core dump.

The reason is that running the model on the GPU requires allocating memory on the GPU and copying the input data there before execution. This code does neither, so it crashes at runtime.
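In other words, the fix is to keep the input in a host-side buffer, copy it into the GPU-resident NDArray before running, and copy the result back afterwards. Below is a minimal sketch of that idea in the style of the code above, using a hypothetical helper RunOnGpu and assuming the CopyFromBytes/CopyToBytes helpers that recent tvm::runtime::NDArray versions provide (the full working version later in this post uses the C API instead):

#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>

// Sketch only: host_in holds the 784 input pixels, host_out receives the 10 scores.
void RunOnGpu(const float* host_in, float* host_out) {
  DLContext ctx{kDLGPU, 0};
  tvm::runtime::Module mod_factory =
      tvm::runtime::Module::LoadFromFile("/home/aiteam/tiwang/tvm-tfs-gpu-bkp/model.so");
  tvm::runtime::Module gmod = mod_factory.GetFunction("default")(ctx);
  tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = gmod.GetFunction("run");

  // x and y live in GPU memory, so x->data must not be written from host code.
  tvm::runtime::NDArray x =
      tvm::runtime::NDArray::Empty({1, 784}, DLDataType{kDLFloat, 32, 1}, ctx);
  tvm::runtime::NDArray y =
      tvm::runtime::NDArray::Empty({1, 10}, DLDataType{kDLFloat, 32, 1}, ctx);

  x.CopyFromBytes(host_in, 784 * sizeof(float));  // host -> device
  set_input("x", x);
  run();
  get_output(0, y);
  y.CopyToBytes(host_out, 10 * sizeof(float));    // device -> host
}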

Below is the complete procedure for deploying a TVM model in C++ and running it on the GPU. The deep learning model is an MNIST handwritten-digit network written in Keras and saved in pb format; the model-definition code is omitted, and the pb file is read directly and converted. The model input shape is (1, 784) and the output shape is (1, 10).

Imports

import tvm
from tvm import te
from tvm import relay

# os and numpy
import numpy as np
import os.path

# Tensorflow imports
import tensorflow as tf

try:
    tf_compat_v1 = tf.compat.v1
except ImportError:
    tf_compat_v1 = tf

# Tensorflow utility functions
import tvm.relay.testing.tf as tf_testing
from tvm.contrib import graph_runtime

Parameter settings

#cpu
#target = "llvm"
#target_host = "llvm"
#layout = None
#ctx = tvm.cpu(0)

#gpu
target = "cuda"  
target_host = 'llvm'  
layout = "NCHW"  
ctx = tvm.gpu(0)

Prepare the data

from tensorflow.python.keras.datasets import mnist
from tensorflow.python.keras.utils import np_utils

(x_train,y_train),(x_test,y_test)=mnist.load_data()
x_test1=x_test.reshape(x_test.shape[0],x_test.shape[1]*x_test.shape[2])

print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)
x_train=x_train.reshape(x_train.shape[0],x_train.shape[1]*x_train.shape[2])
x_test=x_test.reshape(x_test.shape[0],x_test.shape[1]*x_test.shape[2])
x_train=x_train/255
x_test=x_test/255
y_train=np_utils.to_categorical(y_train)
y_test=np_utils.to_categorical(y_test)
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)

with open("data.txt",'w') as wf:    
    for i in range(784):
        wf.write(str(x_test1[12][i]))
        wf.write('\n')

Load the model

with tf_compat_v1.gfile.GFile('./frozen_models/simple_frozen_graph.pb', "rb") as f:
    graph_def = tf_compat_v1.GraphDef()
    graph_def.ParseFromString(f.read())
    graph = tf.import_graph_def(graph_def, name="")
    # Call the utility to import the graph definition into default graph.
    graph_def = tf_testing.ProcessGraphDefParam(graph_def)
    # Add shapes to the graph.
    
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    with tf_compat_v1.Session() as sess:
        graph_def = tf_testing.AddShapesToGraphDef(sess, "Identity")
        
tensor_name_list = [tensor.name for tensor in tf.compat.v1.get_default_graph().as_graph_def().node]
for tensor_name in tensor_name_list:
    print(tensor_name, '\n')

Build

shape_dict = {"x": x_train[0:1].shape}
print(shape_dict)
dtype_dict = {"x": "uint8"}
mod, params = relay.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict)

print("Tensorflow protobuf imported to relay frontend.")

Compile into a TVM module

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, target_host=target_host, params=params)

Test that the TVM module works

from tvm.contrib import graph_runtime

tt=np.zeros([1,784])
i=0
file=open("data.txt")
while 1:
    line=file.readline()
    if not line:
        break
    tt[0][i]=int(line)
    i+=1
file.close()

dtype = "float32"
m = graph_runtime.GraphModule(lib["default"](ctx))
# set inputs
m.set_input("x", tvm.nd.array(tt.astype(dtype)))
# execute
m.run()
# get outputs
tvm_output = m.get_output(0, tvm.nd.empty(((1, 10)), "float32"))
print(tvm_output.shape,tvm_output)

Save the model

from tvm.contrib import utils
temp=utils.tempdir()
path_lib=temp.relpath("/home/aiteam/test_code/model.so")
print(path_lib)
lib.export_library(path_lib)
print(temp.listdir())

Then go into the tvm/apps/howto_deploy directory and modify tvm_runtime_pack.cc by adding these includes:

#include "../../src/runtime/cuda/cuda_device_api.cc"
#include "../../src/runtime/cuda/cuda_module.cc"

Then write a separate .cc file for your own deployment code and modify the Makefile to build it.

My file is named cpp_deploy_bkp.cc.

The modified Makefile:

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Makefile Example to deploy TVM modules.
TVM_ROOT=$(shell cd ../..; pwd)
DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core

PKG_CFLAGS = -std=c++14 -g -fPIC \
    -I${TVM_ROOT}/include \
    -I${DMLC_CORE}/include \
    -I${TVM_ROOT}/3rdparty/dlpack/include \
    -I/usr/local/cuda/include

PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -pthread -L/usr/local/cuda/lib64  -lcudart -lcuda


.PHONY: clean all
all:lib/libtvm_runtime_pack.o lib/cpp_deploy_pack

#all: lib/cpp_deploy_pack lib/cpp_deploy_normal

# Build rule for all in one TVM package library
.PHONY: lib/libtvm_runtime_pack.o
lib/libtvm_runtime_pack.o: tvm_runtime_pack.cc
    @mkdir -p $(@D)
    $(CXX) -c $(PKG_CFLAGS) -o $@  $^ $(PKG_LDFLAGS)



# Deploy using the all in one TVM package library
.PHONY: lib/cpp_deploy_pack
lib/cpp_deploy_pack: cpp_deploy_bkp.cc lib/libtvm_runtime_pack.o
    @mkdir -p $(@D)
    $(CXX) $(PKG_CFLAGS) -o $@  $^ $(PKG_LDFLAGS)

The CUDA include path (-I/usr/local/cuda/include) and the CUDA library path and libraries (-L/usr/local/cuda/lib64 -lcudart -lcuda) have to be added, as shown above.

cpp_deploy_bkp.cc:

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \brief Example code on load and run TVM modules.
 * \file cpp_deploy_bkp.cc
 */

#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <cstdio>

#include <fstream>
#include <sstream>
#include <string>
using namespace std;

template <class Type>
Type stringToNum(const string& str)
{
    istringstream iss(str);
    Type num;
    iss >> num;
    return num;
}


void DeployGraphRuntime() {
  constexpr int dtype_code = 2U;    // kDLFloat
  constexpr int dtype_bits = 32;    // float32
  constexpr int dtype_lanes = 1;
  constexpr int device_type = 2;    // kDLGPU: the tensors live in GPU memory
  constexpr int device_id = 0;
  int ndim = 2;
  int64_t in_shape[2] = {1, 784};   // model input shape
  int64_t out_shape[2] = {1, 10};   // model output shape

  DLTensor* DLTX = nullptr;
  DLTensor* DLTY = nullptr;

  // Allocate the input and output tensors directly on the GPU.
  TVMArrayAlloc(in_shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &DLTX);
  TVMArrayAlloc(out_shape, ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &DLTY);

  float img[784];
  float rslt[10];

  ifstream in("/home/aiteam/tiwang/data.txt");
  //int image[784];
  string s;
  int image_index=0;
  /*
  while(getline(in,s))
  {
      image[i]=stringToNum<int>(s);
      ++i;
  }*/
  bool enabled = tvm::runtime::RuntimeEnabled("cuda");
  if (!enabled) 
  {
      LOG(INFO) << "Skip heterogeneous test because cuda is not enabled."<< "
";
      return;
  }

  LOG(INFO) << "Running graph runtime...";
  // load in the library
  DLContext ctx{kDLGPU, 0};
  tvm::runtime::Module mod_factory = tvm::runtime::Module::LoadFromFile("/home/aiteam/test_code/model.so");
  // create the graph runtime module
  tvm::runtime::Module gmod = mod_factory.GetFunction("default")(ctx);
  tvm::runtime::PackedFunc set_input = gmod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = gmod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = gmod.GetFunction("run");

  // Use the C++ API
  while(getline(in,s))
  {
      if(image_index%28==0)
          printf("\n");
      //static_cast<float*>(x->data)[image_index]=((float)stringToNum<int>(s))/255;
      img[image_index]=((float)stringToNum<int>(s))/255;
      
      int a=stringToNum<int>(s);
      printf("%4d",a);
      image_index++;
  }
  TVMArrayCopyFromBytes(DLTX,&img[0],image_index*sizeof(float));
  // set the right input
  set_input("x", DLTX);
  // run the code
  run();
  // get the output
  get_output(0, DLTY);
  TVMArrayCopyToBytes(DLTY,&rslt[0],10*sizeof(float));
  
  for(int i=0;i<10;++i)
  {
      LOG(INFO)<<rslt[i];
      //LOG(INFO)<<static_cast<float*>(y->data)[i];
  }
}

int main(void) {
  //DeploySingleOp();
  DeployGraphRuntime();
  return 0;
}

Compared with the earlier CPU deployment code, the GPU deployment adds one extra step: copying the tensors between host and device memory.
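As a small follow-up (not part of the original code above), the ten scores copied into rslt can be turned into a predicted digit with a simple argmax:

  // Hypothetical addition after TVMArrayCopyToBytes: take the index of the
  // largest of the 10 output scores as the predicted digit (0-9).
  int best = 0;
  for (int i = 1; i < 10; ++i) {
    if (rslt[i] > rslt[best]) best = i;
  }
  LOG(INFO) << "Predicted digit: " << best;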

Reference:

https://discuss.tvm.apache.org/t/deploy-nnvm-module-using-c-on-gpu-using-opencl-target/229

Final result

First, run sudo make in the tvm/apps/howto_deploy directory.

Once the build succeeds, run the executable: ./lib/cpp_deploy_pack

 
Original article: https://www.cnblogs.com/wangtianning1223/p/14662970.html