Auto-scheduling a Neural Network for NVIDIA GPU

Auto-tuning for specific devices and workloads is critical for getting the best performance. This is a tutorial on how to tune a whole neural network for NVIDIA GPU with the auto-scheduler.

To auto-tune a neural network, we divide the network into small subgraphs and tune them independently. Each subgraph is treated as one search task. A task scheduler slices the time and dynamically allocates time resources to these tasks: it predicts the impact of each task on the end-to-end execution time and prioritizes the task that can reduce the execution time the most.

For each subgraph, we use the compute declaration in tvm/python/topi to get the computational DAG in tensor expression form. We then use the auto-scheduler to construct a search space for this DAG and search for good schedules (low-level optimizations).
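To make the tensor-expression form concrete, here is a minimal standalone sketch of a compute declaration (a plain matmul, not one of the subgraphs extracted below):

from tvm import te

def matmul(N, M, K):
    # declare inputs and a reduction axis, then describe what to compute
    A = te.placeholder((N, K), name="A")
    B = te.placeholder((K, M), name="B")
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    return [A, B, C]

The auto-scheduler takes such a declaration as the definition of what to compute and searches over how to compute it (tiling, thread binding, etc.).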

Different from the template-based autotvm, which relies on manual templates to define the search space, the auto-scheduler does not require any schedule templates. In other words, the auto-scheduler only uses the compute declarations in tvm/python/topi and does not use existing schedule templates.

Note that this tutorial will not run on Windows or recent versions of macOS. To get it to run, you will need to wrap the body of this tutorial in an if __name__ == "__main__": block.
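A minimal sketch of that wrapping (the main function name is only illustrative):

def main():
    # put the body of the tutorial (network definition, tuning, compilation, evaluation) here
    ...

if __name__ == "__main__":
    main()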

import numpy as np

import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_runtime

Define a Network

First, we need to define the network with the Relay frontend API. We can load some pre-defined networks from tvm.relay.testing. We can also load models from MXNet, ONNX, PyTorch, and TensorFlow (see the front-end tutorials).

For convolutional neural networks, although the auto-scheduler can work correctly with any layout, we found that the best performance is typically achieved with the NHWC layout, and the auto-scheduler has more optimizations implemented for NHWC. Therefore, it is recommended to convert your models to NHWC layout when using the auto-scheduler. You can use the ConvertLayout pass to do the layout conversion in TVM.
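A minimal sketch of that conversion, assuming mod is a Relay module imported from one of the frontends (the desired_layouts mapping follows the ConvertLayout documentation):

desired_layouts = {"nn.conv2d": ["NHWC", "default"]}
seq = tvm.transform.Sequential(
    [
        relay.transform.RemoveUnusedFunctions(),  # clean up the graph first
        relay.transform.ConvertLayout(desired_layouts),
    ]
)
with tvm.transform.PassContext(opt_level=3):
    mod = seq(mod)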

def get_network(name, batch_size, layout="NHWC", dtype="float32"):
    """Get the symbol definition and random weight of a network"""

    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        # note: the 3D variant lives in relay.testing.resnet_3d, not relay.testing.resnet
        mod, params = relay.testing.resnet_3d.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"

        block = get_model("resnet18_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)

    return mod, params, input_shape, output_shape


# Define the neural network and compilation target
network = "resnet-18"
batch_size = 1
layout = "NHWC"
target = tvm.target.Target("cuda")
dtype = "float32"
log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)

Extract Search Tasks

Next, we extract the search tasks and their weights from the network. The weight of a task is the number of appearances of the task's subgraph in the whole network. By using the weights, we can approximate the end-to-end latency of the network as sum(latency[t] * weight[t]), where latency[t] is the latency of a task and weight[t] is the weight of the task. The task scheduler will just optimize this objective.
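As a quick numerical illustration of this objective (the latencies and weights below are made up for the example, not taken from this run):

# Hypothetical per-task latencies (ms) and subgraph occurrence counts
latencies = [0.145, 0.130, 0.076]
weights = [2, 1, 4]
estimated_total = sum(l * w for l, w in zip(latencies, weights))
print("Estimated end-to-end latency: %.3f ms" % estimated_total)  # prints 0.724 ms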

# Extract tasks from the network
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)

Out:

Extract tasks...

========== Task 0  (workload key: ["b32ed43fb351136894c322ee49097a1a"]) ==========

placeholder = PLACEHOLDER [1, 1000]

T_softmax_maxelem(i0) max= placeholder[i0, k]

T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))

T_softmax_expsum(i0) += T_softmax_exp[i0, k]

T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])

 

========== Task 1  (workload key: ["d09dc1a6bb90d59c91b68989ad3492ff"]) ==========

placeholder = PLACEHOLDER [1, 512]

placeholder = PLACEHOLDER [1000, 512]

T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])

placeholder = PLACEHOLDER [1000]

T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])

 

========== Task 2  (workload key: ["7de313da0ca29a8c63f647791692430d"]) ==========

placeholder = PLACEHOLDER [1, 7, 7, 512]

tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]

tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))

 

========== Task 3  (workload key: ["8d5a93959138dc7b2ee1f1b3219dfa14"]) ==========

placeholder = PLACEHOLDER [1, 7, 7, 512]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 512, 512]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 7, 7, 512]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

placeholder = PLACEHOLDER [1, 1, 1, 512]

T_multiply(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3]*placeholder[ax0, 0, 0, ax3])

placeholder = PLACEHOLDER [1, 1, 1, 512]

T_add(ax0, ax1, ax2, ax3) = (T_multiply[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 4  (workload key: ["ac6920940de3797cc3f9f9c260675e5d"]) ==========

placeholder = PLACEHOLDER [1, 7, 7, 512]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 512, 512]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 1, 1, 512]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 5  (workload key: ["7e83a2ee5cd5d50282ed19310700046a"]) ==========

placeholder = PLACEHOLDER [1, 7, 7, 512]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 512, 512]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 7, 7, 512]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

 

========== Task 6  (workload key: ["1f6cd3637ec856bf5cf5010a623eed05"]) ==========

placeholder = PLACEHOLDER [1, 14, 14, 256]

PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

placeholder = PLACEHOLDER [3, 3, 256, 512]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

placeholder = PLACEHOLDER [1, 1, 1, 512]

T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 7  (workload key: ["424ba83160af31badc0b098136e1a3b0"]) ==========

placeholder = PLACEHOLDER [1, 14, 14, 256]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 256, 256]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 14, 14, 256]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

placeholder = PLACEHOLDER [1, 1, 1, 256]

T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 8  (workload key: ["a169cd0053d3a7ca82998fcb62e42c58"]) ==========

placeholder = PLACEHOLDER [1, 14, 14, 256]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 256, 256]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 1, 1, 256]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 9  (workload key: ["0141ffc4fbabc10cc5a94c954419055b"]) ==========

placeholder = PLACEHOLDER [1, 14, 14, 256]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 256, 256]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 14, 14, 256]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

 

========== Task 10  (workload key: ["81aae4b8e2c076a4014d403e8a2c70a1"]) ==========

placeholder = PLACEHOLDER [1, 28, 28, 128]

PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

placeholder = PLACEHOLDER [3, 3, 128, 256]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

placeholder = PLACEHOLDER [1, 1, 1, 256]

T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 11  (workload key: ["c7a6b56bdc04b94c829fb2ef9874019e"]) ==========

placeholder = PLACEHOLDER [1, 28, 28, 128]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 128, 128]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 28, 28, 128]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

placeholder = PLACEHOLDER [1, 1, 1, 128]

T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 12  (workload key: ["c035cc8b0568a8e054d06bd7f4950550"]) ==========

placeholder = PLACEHOLDER [1, 28, 28, 128]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 128, 128]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 1, 1, 128]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 13  (workload key: ["c5ee3e05edd9754492d0763aa41fd025"]) ==========

placeholder = PLACEHOLDER [1, 28, 28, 128]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]

B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)),  ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [4, 4, 128, 128]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)),  ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]

placeholder = PLACEHOLDER [1, 28, 28, 128]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

 

========== Task 14  (workload key: ["022ebb6b7c55c5ed030421380ec83a04"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

placeholder = PLACEHOLDER [3, 3, 64, 128]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

placeholder = PLACEHOLDER [1, 1, 1, 128]

T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 15  (workload key: ["de0df0893e01892cfe69f7bc2c24111f"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]

B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)),  ..(OMITTED)..  (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [6, 6, 64, 64]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)),  ..(OMITTED)..  6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]

placeholder = PLACEHOLDER [1, 56, 56, 64]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

placeholder = PLACEHOLDER [1, 1, 1, 64]

T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 16  (workload key: ["f2e3c09a00e7d0a9897f70497e089f1e"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]

B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)),  ..(OMITTED)..  (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [6, 6, 64, 64]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)),  ..(OMITTED)..  6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]

placeholder = PLACEHOLDER [1, 1, 1, 64]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 17  (workload key: ["fa26946d7ac51126bfa859cb183f9ca1"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)

input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]

B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)),  ..(OMITTED)..  (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))

data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])

placeholder = PLACEHOLDER [6, 6, 64, 64]

bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])

A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)),  ..(OMITTED)..  6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))

inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])

conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]

placeholder = PLACEHOLDER [1, 56, 56, 64]

T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])

 

========== Task 18  (workload key: ["ba2026d923536b75e9b4faed89287d5f"]) ==========

placeholder = PLACEHOLDER [1, 112, 112, 64]

pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)

tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]

placeholder = PLACEHOLDER [1, 1, 1, 64]

T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 19  (workload key: ["a0eb8d6048282a4a0986cc2ccf14eaa2"]) ==========

placeholder = PLACEHOLDER [1, 224, 224, 3]

PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 3) && (i1 < 227)) && (i2 >= 3)) && (i2 < 227)), placeholder[i0, (i1 - 3), (i2 - 3), i3], 0f)

placeholder = PLACEHOLDER [7, 7, 3, 64]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

placeholder = PLACEHOLDER [1, 1, 1, 64]

T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])

T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

 

========== Task 20  (workload key: ["bf78a7bf0209980f72953637dfd14a6f"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]

placeholder = PLACEHOLDER [1, 1, 64, 64]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, (yy + ry), (xx + rx), rc]*placeholder[ry, rx, rc, ff])

 

========== Task 21  (workload key: ["6630936c26852f2b89dbfa2ff37fbb9c"]) ==========

placeholder = PLACEHOLDER [1, 56, 56, 64]

PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]

placeholder = PLACEHOLDER [1, 1, 64, 128]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

 

========== Task 22  (workload key: ["ba5f918733ccbbd4a1d7fd3724665a2f"]) ==========

placeholder = PLACEHOLDER [1, 28, 28, 128]

PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]

placeholder = PLACEHOLDER [1, 1, 128, 256]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

 

========== Task 23  (workload key: ["21ad409d72953de188314010134e3acd"]) ==========

placeholder = PLACEHOLDER [1, 14, 14, 256]

PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]

placeholder = PLACEHOLDER [1, 1, 256, 512]

Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])

Begin Tuning

Now, we set some options for tuning and launch the search tasks:

  • measure_ctx launches a different process for measurement to provide isolation. It can protect the master process from GPU crashes during measurement and avoid other runtime conflicts.
  • min_repeat_ms defines the minimum duration of one "repeat" in every measurement. This can warm up the GPU, which is necessary to get accurate measurement results. Typically, we recommend a value >= 300 ms.
  • num_measure_trials is the number of measurement trials we can use during the tuning. You can set it to a small number (e.g., 200) for a fast demonstrative run. In practice, we recommend setting it around 900 * len(tasks), which is typically enough for the search to converge. For example, there are 24 tasks in resnet-18, so we can set it to 20000. You can adjust this parameter according to your time budget.
  • In addition, we use RecordToFile to dump measurement records into a log file. The measurement records can be used to query the history best, resume the search, and do more analyses later.
  • See auto_scheduler.TuningOptions and auto_scheduler.LocalRPCMeasureContext for more parameters.
def run_tuning():
    print("Begin tuning...")
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # change this to 20000 to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)


# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

# run_tuning()

Note

Explaining the printed information during tuning

During the tuning, lots of information will be printed on the console. It is used for debugging purposes. The most important information is the output of the task scheduler. The following table is a sample output.

----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.005 |           0.88 |     64 |
|    1 |        0.010 |          99.10 |     64 |
|    2 |        0.006 |           0.00 |     64 |
|    3 |        0.145 |         979.78 |    384 |
|    4 |        0.130 |        1097.02 |    384 |
|    5 |        0.143 |         992.69 |    384 |
|    6 |        0.076 |        1526.86 |    192 |
|    7 |        0.115 |         999.44 |    320 |
|    8 |        0.079 |        1449.39 |    320 |
|    9 |        0.122 |         938.73 |    384 |
|   10 |        0.063 |        1832.98 |    192 |
|   11 |        0.072 |        1763.62 |    256 |
|   12 |        0.062 |        2036.40 |    192 |
|   13 |        0.068 |        1874.44 |    192 |
|   14 |        0.049 |        2346.50 |    128 |
|   15 |        0.076 |        1694.31 |    256 |
|   16 |        0.067 |        1933.30 |    448 |
|   17 |        0.076 |        1680.90 |    256 |
|   18 |        0.022 |          98.43 |     64 |
|   19 |        0.076 |        3112.55 |    192 |
|   20 |        0.013 |        2026.44 |     64 |
|   21 |        0.011 |        1136.69 |     64 |
|   22 |        0.013 |         992.47 |     64 |
|   23 |        0.020 |         627.56 |     64 |
-------------------------------------------------
Estimated total latency: 1.587 ms  Trials: 4992  Used time : 13296 s  Next ID: 3

This table lists the latency and (estimated) speed of all tasks, as well as the allocation of measurement trials for each task. The last line prints the total weighted latency of these tasks, which can be a rough estimate of the end-to-end execution time of the network. The last line also prints the total number of measurement trials, the total time spent on auto-tuning, and the id of the next task to tune.

There will also be some "dmlc::Error"s and CUDA errors, because the auto-scheduler will try some invalid schedules. You can safely ignore them if the tuning continues, because these errors are isolated from the main process.

Note

Terminating the tuning earlier

You can terminate the tuning earlier by forcibly killing this process. As long as the log file contains at least one valid schedule for each task, you should be able to do the compilation (the section below).

Compile and Evaluate

After auto-tuning, we can compile the network with the best schedules we found. All measurement records are dumped into the log file during auto-tuning, so we can read the log file and load the best schedules.

# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=target, params=params)
 
# Create graph runtime
ctx = tvm.context(str(target), 0)
module = graph_runtime.GraphModule(lib["default"](ctx))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module.set_input("data", data_tvm)
 
# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))

Out:

Compile...
Evaluate inference time cost...
Mean inference time (std dev): 3.28 ms (0.01 ms)

Other Tips

  • During the tuning, the auto-scheduler needs to compile many programs and extract features from them. This part is CPU-intensive, so a high-performance CPU with many cores is recommended for a faster search.
  • You can use python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json to distill the large log file and only save the best useful records.
  • You can resume a search from the previous log file. You just need to add a new argument load_log_file when creating the task scheduler in function run_tuning. Say, tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
  • If you have multiple target GPUs, you can use all of them for measurements to parallelize the measurements. Check the RPC-related tutorials to learn how to use the RPC Tracker and RPC Server. To use the RPC Tracker in the auto-scheduler, replace the runner in TuningOptions with auto_scheduler.RPCRunner, as sketched below.
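A minimal sketch of that replacement (not from the original tutorial; the key, host, and port below are placeholders for your own tracker setup):

def run_tuning_with_rpc():
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=20000,
        # send each candidate program to GPUs registered on the RPC Tracker
        runner=auto_scheduler.RPCRunner(
            key="1080ti",      # hypothetical device key used when starting the RPC server
            host="127.0.0.1",  # address of the RPC Tracker
            port=9190,
            repeat=1,
            min_repeat_ms=300,
            timeout=10,
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)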

https://tvm.apache.org/docs/tutorials/auto_scheduler/tune_network_cuda.html

Download Python source code: tune_network_cuda.py

Download Jupyter notebook: tune_network_cuda.ipynb

Original post: https://www.cnblogs.com/wujianming-110117/p/14131516.html