Code 与 Note

二级标题按照调用路线，再次的标题按照Code和Note进行，另外的一些Note标在Code中。

Forward

Code

//@src/operator/convolution-inl.h
  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(req[conv::kOut], kWriteTo);
    size_t expected = param_.no_bias ? 2 : 3;
    CHECK_EQ(in_data.size(), expected);
    CHECK_EQ(out_data.size(), 1U);
    CHECK_EQ(req[conv::kOut], kWriteTo);
    LayerSetUp(in_data[conv::kData].shape_, out_data[conv::kOut].shape_);
    Stream<xpu>* s = ctx.get_stream<xpu>();
    // allocate workspace for col_buffer
    Tensor<xpu, 1, DType> workspace = ctx.requested[conv::kTempSpace]
      .get_space_typed<xpu, 1, DType>(Shape1(col_buffer_size_), s);
    // calculate the shape of col_buffer
    TShape col_buffer_shape(num_spatial_axes_ + 1);           // num_spatial_axes = kernel.ndim =2
    col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();   // 单个 filter 的大小
    for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) {
      col_buffer_shape[i] = out_data[0].shape_[i+1];                               //  col_buffer_shape[-2:] = [out_H,out_W]
    }
    // create a column buffer using workspace and col_buffer_shape
    TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);

    // initialize weight and col_buffer 3D tensors for using gemm

    index_t M = conv_out_channels_ / group_;                      // group_ 的default 值为1， M 即 filter 的数量
    index_t N = conv_out_spatial_dim_;                                 //  H x W
    index_t K = kernel_dim_;                                                  // conv_in_channel x kernel.Size()

    Tensor<xpu, 3, DType> weight_3d = in_data[conv::kWeight].get_with_shape<xpu, 3, DType>(
      Shape3(group_, M, K), s);
    Tensor<xpu, 3, DType> col_buffer_3d = col_buffer.get_with_shape<xpu, 3, DType>(
      Shape3(group_, K, N), s);
    Tensor<xpu, 4, DType> output_4d = out_data[conv::kOut].get_with_shape<xpu, 4, DType>(
      Shape4(num_, group_, M, N), s);
    for (index_t n = 0; n < num_; ++n) {                                //   num_ = batch_size
      // transform image to col_buffer in order to use gemm
      im2col(s, in_data[conv::kData].dptr<DType>()+n*input_dim_, in_data[conv::kData].shape_,
             col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate,
             col_buffer.dptr<DType>());
      Tensor<xpu, 3, DType> output_3d = output_4d[n];
      for (index_t g = 0; g < group_; ++g) {
        ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g]));
      }
    }
    if (bias_term_) {
      Tensor<xpu, 1, DType> bias = in_data[conv::kBias].get<xpu, 1, DType>(s);
      Tensor<xpu, 3, DType> output_3d = out_data[conv::kOut].get_with_shape<xpu, 3, DType>(
        Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s);
      // has bias term, broadcast it to the same shape of output_3d in channel dim
      output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_);
    }
  }

Note

内积方式是其思想。
index_t K暗示的实现思路是，将一个filter的weight全部提出，然后再将输入data中的对应元素提出，两者进行内积，有多少个filter就进行多少次内积（注意filter的个数正是输出map的channel数，index_t M）
index_t N说明，计算是以输出的map中每个pixel为基本单位进行。
之所以这里特地把这个思路关注一下，是因为之前自己实现的一个conv操作是通过稀疏矩阵与向量做内积进行的(Julia中)，这样实现的一个显著缺点是内存消耗增长过快；即使是稀疏矩阵，要存放以pixel数量的值为长度的方阵，仍然会将通用性达到谷底，16G的内存，单个的(15 imes15)左右的核已经使内存接近崩溃了。
做个对比，用稀疏矩阵的方法的一个主要的缺陷是重复的数据占用了大量的内存（(H imes W)列中占据内存的全是一个核的参数）。
后面来讨论这个思路的情况如何。

重点是两个数据。
weight_3d的尺寸:：(1 imes M imes K);
col_buffer_3d的尺寸：(1 imes K imes N)
以上默认group_=1。
我们再来回顾一下M,K,N的意义。
M代表了filter的数量（也是输出数据，channel的数量）；
K是单个filter的参数数量；
N是输出中，单个channel中二维feature map的大小。
weight_3d中存放的，显然是所有filter的weight；而col_buffer_3d，按照这种思路，应该用来存放对应的输入元素。直觉上，观察一下两个数据的尺寸（在二维角度下），可以发现其内积的尺寸就是(M imes N)，这正好就是一个样本对应的输出尺寸。
于是im2col便是用来将输入数据排列成对应的顺序，放到col_buffer_3d中。

im2col

这里关注GPU上的实现，在src/operator/nn/im2col.cuh中

Code

//@src/operator/nn/im2col.cuh

template <typename DType>
inline void im2col(mshadow::Stream<gpu>* s,
                   const DType* data_im, const TShape& im_shape,
                   const TShape& col_shape, const TShape& kernel_shape,
                   const TShape& pad, const TShape& stride,
                   const TShape& dilation, DType* data_col) {
  // num_axes should be smaller than block size
  index_t num_spatial_axes = kernel_shape.ndim();
  CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
  index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim());

Note

这是前半部分，这里关注下index_t num_kernels。此处的num_kernel是指希望启动的CUDA中的kernel数量（实际分配数量受硬件限制，详见之后的cuda_get_num_blocks(num_kernels)）。
但观察下其值，可以猜测启动的每个kernel要做的事情。它的值是(conv_in_channel imes out_H imes out_W=K imes N/k.Size())，而这个函数的目的是要完成对col_buffer_3d的填充，所以每个线程kernel将对一个卷积kernel所覆盖的in_data中的数据进行排列操作。

im2col_gpu_kernel

im2col后半部分就是启动这个函数，所以直接到这里。这个函数就是之前猜测的实现，就不进行单独Note了，Code里面的注释可以提示进行操作。

//@src/operator/nn/im2col.cuh
template <typename DType>
__global__ void im2col_gpu_kernel(const int n, const DType* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    DType* data_col) {
/*
height_col= out_data.H
width_col = ~~      .W

n         = in_data.channel x out_data.H x out_data.W

data_col  : [ group , in_data.channel x kernel.size/group   , out_data.H x out_data.W ]
            [   1   ,                  K                    ,            N            ]

so, need  x kernel.size in this funcn...
*/
  CUDA_KERNEL_LOOP(index, n) {
    const int h_index = index / width_col;           //  in_data.ch_th x out_data.H_th
    const int h_col = h_index % height_col;          // out_data.H_th
    const int w_col = index % width_col;             // out_data.W_th
    const int c_im = h_index / height_col;           // in_data.ch_th
    const int c_col = c_im * kernel_h * kernel_w;    // see data_col's layout above, weight_3d : [ conv_out_ch, K ] so, ... dot(weight_3d, data_col)
    const int h_offset = h_col * stride_h - pad_h;   // (h_offset, w_offset) is the top-left point, where facing its counterpart in kernel
    const int w_offset = w_col * stride_w - pad_w;
    DType* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;  // see data_col's layout above
    const DType* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;    // see data_col's layout above
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation_h;
        int w_im = w_offset + j * dilation_w;
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation_h * width + j * dilation_w] : static_cast<DType>(0);
        data_col_ptr += height_col * width_col;       //    +N  (next row)
      }
    }
  }
}

小结

conv的实现是通过内积进行的，而gpu在此处的加速作用，是并行地搬运数据（然而由于存在重复的数据，没有利用局部存储将会损失一部分性能）和内积操作。
回到之前的那个话题，用稀疏矩阵实现的conv操作，要对kernel size大小的数据进行img size的重复，这导致内存消耗攀升显著。但正在讨论中的这个方法也存在着数据重复问题：每个in_data中的数据将被重复kernel size数量次（因为filter在产生每个输出pixel中，都按照自身的需要排列输入数据，每个输入中的数据将在kernel size的范围被考虑。而这种排列是在计算前就已经完成了的——将为重复的数据付出内存代价）。正式地来看，重复数据量分别为：(k imes k imes H imes W)，(H imes W imes k imes k)。也就是说两者没有差别。
从程序上来看，im2col启动了线程kernel，但(我似乎记得)CUDA的kernel调用只会显式地阻塞，这就为mxnet的非阻塞式设计提供了便利。

跟进

MSHADOW_CUDA_POST_KERNEL_CHECK 的使用。