CNN压缩：为反向传播添加mask（caffe代码修改）

2024-03-26 17:05:52

神经网络压缩的研究近三年十分热门。笔者查阅到相关的两篇博客，博主们非常无私地提供了源代码，但是笔者发现在使用gpu训练添加了mask的网络时，稍微有些不顺，特此再进行详细说明。

此文是在基于Caffe的CNN剪枝[1]和 Deep Compression阅读理解及Caffe源码修改[2] 的基础上修改的。

mask的结构？

[1]中使用blob存储mask。blob是一块数据块，在初始化时，需要为gpu上的数据块申请一块空间，故有Addmask()函数。Addmask()是blob.hpp中Blob类的成员方法，需要在blob.cpp中实现。使用时将Addmask()的调用添加在inner_product_layer.cpp和base_conv_layer.cpp中，使得网络在LayerSetUp的过程中，为fc层和conv层多开辟一块存放mask的SyncedMemory。与cpu_data()/mutable_cpu_data()等类似，mask也需要实现一系列对应的访问方法；在初始化中改变mask的值时，需要注意通过合理的方式（即这些访问方法）进行读写。

InnerProductLayer.cpp

 // Fragment of InnerProductLayer<Dtype>::LayerSetUp: only the lines relevant
 // to the mask are shown; "// ..." marks upstream code that is left unchanged.
 template <typename Dtype>
 void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
     // ...
     // blobs_[0] is the weight blob of the fully-connected layer.
     this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
     // Allocate the extra mask storage right after the weight blob is created.
     this->blobs_[0]->Addmask();
     // ...
 }

base_conv.cpp:

 // Fragment of BaseConvolutionLayer<Dtype>::LayerSetUp: only the lines relevant
 // to the mask are shown; "// ..." marks upstream code that is left unchanged.
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
     // ...
     // blobs_[0] is the convolution weight blob.
     this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
     // Allocate the extra mask storage right after the weight blob is created.
     this->blobs_[0]->Addmask();
     // ...
 }

修改blob.hpp和blob.cpp，添加成员mask_和相关的方法，在[1]文章的评论里作者已给出源代码。

[2]中使用layer结构定义mask，layer是相当于数据的一系列操作，或者说是blob的组合方法。

但是，想要实现在gpu上的操作，数据需要有gpu有关的操作。故此处采用[1]中的方法，将mask_添加到blob class中，实现mask_属性。

mask的初始化？

在Caffe框架下，网络的初始化有两种方式，一种是调用filler，按照模型中定义的初始化方式进行初始化，第二种是从已有的caffemodel或者snapshot中读取相应参数矩阵进行初始化[1]。

1、filler的方法

在程序开始时，网络使用net.cpp中的Init()进行初始化，由输入至输出，依次调用各个层的layersetup，建立网络结构。如下所示是caffe中使用xavier方法进行填充的操作。

 /**
  * Xavier initialisation: fills `blob` with values drawn uniformly from
  * [-scale, scale], where scale = sqrt(3 / n).
  *
  * n is chosen by the filler's variance_norm setting:
  *   - default : fan_in  = count / num
  *   - FAN_OUT : fan_out = count / channels
  *   - AVERAGE : (fan_in + fan_out) / 2
  *
  * @param blob  blob whose data is overwritten; must be non-empty.
  */
 virtual void Fill(Blob<Dtype>* blob) {
     CHECK(blob->count());
     int fan_in = blob->count() / blob->num();
     int fan_out = blob->count() / blob->channels();
     Dtype n = fan_in;  // default to fan_in
     if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_AVERAGE) {
       n = (fan_in + fan_out) / Dtype(2);
     } else if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_FAN_OUT) {
       n = fan_out;
     }
     Dtype scale = sqrt(Dtype(3) / n);
     caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
         blob->mutable_cpu_data());
     // Mask filling is intentionally left disabled here.
     //Filler<Dtype>:: FillMask(blob);
     // -1 means "sparse not set"; this filler does not implement sparsity.
     CHECK_EQ(this->filler_param_.sparse(), -1)
          << "Sparsity not supported by this Filler.";
   }

filler的作用是，为建立的网络结构产生随机初始化值。

即使是从snapshot或caffemodel中读入数据，也执行随机填充操作。

2、从snapshot或caffemodel中读入数据

tools/caffe.cpp 中的phase:train可以从snapshot或caffemodel中提取参数，进行finetune。phase:test则可以从提取的参数中建立网络，进行预测过程。

这里笔者的网络结构是在pycaffe中进行稀疏化的，因此读入网络的proto文件是一个连接数不变、存在部分连接权值为零的网络。需要在读入参数的同时初始化mask_。因此修改blob.cpp中的fromproto函数：

 /**
  * Loads this blob's shape (optionally), data and diff from a BlobProto and,
  * for 4-D (conv weights) or 2-D (fc weights) blobs, initialises the pruning
  * mask so that mask[i] is 1 where the loaded weight is non-zero, else 0.
  *
  * @param proto    serialized blob to read from.
  * @param reshape  if true, reshape this blob to the proto's shape; otherwise
  *                 the shapes must already match (checked).
  *
  * NOTE(review): the mask branch assumes Addmask() has already been called on
  * this blob (i.e. mask_ is allocated) -- confirm for every 2-D/4-D blob that
  * can reach this function.
  */
 template <typename Dtype>
 void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
   if (reshape) {
     vector<int> shape;
     if (proto.has_num() || proto.has_channels() ||
         proto.has_height() || proto.has_width()) {
       // Using deprecated 4D Blob dimensions --
       // shape is (num, channels, height, width).
       shape.resize(4);
       shape[0] = proto.num();
       shape[1] = proto.channels();
       shape[2] = proto.height();
       shape[3] = proto.width();
     } else {
       shape.resize(proto.shape().dim_size());
       for (int i = 0; i < proto.shape().dim_size(); ++i) {
         shape[i] = proto.shape().dim(i);
       }
     }
     Reshape(shape);
   } else {
     CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)";
   }
   // copy data
   Dtype* data_vec = mutable_cpu_data();
   if (proto.double_data_size() > 0) {
     CHECK_EQ(count_, proto.double_data_size());
     for (int i = 0; i < count_; ++i) {
       data_vec[i] = proto.double_data(i);
     }
   } else {
     CHECK_EQ(count_, proto.data_size());
     for (int i = 0; i < count_; ++i) {
       data_vec[i] = proto.data(i);
     }
   }
   // copy diff (only if the proto carries one)
   if (proto.double_diff_size() > 0) {
     CHECK_EQ(count_, proto.double_diff_size());
     Dtype* diff_vec = mutable_cpu_diff();
     for (int i = 0; i < count_; ++i) {
       diff_vec[i] = proto.double_diff(i);
     }
   } else if (proto.diff_size() > 0) {
     CHECK_EQ(count_, proto.diff_size());
     Dtype* diff_vec = mutable_cpu_diff();
     for (int i = 0; i < count_; ++i) {
       diff_vec[i] = proto.diff(i);
     }
   }
   // Initialise the mask from the sparsity pattern of the loaded weights.
   if (shape_.size() == 4 || shape_.size() == 2) {
     // Write into the mask storage, NOT into the data buffer: taking the
     // pointer from mutable_cpu_data() would overwrite the weights with 0/1.
     Dtype* mask_vec = static_cast<Dtype*>(mask_->mutable_cpu_data());
     CHECK(count_);
     for (int i = 0; i < count_; ++i) {
       mask_vec[i] = (data_vec[i] != Dtype(0)) ? Dtype(1) : Dtype(0);
     }
   }
 }

在读入proto文件的同时，如果blob的维度是4D（conv层的权值）或2D（fc层的权值），则将mask_初始化为data_vec[i]?1:0。当blob是1D时（例如偏置项；pool、relu等层本身没有需要读入的参数blob），不进行mask的初始化。

反向传播的修改？

1、修改blob的更新方式，并在blob.cpp中添加math_functions.hpp头文件（caffe_mul / caffe_gpu_mul在其中声明）。

 /**
  * Applies the pending update and re-applies the pruning mask:
  *   data = (data - diff) * mask      (element-wise)
  * The computation runs on whichever device currently holds the data.
  *
  * NOTE(review): mask_ is dereferenced unconditionally, so this assumes every
  * blob that gets updated has had its mask allocated -- confirm for blobs
  * (e.g. biases) on which Addmask() is not called.
  */
 template <typename Dtype>
 void Blob<Dtype>::Update() {
   // We will perform update based on where the data is located.
   switch (data_->head()) {
   case SyncedMemory::HEAD_AT_CPU:
     // perform computation on CPU: data += -1 * diff
     caffe_axpy<Dtype>(count_, Dtype(-1),
         static_cast<const Dtype*>(diff_->cpu_data()),
         static_cast<Dtype*>(data_->mutable_cpu_data()));
     // zero out pruned weights again: data = mask * data
     caffe_mul<Dtype>(count_,
       static_cast<const Dtype*>(mask_->cpu_data()),
       static_cast<const Dtype*>(data_->cpu_data()),
       static_cast<Dtype*>(data_->mutable_cpu_data()));
     break;
   case SyncedMemory::HEAD_AT_GPU:
   case SyncedMemory::SYNCED:
 #ifndef CPU_ONLY
     // perform computation on GPU: data += -1 * diff
     caffe_gpu_axpy<Dtype>(count_, Dtype(-1),
         static_cast<const Dtype*>(diff_->gpu_data()),
         static_cast<Dtype*>(data_->mutable_gpu_data()));
     // zero out pruned weights again: data = mask * data
     caffe_gpu_mul<Dtype>(count_,
       static_cast<const Dtype*>(mask_->gpu_data()),
       static_cast<const Dtype*>(data_->gpu_data()),
       static_cast<Dtype*>(data_->mutable_gpu_data()));
 #else
     NO_GPU;
 #endif
     break;
   default:
     LOG(FATAL) << "Syncedmem not initialized.";
   }
 }

2、为cpu下的计算和gpu下的计算分别添加形如weight_diff[i]*=mask[i];的运算，即用mask屏蔽被剪枝连接的梯度。

inner_product_layer.cpp:

 /**
  * CPU backward pass of the fully-connected layer with a pruning mask.
  *
  * - Weight gradient: accumulated into blobs_[0]'s diff by gemm (beta = 1),
  *   then multiplied element-wise by the mask so pruned connections receive
  *   no gradient.
  * - Bias gradient: accumulated into blobs_[1]'s diff (unmasked).
  * - Bottom gradient: written to bottom[0]'s diff (beta = 0).
  *
  * @param top             top blobs; top[0]'s diff is the incoming gradient.
  * @param propagate_down  propagate_down[0] enables the bottom gradient.
  * @param bottom          bottom blobs; bottom[0] holds the layer input.
  */
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down,
     const vector<Blob<Dtype>*>& bottom) {
   if (this->param_propagate_down_[0]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     const Dtype* bottom_data = bottom[0]->cpu_data();
     Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
     // Gradient with respect to weight
     if (transpose_) {
       caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
           K_, N_, M_,
           (Dtype)1., bottom_data, top_diff,
           (Dtype)1., weight_diff);
     } else {
       caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
           N_, K_, M_,
           (Dtype)1., top_diff, bottom_data,
           (Dtype)1., weight_diff);
     }
     // Apply the mask AFTER the gemm: masking before it only touches the
     // previously stored diff, and the freshly accumulated gradient of the
     // pruned weights would slip through unmasked.
     const Dtype* mask = this->blobs_[0]->cpu_mask();
     const int count = N_ * K_;  // same element count for either layout
     for (int j = 0; j < count; ++j) {
       weight_diff[j] *= mask[j];
     }
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     // Gradient with respect to bias
     caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
         bias_multiplier_.cpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_cpu_diff());
   }
   if (propagate_down[0]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     // Gradient with respect to bottom data
     if (transpose_) {
       caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
           M_, K_, N_,
           (Dtype)1., top_diff, this->blobs_[0]->cpu_data(),
           (Dtype)0., bottom[0]->mutable_cpu_diff());
     } else {
       caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
           M_, K_, N_,
           (Dtype)1., top_diff, this->blobs_[0]->cpu_data(),
           (Dtype)0., bottom[0]->mutable_cpu_diff());
     }
   }
 }

inner_product_layer.cu:

 /**
  * GPU backward pass of the fully-connected layer with a pruning mask.
  *
  * - Weight gradient: accumulated into blobs_[0]'s diff by gemm (beta = 1),
  *   then multiplied element-wise by the mask so pruned connections receive
  *   no gradient.
  * - Bias gradient: accumulated into blobs_[1]'s diff (unmasked).
  * - Bottom gradient: written to bottom[0]'s diff (beta = 0).
  *
  * @param top             top blobs; top[0]'s diff is the incoming gradient.
  * @param propagate_down  propagate_down[0] enables the bottom gradient.
  * @param bottom          bottom blobs; bottom[0] holds the layer input.
  */
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down,
     const vector<Blob<Dtype>*>& bottom) {
   if (this->param_propagate_down_[0]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     const Dtype* bottom_data = bottom[0]->gpu_data();
     Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
     // Gradient with respect to weight
     if (transpose_) {
       caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
           K_, N_, M_,
           (Dtype)1., bottom_data, top_diff,
           (Dtype)1., weight_diff);
     } else {
       caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans,
           N_, K_, M_,
           (Dtype)1., top_diff, bottom_data,
           (Dtype)1., weight_diff);
     }
     // Apply the mask AFTER the gemm: masking before it only touches the
     // previously stored diff, and the freshly accumulated gradient of the
     // pruned weights would slip through unmasked.
     // weight_diff is a device pointer, so the element-wise product must go
     // through caffe_gpu_mul rather than a host-side loop.
     const int count = N_ * K_;  // same element count for either layout
     caffe_gpu_mul<Dtype>(count, weight_diff,
         this->blobs_[0]->gpu_mask(), weight_diff);
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
     caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
         bias_multiplier_.gpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_gpu_diff());
   }
   if (propagate_down[0]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bottom data
     if (transpose_) {
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
           M_, K_, N_,
           (Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
           (Dtype)0., bottom[0]->mutable_gpu_diff());
     } else {
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
           M_, K_, N_,
           (Dtype)1., top_diff, this->blobs_[0]->gpu_data(),
           (Dtype)0., bottom[0]->mutable_gpu_diff());
     }
   }
 }

至此修改完毕。

另外，caffe在新的版本中已添加sparse_参数，参考 https://github.com/BVLC/caffe/pulls?utf8=%E2%9C%93&q=sparse

码农公寓

相关文章