MindSpore 1.0.0 可视化 体验

转载地址:https://bbs.huaweicloud.com/forum/thread-83182-1-1.html

作者:飞翔的企鹅

概述

最近从朋友那儿得知华为的深度学习框架MindSpore更新到了1.0.0版本,效率比起之前提高了好多。到MindSpore官网上看了一番,官网上的可视化教程记事本文档引起了我的兴趣。于是,打算在这篇记事本文档的基础之上,学习一下如何记录训练过程的数据。

过程

首先是下载数据集

附上代码:

import os, shutil
import urllib.request
from urllib.parse import urlparse


def callbackfunc(blocknum, blocksize, totalsize):
    """Print download progress; intended as the reporthook of ``urlretrieve``.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of one block in bytes.
        totalsize: Total size of the file in bytes. May be zero or negative
            when the server does not report a size.
    """
    if totalsize <= 0:
        # Size unknown: a percentage cannot be computed (0 would divide by
        # zero, -1 would print a negative value), so show the byte count.
        print("downloaded {} bytes".format(blocknum * blocksize), end="\r")
        return
    # The last block usually overshoots the real size, so cap at 100.
    percent = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    print("downloaded {:.1f}".format(percent), end="\r")

def _download_dataset():
    """Download the CIFAR-10 binary archive into ./datasets and unpack it.

    The download is skipped when the archive file already exists; the archive
    is unpacked into ./datasets/cifar-10-binary on every call.
    """
    ds_url = "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz"
    download_dir = "./datasets"
    file_base_name = urlparse(ds_url).path.split("/")[-1]
    file_name = os.path.join(download_dir, file_base_name)
    # urlretrieve opens the target file directly and does not create missing
    # parent directories.
    os.makedirs(download_dir, exist_ok=True)
    if not os.path.exists(file_name):
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer is never mistaken for a complete archive on
        # the next run.
        partial_name = file_name + ".part"
        urllib.request.urlretrieve(ds_url, partial_name, callbackfunc)
        os.replace(partial_name, file_name)
    print("{:*^40}".format("DataSets Downloaded"))
    shutil.unpack_archive(file_name, extract_dir="./datasets/cifar-10-binary")

def _copy_dataset(ds_part, dest_path):
    """Copy one extracted CIFAR-10 batch file into ``dest_path``.

    Args:
        ds_part: File name of the batch, e.g. ``"data_batch_1.bin"``.
        dest_path: Destination passed to ``shutil.copy``.

    If the batch file is not present under the extraction directory, the
    archive is downloaded and unpacked first.
    """
    extracted_root = "./datasets/cifar-10-binary/cifar-10-batches-bin"
    source_file = os.path.join(extracted_root, ds_part)
    already_extracted = os.path.exists(source_file)
    if not already_extracted:
        _download_dataset()
    shutil.copy(source_file, dest_path)

def download_cifar10_dataset():
    """Lay out the CIFAR-10 binary batches in separate train/test folders.

    Creates ``./datasets/cifar-10-batches-bin/train`` and ``.../test`` if
    needed, then copies the five ``data_batch_N.bin`` files into ``train`` and
    ``test_batch.bin`` into ``test``. Files that already exist at the
    destination are not copied again.
    """
    ds_base_path = "./datasets/cifar-10-batches-bin"
    train_path = os.path.join(ds_base_path, "train")
    test_path = os.path.join(ds_base_path, "test")
    print("{:*^40}".format("Checking DataSets Path."))
    # Create each directory independently: the earlier check looked at
    # train_path twice, so a missing test directory was never created when
    # train already existed.
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)
    print("{:*^40}".format("Downloading CIFAR-10 DataSets."))
    for i in range(1, 6):
        train_part = "data_batch_{}.bin".format(i)
        if not os.path.exists(os.path.join(train_path, train_part)):
            _copy_dataset(train_part, train_path)
        print("{:*^40}".format(train_part + " is ok"))
    test_part = "test_batch.bin"
    if not os.path.exists(os.path.join(test_path, test_part)):
        _copy_dataset(test_part, test_path)
    print("{:*^40}".format(test_part + " is ok"))
    print("{:*^40}".format("Downloaded CIFAR-10 DataSets Already."))

# Prepare the dataset folders; batch files already in place are not copied again.
download_cifar10_dataset()

原来这里是使用urllib模块将cifar10数据集下载到本地目录

处理数据集

数据集下载下来后还不能直接用于训练,需要进行一些预处理操作,研究了一下,主要是用了以下几个处理方法:

    # Excerpt of the image preprocessing pipeline.
    # NOTE(review): `cifar_ds`, `rescale`, `shift`, `mstype` and `normalize_op`
    # are used below but not defined in this excerpt; they must come from the
    # surrounding code.
    import mindspore.dataset.transforms.c_transforms as C
    import mindspore.dataset.vision.c_transforms as CV

    # Build the individual operations.
    resize_op = CV.Resize(size=(227, 227))
    rescale_op = CV.Rescale(rescale, shift)
    channel_swap_op = CV.HWC2CHW()
    # NOTE(review): typecast_op is created here but never applied in this excerpt.
    typecast_op = C.TypeCast(mstype.int32)

    # Apply the operations to the "image" column, one map() call per operation,
    # in this order: resize, rescale, normalize, HWC -> CHW.
    cifar_ds = cifar_ds.map(operations=resize_op, input_columns="image")
    cifar_ds = cifar_ds.map(operations=rescale_op, input_columns="image")
    cifar_ds = cifar_ds.map(operations=normalize_op, input_columns="image")
    cifar_ds = cifar_ds.map(operations=channel_swap_op, input_columns="image")

使用pyplot查看一下预处理的数据图像:

from matplotlib import pyplot as plt
import numpy as np

label_list = ["airplane", "automobile", "bird", "cat", "deer", "dog", "rog", "horse", "ship", "truck"]
print("The 32 images with label of the first batch in ds_train are showed below:")
ds_iterator = ds_train.create_dict_iterator()
ds_iterator.get_next()
batch_1 = ds_iterator.get_next()
batch_image = batch_1["image"].asnumpy()
batch_label = batch_1["label"].asnumpy()
%matplotlib inline
plt.figure(dpi=144)
for i,image in enumerate(batch_image):
    plt.subplot(4, 8, i+1)
    plt.subplots_adjust(wspace=0.2, hspace=0.2)
    image = image/np.amax(image)
    image = np.clip(image, 0, 1)
    image = np.transpose(image,(1,2,0))
    plt.imshow(image)
    num = batch_label<i>
    plt.title(f"image {i+1}\n{label_list[num]}", y=-0.65, fontdict={"fontsize":8})
    plt.axis('off')    
plt.show()

好激动,打印出了第一个batch的图像信息:

MindSpore 1.0.0 可视化 体验

定义网络

现在终于到了定义网络这一步了,还是用上终极大招:Ctrl + C/Ctrl + V,一步到位:

import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P

def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"):
    """Build a bias-free ``nn.Conv2d`` initialised via ``weight_variable()``.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride (default 1).
        padding: Padding amount forwarded to ``nn.Conv2d`` (default 0).
        pad_mode: Padding mode forwarded to ``nn.Conv2d`` (default "valid").

    Returns:
        The constructed ``nn.Conv2d`` layer.
    """
    layer_options = {
        "kernel_size": kernel_size,
        "stride": stride,
        "padding": padding,
        "weight_init": weight_variable(),
        "has_bias": False,
        "pad_mode": pad_mode,
    }
    return nn.Conv2d(in_channels, out_channels, **layer_options)

def fc_with_initialize(input_channels, out_channels):
    """Build an ``nn.Dense`` layer whose weight and bias both use ``weight_variable()``.

    Args:
        input_channels: Size of the layer input.
        out_channels: Size of the layer output.

    Returns:
        The constructed ``nn.Dense`` layer.
    """
    weight_init, bias_init = weight_variable(), weight_variable()
    return nn.Dense(input_channels, out_channels, weight_init, bias_init)

def weight_variable():
    """Return a fresh ``TruncatedNormal(0.02)`` initializer."""
    init_arg = 0.02
    return TruncatedNormal(init_arg)


class AlexNet(nn.Cell):
    """
    AlexNet-style classifier that also records summary data.

    The network stacks five convolutions and three dense layers. In addition
    to the forward computation, ``construct`` passes the input batch to an
    ImageSummary operator and the output of the first convolution to a
    TensorSummary operator.

    Args:
        num_classes: Output size of the last dense layer (default 10).
        channel: Number of input channels of the first convolution (default 3).
    """
    def __init__(self, num_classes=10, channel=3):
        super(AlexNet, self).__init__()
        # Convolution stack; conv1 uses the default "valid" padding with
        # stride 4, the remaining convolutions use "same" padding.
        self.conv1 = conv(channel, 96, 11, stride=4)
        self.conv2 = conv(96, 256, 5, pad_mode="same")
        self.conv3 = conv(256, 384, 3, pad_mode="same")
        self.conv4 = conv(384, 384, 3, pad_mode="same")
        self.conv5 = conv(384, 256, 3, pad_mode="same")
        # Activation and pooling operators are created once and reused.
        self.relu = nn.ReLU()
        self.max_pool2d = P.MaxPool(ksize=3, strides=2)
        self.flatten = nn.Flatten()
        # fc1 expects 6*6*256 flattened features -- presumably the result of
        # feeding 227x227 images through the layers above; TODO confirm.
        self.fc1 = fc_with_initialize(6*6*256, 4096)
        self.fc2 = fc_with_initialize(4096, 4096)
        self.fc3 = fc_with_initialize(4096, num_classes)
        # Summary operator used in construct() to record a tensor
        self.tensor_summary = P.TensorSummary()
        # Summary operator used in construct() to record images
        self.image_summary = P.ImageSummary()

    def construct(self, x):
        """Run the forward pass on ``x`` and return the output of ``fc3``."""
        # Record the input batch under the name "Image"
        self.image_summary("Image", x)
        x = self.conv1(x)
        # Record the conv1 output (before ReLU) under the name "Tensor"
        self.tensor_summary("Tensor", x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        # conv3 and conv4 are not followed by pooling
        x = self.conv3(x)
        x = self.relu(x)
        x = self.conv4(x)
        x = self.relu(x)
        x = self.conv5(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        # Flatten, then apply the three dense layers
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        # No activation after fc3; its output is returned as-is
        x = self.fc3(x)
        return x

大概明白了,定义的AlexNet网络分为了好多层(construct方法中的),然后使用TensorSummary记录张量数据,ImageSummary记录图像数据。
但其他参数数据是怎么记录的呢,继续学习。

开始训练

不懂就是Ctrl+C

# NOTE(review): Tensor, get_lr, TimeMonitor, CheckpointConfig, ModelCheckpoint,
# Model, Accuracy, SummaryCollector, LossMonitor, load_checkpoint,
# load_param_into_net, ds_train and ds_eval are not defined in this excerpt;
# they must be imported or created by the surrounding code.

# Network, loss, learning rate, optimizer and training callbacks.
network = AlexNet(num_classes=10)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
lr = Tensor(get_lr(0, 0.002, 10, ds_train.get_dataset_size()))
net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=0.9)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
# NOTE(review): 1562 presumably equals the number of steps per epoch
# (ds_train.get_dataset_size()); the checkpoint file name used for testing
# below depends on it -- confirm.
config_ck = CheckpointConfig(save_checkpoint_steps=1562, keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

# Start the MindInsight service on port 8080, pointing it at the summary base directory.
summary_base_dir = "./summary_dir"
os.system(f"mindinsight start --summary-base-dir  {summary_base_dir} --port=8080")

# Init a SummaryCollector callback instance, and use it in model.train or model.eval
# The summary files are written to a sub-directory of summary_base_dir.
specified = {"collect_metric": True, "histogram_regular": "^conv1.*|^conv2.*", "collect_graph": True, "collect_dataset_graph": True}
summary_collector = SummaryCollector(summary_dir="./summary_dir/summary_01", collect_specified_data=specified, collect_freq=1, keep_default_action=False, collect_tensor_freq=200)

print("============== Starting Training ==============")
model.train(epoch=10, train_dataset=ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor(), summary_collector], dataset_sink_mode=True)

print("============== Starting Testing ==============")
# Load a saved checkpoint into the network before evaluating.
# NOTE(review): the hard-coded file name presumably follows
# "<prefix>-<epoch>_<step>.ckpt"; verify that this file exists after training.
param_dict = load_checkpoint("checkpoint_alexnet-10_1562.ckpt")
load_param_into_net(network, param_dict)
acc = model.eval(ds_eval, callbacks=summary_collector, dataset_sink_mode=True)
print("============== {} ==============".format(acc))

仔细看了一遍代码后,原来是使用了SummaryCollector回调(callback)来记录标量数据和参数分布图的。SummaryCollector的使用方法原文中有很好的解释,附上我的训练结果:

epoch: 10 step: 1562, loss is 0.40318152
Epoch time: 116258.987, per step time: 74.430
============== Starting Testing ==============
============== {'Accuracy': 0.8340344551282052} ==============

一共训练了10个epoch,精度达到0.83,结果还不错,GPU加速下时间也还挺快的。

查看结果

按照文档中的方法,在本地浏览器中打开127.0.0.1:8080,终于查看到了记录的结果:

MindSpore 1.0.0 可视化 体验

经过许久的时间,终于得到了收获。MindSpore使用起来真的很方便,而且参考官方教程文档学习和使用加速了学习和体验的过程。深深地为国产AI框架打call!

联系邮箱:602642050@qq.com

飞翔的企鹅

上一篇:国产计算框架mindspore在gpu环境下编译分支r1.3,使用sudo权限成功编译——(修复部分bug,给出具体编译过程)


下一篇:MindSpore模型精度调优实战:如何更快定位精度问题