常用代码片段及技巧

自动选择GPU和CPU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model and tensor to device
vgg = models.vgg16().to(device)

切换当前目录

import os
try:
	os.chdir(os.path.join(os.getcwd(), '..'))
	print(os.getcwd())
except:
	pass

临时添加环境目录

import sys
sys.path.append('引用模块的地址')
print(sys.path)

打印模型参数

from torchsummary import summary
# 1 means in_channels
summary(model, (1, 28, 28))

将tensor的列表转换为tensor

x = torch.stack(tensor_list)

内存不够

Smaller batch size

torch.cuda.empty_cache()every few minibatches

分布式计算

训练数据和测试数据分开

每次用完之后删去variable，采用del x

debug tensor memory

resource` module is a Unix specific package as seen in https://docs.python.org/2/library/resource.html which is why it worked for you in Ubuntu, but raised an error when trying to use it in Windows.

Here is what solved it for me.

Downgrade to the Apache Spark 2.3.2 prebuild version

Install (or downgrade) jdk to version 1.8.0

My installed jdk was 1.9.0, which doesn't seem to be compatiable with spark 2.3.2 or 2.4.0

make sure that when you run java -version in cmd (command prompt), it show java version 8. If you are seeing version 9, you will need to change your system ENV PATH to ensure it points to java version 8.

Check this link to get help on changing the PATH if you have multiple java version installed.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def debug_memory():
    import collections, gc, resource, torch
    print('maxrss = {}'.format(
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    tensors = collections.Counter((str(o.device), o.dtype, tuple(o.shape))
                                  for o in gc.get_objects()
                                  if torch.is_tensor(o))
    for line in sorted(tensors.items()):
        print('{}	{}'.format(*line))
        
        
 # example
import tensor
 x = torch.tensor(3,3)
 debug_memory()
 
 y = torch.tensor(3,3)
 debug_memory()
 
 z = [torch.randn(i).long() for i in range(10)]
 debug_memory()

10-18-2019

Matlab绘虚线图

%matplotlib inline
from matplotlib import pyplot as plt
from IPython import display
import torch
import math

x = torch.arange(-7, 7, 0.01)
# Mean and variance pairs
parameters = [(0,1), (0,2), (3,1)]

# Display SVG rather than JPG
display.set_matplotlib_formats('svg')
plt.figure(figsize=(10, 6))
for (mu, sigma) in parameters:
    p = (1/math.sqrt(2 * math.pi * sigma**2)) * torch.exp(-(0.5/sigma**2) * (x-mu)**2)
    plt.plot(x.numpy(), p.numpy(), label='mean ' + str(mu) + ', variance ' + str(sigma))
plt.axhline(y=0, color='black', linestyle='dashed')
plt.legend()
plt.show()

loss训练代码（训练集与验证集）

lr = 0.03  # Learning rate
num_epochs = 3  # Number of iterations
net = linreg  # Our fancy linear model
loss = squared_loss  # 0.5 (y-y')^2

for epoch in range(num_epochs):
    # Assuming the number of examples can be divided by the batch size, all
    # the examples in the training data set are used once in one epoch
    # iteration. The features and tags of mini-batch examples are given by X
    # and y respectively
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y)  # Minibatch loss in X and y
        l.mean().backward()  # Compute gradient on l with respect to [w,b]
        sgd([w, b], lr, batch_size)  # Update parameters using their gradient
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
        print('epoch %d, loss %f' % (epoch + 1, train_l.mean().numpy()))

保存最佳模型

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time() # 计时开始

    best_model_wts = model.state_dict() # 读取训练好的模型权重
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # 每个epoch中游训练和验证部分
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  
            else:
                model.train(False)  

            running_loss = 0.0
            running_corrects = 0


            for data in dataloaders[phase]:

                inputs, labels = data

                # 如果使用GPU，则使用Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # 初始化梯度值
                optimizer.zero_grad()

                # 前向
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                # 后向，如果为训练集则进行梯度优化
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # 统计损失
                running_loss += loss.data[0]
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # 深度复制该模型
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # 载入最佳的模型
    model.load_state_dict(best_model_wts)
    return model