PyTorch Text Multi-Class Classification

Modified from https://github.com/jiangqy/LSTM-Classification-pytorch.

1. Requirement: SMS Text Classification

1.1 Raw Data

The corpus is mostly English, manually labeled into four classes: 0, 1, 2, 3.
Text length: at most 300 words per sample.
The text has already been preprocessed: all other characters were stripped, keeping only letters, with spaces as separators.

import pandas as pd

df = pd.read_csv('./data/labeled.csv')
df = df[['clean_review', 'cat_id']]  # keep the text column and the label column
df.sample(10)
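
The cleaning step itself is not shown in the post. A minimal sketch of what it could look like, assuming a hypothetical raw-text column named review (the regex and lowercasing are assumptions, not the author's exact pipeline):

import re

def clean_text(text):
    # keep only letters; replace every other character with a space
    text = re.sub(r'[^a-zA-Z]+', ' ', str(text))
    return text.lower().strip()

df['clean_review'] = df['review'].apply(clean_text)  # 'review' is a hypothetical column name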

2. Building the Training Samples

2.1 Features and Labels

import torch
from torch.utils.data import Dataset, DataLoader  # DataLoader is used further below
import numpy as np
import pandas as pd

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
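
A quick illustration of how the dictionary assigns indices in insertion order:

d = Dictionary()
d.add_word("UNK")    # gets index 0
d.add_word("hello")  # gets index 1
d.add_word("hello")  # already present, still index 1
print(d.word2idx["hello"], len(d))  # -> 1 2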

class Corpus(object):

    def __init__(self, sen_len):
        self.dictionary = Dictionary()
        self.dictionary.add_word("UNK")  # texts are padded to a fixed length with 0s; index 0 is reserved for "UNK"
        self.texts, self.labels = self.tokenize(sen_len)

    def tokenize(self, sen_len):
        """
        Build the dictionary and convert each text from words to word indices.
        :param sen_len: fixed sentence length
        :return: list of index tensors and list of labels
        """
        df = pd.read_csv("./data/clean_review.csv")
        token_text = []
        tokens = 0
        labels = []
        for item in df.iterrows():
            line = item[1]["clean_review"]  # value of the clean_review field in this row
            labels.append(int(item[1]["cat_id"]))  # value of the cat_id field, i.e. this sample's label
            words = line.split(" ") 

            tokens += len(words)
            for word in words:
                word = word.strip()
                if word:
                    self.dictionary.add_word(word)

            txt = torch.LongTensor(np.zeros(sen_len, dtype=np.int64))  # zero-initialized tensor of length sen_len (zero-padding)
            for index,word in enumerate(words[:sen_len]):
                word = word.strip()
                if word:
                    txt[index] = self.dictionary.word2idx[word]
            token_text.append(txt)

        return token_text,labels
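
With sen_len set to 300 (the maximum text length from Section 1.1), building the corpus and inspecting the result might look like this:

corpus = Corpus(sen_len=300)
print(len(corpus.dictionary))  # vocabulary size, including "UNK" at index 0
print(corpus.texts[0].shape)   # torch.Size([300]); shorter texts are zero-padded
print(corpus.labels[:5])       # the first five labels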

2.2 Custom Dataset

class LSTMDataset(Dataset):

    def __init__(self, sen_len, corpus):
        self.token_text = corpus.texts
        self.labels = corpus.labels
        self.sen_len = sen_len

    def __getitem__(self, index):
        """
        根据索引获取对应的特征和标签
        :param index:
        :return:
        """
        text = self.token_text[index]
        label = torch.LongTensor([self.labels[index]])
        return text, label

    def __len__(self):
        return len(self.labels)
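
Fetching a single item shows the shapes the model will receive (sentence length 300 assumed):

train_set = LSTMDataset(300, corpus)
text, label = train_set[0]
print(text.shape, label.shape)  # torch.Size([300]) torch.Size([1])
print(len(train_set))           # number of samples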

3. The LSTM Model

import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable


class LSTMClassifier(nn.Module):
    """
    LSTM模型
    """
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.use_gpu = use_gpu

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)  # embedding matrix: vocab_size x embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)
        self.hidden2label = nn.Linear(hidden_dim, label_size)  # four classes, so label_size=4
        self.hidden = self.init_hidden()

    def init_hidden(self):
        if self.use_gpu:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = embeds.view(len(sentence), self.batch_size, -1)  # (seq_len, batch, embedding_dim)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])  # classify on the last time step's output
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so an explicit softmax here would weaken the gradients.
        return y
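
A quick shape check with assumed hyperparameters (embedding_dim=100, hidden_dim=50, a batch of 16, placeholder vocab_size=5000) shows why the training loop below transposes the input with .t(): the model expects (seq_len, batch):

model = LSTMClassifier(embedding_dim=100, hidden_dim=50, vocab_size=5000,
                       label_size=4, batch_size=16, use_gpu=False)
dummy = torch.zeros(300, 16, dtype=torch.long)  # (seq_len, batch) of word indices
# embeds: (300, 16, 100) -> lstm_out: (300, 16, 50) -> lstm_out[-1]: (16, 50)
logits = model(dummy)
print(logits.shape)  # torch.Size([16, 4]), one logit per class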

# Build the final train_loader; training simply iterates over it
# (sentence_len and batch_size are hyperparameters set elsewhere, e.g. 300 and 16)
corpus = Corpus(sentence_len)
train_set = LSTMDataset(sentence_len, corpus)
train_loader = DataLoader(train_set,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=4
                         )
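
The training loop in Section 5 also iterates over a test_loader, which the original post never constructs. A minimal sketch using a hypothetical 80/20 random split (in a real setup train_loader would be built from the train split the same way):

from torch.utils.data import random_split

n_test = len(train_set) // 5  # hold out 20% for testing (assumed split)
train_subset, test_subset = random_split(train_set, [len(train_set) - n_test, n_test])
test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False, num_workers=4)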

4. Loss Function and Optimizer
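
The snippet below assumes model, learning_rate, and use_gpu are already defined. A minimal sketch with assumed hyperparameter values:

import torch.optim as optim

use_gpu = torch.cuda.is_available()
learning_rate = 0.01  # assumed value
model = LSTMClassifier(embedding_dim=100, hidden_dim=50,  # assumed sizes
                       vocab_size=len(corpus.dictionary),
                       label_size=4, batch_size=batch_size,
                       use_gpu=use_gpu)
if use_gpu:
    model = model.cuda()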

optimizer = optim.SGD(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

CrossEntropyLoss combines LogSoftmax and NLLLoss in one step, so the model must output raw logits; that is why forward() above returns the linear layer's output without applying softmax.

5. Training
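
The loop below assumes the per-epoch metric lists, the epoch count, and the adjust_learning_rate helper already exist; the helper comes from the upstream repository, and the step-decay schedule in this sketch is an assumption:

epochs = 20  # assumed number of epochs
train_loss_, train_acc_, test_loss_, test_acc_ = [], [], [], []

def adjust_learning_rate(optimizer, epoch):
    # hypothetical step decay: divide the learning rate by 10 every 10 epochs
    lr = learning_rate * (0.1 ** (epoch // 10))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer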

for epoch in range(epochs):
    optimizer = adjust_learning_rate(optimizer, epoch)

    ## training epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for iter, traindata in enumerate(train_loader):
        train_inputs, train_labels = traindata
        train_labels = torch.squeeze(train_labels)

        if use_gpu:
            train_inputs, train_labels = Variable(train_inputs.cuda()), train_labels.cuda()
        else:
            train_inputs = Variable(train_inputs)

        model.zero_grad()
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs.t())

        loss = loss_function(output, Variable(train_labels))
        loss.backward()
        optimizer.step()

        # compute training accuracy
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels).sum().item()
        total += len(train_labels)
        total_loss += loss.item()

    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)
    ## testing epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    model.eval()
    with torch.no_grad():  # gradients are not needed during evaluation
        for iter, testdata in enumerate(test_loader):
            test_inputs, test_labels = testdata
            test_labels = torch.squeeze(test_labels)

            if use_gpu:
                test_inputs, test_labels = Variable(test_inputs.cuda()), test_labels.cuda()
            else:
                test_inputs = Variable(test_inputs)

            model.batch_size = len(test_labels)
            model.hidden = model.init_hidden()
            output = model(test_inputs.t())

            loss = loss_function(output, Variable(test_labels))

            # compute testing accuracy
            _, predicted = torch.max(output.data, 1)
            total_acc += (predicted == test_labels).sum().item()
            total += len(test_labels)
            total_loss += loss.item()
    model.train()
    test_loss_.append(total_loss / total)
    test_acc_.append(total_acc / total)

    print('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
          % (epoch, epochs, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))
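
Because forward() returns raw logits (Section 3), class probabilities at inference time come from applying softmax explicitly. A sketch, reusing the last test batch from the loop above:

with torch.no_grad():
    logits = model(test_inputs.t())
    probs = F.softmax(logits, dim=1)    # per-class probabilities
    preds = torch.argmax(probs, dim=1)  # predicted class ids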

6. Saving the Model

scripted_model = torch.jit.script(model)  # torch.jit.script produces a scripted, not traced, module
scripted_model.save("lstm.pt")

If the saved model will be consumed from Java rather than directly from Python, export it with PyTorch's torch.jit.script method so it can run without a Python interpreter.
A tutorial (in Chinese): https://liuzhian.github.io/2021/04/08/初识TorchScript/
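
Loading the exported file back from Python is a quick sanity check of the export (the Java side would use the libtorch/DJL bindings instead):

loaded = torch.jit.load("lstm.pt")  # returns a ScriptModule
loaded.eval()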

Original post (in Chinese): https://www.cnblogs.com/leimu/p/15789279.html