DeepFM(3) torch实现

前言

源码在这个作者的github
https://github.com/EternalStarICe/recommendation-system-model
本文为其一个读后感。

1、数据加载

原始数据除Id和Label列之外，包含13列数值特征（列名前缀为I）和26列类别特征（列名前缀为C）。

def load_data(file_path='train.txt'):
    """Load the raw CSV and split its columns by feature type.

    Args:
        file_path: Path of the CSV file to read. Defaults to ``'train.txt'``,
            the path that used to be hard-coded in this function.

    Returns:
        A ``(raw_data, cat_cols, num_cols)`` tuple: the DataFrame with the
        ``Id`` column removed, the names of the columns containing ``'C'``
        (categorical features) and the names of the columns containing
        ``'I'`` (numeric features).

    Raises:
        KeyError: If the file has no ``Id`` column.
    """
    # Drop Id first so that it is not picked up by the 'I' filter below.
    raw_data = pd.read_csv(file_path).drop(columns=['Id'])
    column_names = list(raw_data.columns)
    cat_cols = [col for col in column_names if 'C' in col]  # categorical feature names
    num_cols = [col for col in column_names if 'I' in col]  # numeric feature names
    return raw_data, cat_cols, num_cols

2、数据预处理

对第一步得到的数据：数值特征先用0填充缺失值，再做log(x+1)变换（x≤-1时直接置为-1）；类别特征先用"-1"填充缺失值，再进行LabelEncoder编码。

def preprocess_data(df, cat_cols, num_cols):
    """Return a cleaned copy of ``df`` restricted to the feature columns.

    Numeric columns: missing values become 0.0, then every value ``v`` is
    mapped to ``log(v + 1)`` when ``v > -1`` and to ``-1`` otherwise.
    Categorical columns: missing values become the string ``"-1"``, then each
    column is label-encoded with its own ``LabelEncoder``.

    Args:
        df: Input DataFrame; it is copied and never modified.
        cat_cols: Names of the categorical columns.
        num_cols: Names of the numeric columns.

    Returns:
        A DataFrame with the categorical columns first, followed by the
        numeric columns.
    """
    def _shifted_log(value):
        # log(value + 1) is undefined for value <= -1; use -1 as a sentinel.
        if value > -1:
            return np.log(value + 1)
        return -1

    processed = df.copy()

    processed[num_cols] = processed[num_cols].fillna(0.0)
    for name in num_cols:
        processed[name] = processed[name].apply(_shifted_log)

    processed[cat_cols] = processed[cat_cols].fillna("-1")
    for name in cat_cols:
        # A fresh encoder per column: codes run from 0 to n_classes - 1.
        processed[name] = LabelEncoder().fit_transform(processed[name])

    ordered_columns = cat_cols + num_cols
    return processed[ordered_columns]
# Preprocess the raw features, then attach the raw 'Label' column to the
# result under the lower-case name 'label'.
train_data = preprocess_data(raw_data, cat_cols, num_cols).assign(label=raw_data['Label'])

3、类别特征个数统计

def get_cat_tuple_list(df, cat_cols):
    """Describe every categorical column by its name and vocabulary size.

    Args:
        df: DataFrame holding the categorical columns.
        cat_cols: Names of the categorical columns to describe.

    Returns:
        A list of ``cat_tuple(name, vocab_size)`` namedtuples, one per entry
        of ``cat_cols`` and in the same order; ``vocab_size`` is the value of
        ``Series.nunique()`` for that column.
    """
    # The namedtuple lets callers read the fields as .name and .vocab_size.
    cat_tuple = namedtuple('cat_tuple', ('name', 'vocab_size'))
    descriptors = []
    for column in cat_cols:
        distinct_count = df[column].nunique()
        descriptors.append(cat_tuple(name=column, vocab_size=distinct_count))
    return descriptors
# One (name, vocab_size) entry per categorical column of the preprocessed data.
cat_tuple_list = get_cat_tuple_list(df=train_data, cat_cols=cat_cols)

4、训练集生成

# Features: every column except 'label', stored as float32.
# NOTE(review): the label-encoded categorical indices are stored as float32
# as well; float32 cannot represent every integer above 2**24 exactly, so
# confirm no vocabulary is larger than that.
X = torch.tensor(train_data.drop(columns=['label']).to_numpy(), dtype=torch.float32)
# Targets: the 'label' column as int64.
y = torch.tensor(train_data['label'].to_numpy(), dtype=torch.int64)
# TensorDataset pairs each feature row with its label.
dataset = Data.TensorDataset(X, y)
data_iter = Data.DataLoader(dataset, batch_size=128, shuffle=True)

x的值：[26个类别特征编码后的索引值，13个经过预处理（缺失值填0、log变换）的数值特征]，长度为39。
y [1,0,...]

5、DeepFM模型

模型初始化传入的参数：
num_cols：数值特征名列表
cat_cols：类别特征名列表
cat_tuple_list：每个类别特征的(name, vocab_size)，即特征名及其取值个数
emb_len:嵌入矩阵维度k

class DeepFM(nn.Module):
    """Layers of a DeepFM model: linear terms, FM embeddings and a DNN tower.

    Args:
        num_cols: Names of the numeric feature columns.
        cat_cols: Names of the categorical feature columns.
        cat_tuple_list: One entry per categorical feature exposing a
            ``vocab_size`` attribute, which sizes that feature's embedding
            tables.
        emb_len: Length of each second-order embedding vector.
    """

    def __init__(self, num_cols, cat_cols, cat_tuple_list, emb_len=4):
        super().__init__()
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.cat_tuple_list = cat_tuple_list
        self.emb_len = emb_len
        self.num_col_len = len(num_cols)
        self.cat_col_len = len(cat_cols)
        # DNN input width: one scalar per numeric feature plus one
        # emb_len-long embedding vector per categorical feature.
        self.deep_input_dim = self.num_col_len + self.cat_col_len * self.emb_len

        # --- First-order (linear) part ---
        # One single-column embedding table per categorical feature: looking
        # up an index yields that category's first-order weight.
        self.fm_linear_embeddings = nn.ModuleList(
            [nn.Embedding(feature.vocab_size, 1) for feature in cat_tuple_list]
        )
        # All numeric features share one linear layer (one weight each).
        self.linear_dense = nn.Linear(self.num_col_len, 1)

        # --- FM (second-order) part ---
        # One emb_len-wide embedding table per categorical feature; only the
        # categorical features get second-order embeddings.
        self.fm_embeddings = nn.ModuleList(
            [nn.Embedding(feature.vocab_size, emb_len) for feature in cat_tuple_list]
        )

        # --- DNN part ---
        # Four linear layers; the last one has a single output unit.
        self.deep_linear1 = nn.Linear(self.deep_input_dim, 128)
        self.deep_linear2 = nn.Linear(128, 64)
        self.deep_linear3 = nn.Linear(64, 32)
        self.deep_final_linear = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.3)
        self.dropout3 = nn.Dropout(0.1)

        # Maps the three partial outputs (linear, fm, dnn) to two values.
        # NOTE(review): appears to be unused by the active forward pass in
        # this file - confirm before removing.
        self.final_linear = nn.Linear(3, 2)

2、前向传播
输入的x为：[类别特征编码之后的index, 预处理（log变换）之后的数值特征]

def FM(x):
    """Second-order FM interaction term.

    Computes ``0.5 * sum_f((sum_i x[:, i, f])**2 - sum_i x[:, i, f]**2)``.

    Args:
        x: Tensor of shape (batch, feature_num, emb_len); row ``i`` is the
            embedding vector of feature ``i``.

    Returns:
        Tensor of shape (batch, 1).
    """
    # Square of the sum over features -> (batch, 1, emb_len)
    square_of_sum = x.sum(dim=1, keepdim=True).pow(2)
    # Sum over features of the squares -> (batch, 1, emb_len)
    sum_of_square = x.pow(2).sum(dim=1, keepdim=True)
    # Sum over the embedding axis and halve -> (batch, 1)
    interaction = (square_of_sum - sum_of_square).sum(dim=2)
    return 0.5 * interaction

def forward(self, x):
    """Run the linear, FM and DNN parts on one batch and combine them.

    Args:
        x: Tensor whose first ``self.cat_col_len`` columns hold the encoded
            categorical indices and whose remaining columns hold the numeric
            features (assumed shape: (batch, cat_col_len + num_col_len)).
            At least one categorical column is required.

    Returns:
        Tensor of shape (batch, 2): column 0 is
        ``sigmoid(linear + fm + deep)`` and column 1 is ``1 -`` that value.
    """
    cat_x = x[:, :self.cat_col_len]
    num_x = x[:, self.cat_col_len:]
    # Embedding lookups need integer indices; convert each column once.
    cat_indices = [cat_x[:, i].long() for i in range(self.cat_col_len)]

    # --- Linear part ---
    linear_num_output = self.linear_dense(num_x)  # (batch, 1)
    # Each lookup returns the first-order weight of that row's category;
    # the contributions of all categorical features are summed.
    linear_cat_output = self.fm_linear_embeddings[0](cat_indices[0])
    for i in range(1, self.cat_col_len):
        linear_cat_output = linear_cat_output + self.fm_linear_embeddings[i](cat_indices[i])
    linear_output = linear_cat_output + linear_num_output  # (batch, 1)

    # --- FM part ---
    # Look up every second-order embedding once; the same tensors feed both
    # the FM term and the DNN, so both parts train the same tables.
    cat_embeddings = [
        self.fm_embeddings[i](cat_indices[i])  # (batch, emb_len)
        for i in range(self.cat_col_len)
    ]
    fm_input = torch.stack(cat_embeddings, dim=1)  # (batch, cat_col_len, emb_len)
    fm_output = FM(fm_input)  # (batch, 1)

    # --- DNN part ---
    # Input: the numeric features followed by every categorical embedding.
    deep_input = torch.cat([num_x] + cat_embeddings, dim=1)  # (batch, deep_input_dim)
    deep_output = self.dropout1(self.relu(self.deep_linear1(deep_input)))
    deep_output = self.dropout2(self.relu(self.deep_linear2(deep_output)))
    deep_output = self.dropout3(self.relu(self.deep_linear3(deep_output)))
    deep_output = self.deep_final_linear(deep_output)  # (batch, 1)

    # --- Combine ---
    output = self.sigmoid(linear_output + fm_output + deep_output)  # (batch, 1)
    # NOTE(review): column 0 carries the sigmoid output and column 1 its
    # complement - confirm this ordering matches how the loss function
    # interprets the integer labels.
    return torch.cat([output, 1 - output], dim=1)

fm部分注释：

fm_input = self.fm_embeddings[0](cat_x[:, 0].long()).unsqueeze(dim=1)

这部分的输出shape为：torch.Size([batch_size, 1, 4])，其对应fm公式中的\(v_{i,f}x_i\)。之前FM的代码里\(v\)是一个矩阵[特征数,隐向量维度]。现在有26个\(v\)，每个\(v\)有不同的特征数，因为每个类别特征编码之后的个数不同，但是有相同的隐向量维度。

left = torch.square(torch.sum(x, dim=1, keepdim=True)) # (batch, 1, emb_len)

x即为26个fm_input拼接成的长tensor，其形状为torch.Size([batch_size, 26, 4])。所谓DNN和FM共享权值，也就是共享x向量，继而共同训练嵌入矩阵\(v\)。对应的fm公式为：\(v_{i,f}x_i\)。\(i\)代表26个嵌入矩阵中的第i个，\(x_i\)代表第i个类别特征编码后的索引值；二者相乘，表示从嵌入矩阵\(v_i\)中取对应编码索引那一行的数据，该行数据维度为[1,4]，所以26行拼接得到的维度为[26,4]。对拼接后的向量进行求和，便是26个特征的交互。
left对应的fm公式为：\(\left(\sum_{i=1}^{n} v_{i,f}x_i\right)^2\)。先求和再平方。

right = torch.sum(torch.square(x), dim=1, keepdim=True) # (batch, 1, emb_len)

right对应的fm公式为：\(\sum_{i=1}^{n} v_{i,f}^2 x_i^2\)。先平方再求和。

res = 0.5 * torch.sum((left - right), dim=2) # (batch, 1)

res中的torch.sum(dim=2)对应fm公式中的\(\sum_{f=1}^{k}\)。在dim=2的维度进行求和，即维数为4的那个维度，也就是对隐向量的各个分量求和。

DNN部分解释

DNN的输入数据为：[预处理后的数值特征，类别特征经过矩阵嵌入之后的值]。但是计算FM二阶特征交互的时候使用的只是类别特征经过矩阵嵌入之后的值。有一个考虑，在DeepFM中的fm，使用的不再是一个嵌入矩阵，而是多个，并且每个嵌入矩阵是针对类别特征进行embedding_lookup，即进行无需one-hot向量的embedding操作。所以数值特征没有参与FM的二阶交互（一阶线性部分通过linear_dense仍然使用了数值特征），在DNN阶段又将其加上了。