E-commerce recommendation learning code; the movie dataset part uses multi-model comparison to select the best model

import pandas as pd
import numpy as np
import matplotlib as mpl

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import random as rnd
from datetime import datetime
import re

import missingno as msno
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax
from difflib import SequenceMatcher

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec


# Suppress warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings("ignore", "This pattern has match groups")

# Suppress SettingWithCopyWarning:
pd.options.mode.chained_assignment = None  # default='warn'

# -------------------------------- Load data ----------------------------------------------------------------
# Load the datasets
data1 = pd.read_csv("train_full.csv", low_memory=False)
data_ds = pd.read_csv('train.csv')
data1.head()  # preview the first rows
data1.info()  # show dataset info
# The dataset has 73 columns and 5,802,400 rows

# Visualize the missing data as a matrix
main_df = data1[['gender','location_type','language','OpeningTime','city_id','vendor_rating']]
msno.matrix(main_df)

# Gender
print(data1['gender'].value_counts())
gender_null = np.count_nonzero(data1['gender'].isnull())
print(gender_null)
print("Gender null ratio : ", gender_null/data1.shape[0])

# Visualize the gender categories
sns.countplot(x='gender', data=data1)
plt.show()

# Location
print(data1['location_type'].value_counts())
location_null = np.count_nonzero(data1['location_type'].isnull())
print(location_null)
print("Location null ratio : ", location_null/data1.shape[0])

# Visualize the location types
sns.countplot(x='location_type', data=data1)
plt.show()

# Cross-visualization of location type by gender
sns.countplot(x='gender', hue='location_type', data=data1)
plt.show()

# Language
print(data1['language'].value_counts())
language_null = np.count_nonzero(data1['language'].isnull())
print(language_null)
print("Language null ratio : ", language_null/data1.shape[0])

# Vendor opening time
print(data1['OpeningTime'].value_counts())
open_time_null = np.count_nonzero(data1['OpeningTime'].isnull())
print(open_time_null)
print("Opening time null ratio : ", open_time_null/data1.shape[0])


# The vendor's average rating
print(data1['vendor_rating'].value_counts())
null = np.count_nonzero(data1['vendor_rating'].isnull())
print(null)
print("Vendor rating null ratio : ", null/data1.shape[0])

# Visualization
sns.countplot(x='vendor_rating', data=data1)
plt.show()

# ----------------------------------------- The orders dataset -------------------------------------------------
data2 = pd.read_csv("orders.csv")
data2.head()
data2.info()

# vendor_rating
print(data2['vendor_rating'].value_counts())
null = np.count_nonzero(data2['vendor_rating'].isnull())
print(null)
print("Vendor rating null ratio : ", null/data2.shape[0])
sns.countplot(x='vendor_rating', data=data2)

# vendor_id
print(data2['vendor_id'].value_counts())

# customer_id
print(data2['customer_id'].value_counts())

# ------------------------------------------- Data processing ---------------------------------------------
# Data preparation for collaborative filtering
# Select a subset of columns from train_full.csv as dataset1
# and rename vendor_rating to mean_rating
dataset1 = data1[['customer_id','gender','location_type','id','OpeningTime',
                  'language','vendor_rating','serving_distance','vendor_tag_name','delivery_charge']]
dataset1.rename(columns={"vendor_rating": "mean_rating"}, inplace=True)

# Build a derived column 'all' from dataset1's customer_id and id columns
cols = ['customer_id', 'id']
dataset1['all'] = dataset1[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
dataset1.head()

# Drop duplicates
dataset1.drop_duplicates(['all'], inplace=True)

# Select a subset of columns from data2 (orders.csv)
dataset2 = data2[['akeed_order_id','customer_id','vendor_id', 'item_count',
                  'grand_total', 'vendor_rating']][:]
dataset2.rename(columns={"vendor_id": "id"}, inplace=True)  # rename: vendor_id -> id

# Build a derived column 'all' from dataset2's customer_id and id columns
cols = ['customer_id', 'id']
dataset2['all'] = dataset2[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
dataset2.head()

# Check the shapes before merging
print(dataset1.shape)
print(dataset2.shape)

# Merge dataset1 and dataset2
df1 = pd.merge(dataset1, dataset2, on='all', how='inner')
df1.head()  # preview

# Rename the merged columns and drop the duplicated key columns
df1.rename(columns={"customer_id_x": "customer_id"}, inplace=True)
df1.rename(columns={"id_x": "vendor_id"}, inplace=True)
df1.drop(['customer_id_y','id_y'], axis=1, inplace=True)

# Data preparation for content-based filtering
# Select a subset of columns from dataset1 (the cleaned data)
df2 = dataset1[['customer_id','id','vendor_tag_name']]
df2.rename(columns={'id':'vendor_id'}, inplace=True)
df2.head()

# ------------------------------------------- Data cleaning ---------------------------------------------
cols = ['serving_distance', 'delivery_charge',
        'item_count', 'grand_total', 'vendor_rating']

def null_check(x):
    null = np.count_nonzero(df1[x].isnull())
    print(null)
    return null/df1.shape[0]

# Print the null ratio of each column
for i in cols:
    print(i, 'null ratio :', null_check(i))

# Drop the language column
df1.drop(['language'], axis=1, inplace=True)

# Drop rows where gender is null
df1 = df1[df1['gender'].notnull()].reset_index(drop=True)

# Convert the 'gender' column to a one-hot encoding (int)
sex = pd.get_dummies(df1["gender"], prefix="sex", drop_first=True)

df1 = pd.concat([df1, sex], axis=1)

# Drop the original gender column
df1.drop(['gender'], axis=1, inplace=True)
df1.rename(columns={'vendor_rating': 'rating'}, inplace=True)
print(df1.shape)
df1.head()

df1_train_for_anal = df1[:]  # keep a copy for the analysis below

# ---------------- Content-based data cleaning ---------------------
train_for = df1[:]
train_contents = train_for[['customer_id','vendor_id','OpeningTime','vendor_tag_name']]
train_contents.head()

# Preprocess the 'OpeningTime' column:
# split it into two columns (Open / Close)
train_contents['OpeningTime'].fillna('-', inplace=True)

time_split = train_contents.OpeningTime.str.split('-')
open_time = time_split.str.get(0)
close_time = time_split.str.get(1)

train_contents['Open'] = open_time
train_contents['Close'] = close_time

# Fill missing values with empty strings
train_contents['Open'].fillna('', inplace=True)
train_contents['Close'].fillna('', inplace=True)

print(train_contents['Open'].unique())
print(train_contents['Close'].unique())

# Build new columns flagging morning / afternoon / evening service
# from the opening and closing times
def morning_func(x):
  if x == "":
    return None
  else:
    # strip ':' and a stray 'a' so strings like "7:" or "7a" parse as 7
    x1 = int(x[:2].replace(":", "").replace("a", ""))
    x2 = x[-2:]
    if (x1 >= 7 and x1 <= 10) and x2 in ("AM", "am"):
      return 1
    elif x1 <= 10 and len(x) <= 2:
      return 1
    else:
      return 0

def afternoon_func(x):
  if x == "":
    return None
  else:
    x1 = int(x[:2].replace(":", "").replace("a", ""))
    x2 = x[-2:]
    if x1 <= 1 and x2 == "PM":
      return 1
    elif x1 == 12 and x2 == "PM":
      return 1
    elif x2 in ("AM", "am"):
      return 1
    elif x1 <= 10 and len(x) <= 2:
      return 1
    else:
      return 0

def evening_func(x):
  if x == "":
    return None
  else:
    x1 = int(x[:2].replace(":", "").replace("a", ""))
    x2 = x[-2:]
    # a closing time in the AM means the vendor is open through the evening
    if (x1 >= 6 and x2 == "PM") or x2 in ("AM", "am"):
      return 1
    elif x1 >= 22 and len(x) <= 2:
      return 1
    else:
      return 0

train_contents["morning"] = train_contents["Open"].apply(morning_func)
train_contents["afternoon"] = train_contents["Open"].apply(afternoon_func)
train_contents["evening"] = train_contents["Close"].apply(evening_func)

# train_contents[:2]
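
# A quick sanity check of the three flag functions on a few hand-written
# time strings (illustrative values, not taken from the dataset):
for t in ["7:30AM", "10am", "12:00PM", "11:00PM", ""]:
    print(t, "->", morning_func(t), afternoon_func(t), evening_func(t))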

# Null ratio of vendor_tag_name
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)
print(null/train_contents.shape[0])  # about 1%

# Drop the null rows
train_contents = train_contents[train_contents['vendor_tag_name'].notnull()].reset_index(drop=True)
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)


# Clean vendor_tag_name
# Lowercase everything
train_contents['vendor_tag_name'] = train_contents['vendor_tag_name'].apply(lambda x: x.lower())

# str -> list
train_contents['vendor_tag'] = train_contents['vendor_tag_name'].str.split(',')
train_contents['vendor_tag'].head()


# If 'breakfast' appears in vendor_tag and the morning flag is null,
# fill the morning (and, below, the afternoon) flag with 1
def breakfast1(tag, x2):
  if any('breakfast' in i for i in tag) and np.isnan(x2):
    return 1
  else:
    return x2

train_contents['mor2'] = train_contents.apply(
    lambda x: breakfast1(x['vendor_tag'], x['morning']), axis=1)

# For rows where morning was null but mor2 was filled with 1,
# set afternoon to 1 and evening to 0
mask = train_contents['morning'].isnull() & (train_contents['mor2'] == 1.0)
train_contents.loc[mask, 'afternoon'] = 1
train_contents.loc[mask, 'evening'] = 0

# -------------- Check and drop null values -------------
null = np.count_nonzero(train_contents['mor2'].isnull())
print(null)
print(null/train_contents.shape[0])  # about 0.4%

train_contents = train_contents[train_contents['mor2'].notnull()].reset_index(drop=True)
train_contents.drop(['morning'], axis=1, inplace=True)

train_contents.rename(columns={'mor2':'morning'}, inplace=True)


# ----------------- Rating-based model: collaborative filtering -----------------------

# Extract the variables needed for CF
cus_ven_ratings = df1_train_for_anal[['customer_id', 'vendor_id', 'rating']]
# cus_ven_ratings

# Compute the mean over valid ratings only (excluding missing and zero ratings)
valid_mask = cus_ven_ratings['rating'].notnull() & (cus_ven_ratings['rating'] != 0)
valid_rating_mean = cus_ven_ratings.loc[valid_mask, 'rating'].mean()

# Replace missing and zero ratings with the valid mean
def rating_missing_func(x):
  if pd.isnull(x):
    return valid_rating_mean
  elif x == 0:
    return valid_rating_mean
  else:
    return x

cus_ven_ratings["rating2"] = cus_ven_ratings["rating"].apply(rating_missing_func)
# cus_ven_ratings

# Reorganize the dataframe (rename columns)
cus_ven_ratings = cus_ven_ratings[['customer_id', 'vendor_id', 'rating2']]
cus_ven_ratings.rename(columns={'rating2': 'rating'}, inplace=True)
# cus_ven_ratings

# Collapse individual ratings to the per-(customer, vendor) mean
cus_ven_ratings_mean = cus_ven_ratings.groupby(['customer_id', 'vendor_id']).mean()
# cus_ven_ratings_mean

df_cus_ven_ratings_mean = cus_ven_ratings_mean.reset_index()
# df_cus_ven_ratings_mean

# Build the full (sparse) rating matrix
rating_full_matrix = df_cus_ven_ratings_mean.pivot(index='customer_id', columns='vendor_id', values='rating')
# rating_full_matrix

# Compute the similarity of every customer pair from the full matrix
rating_matrix_dummy = rating_full_matrix.copy().fillna(0)
customer_similarity = cosine_similarity(rating_matrix_dummy, rating_matrix_dummy)
customer_similarity = pd.DataFrame(customer_similarity, index=rating_full_matrix.index, columns=rating_full_matrix.index)
# customer_similarity
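
# As a quick illustration, look up the five customers most similar to one
# customer id ('ZZV76GY' is the id queried later in this post; any id in
# the index works):
some_customer = 'ZZV76GY'
print(customer_similarity[some_customer].drop(some_customer).nlargest(5))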

# Accuracy metric: root mean squared error
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# --- Score a CF model with RMSE over all known (customer, vendor) pairs ---
def knn_score(model, neighbor_size=0):
  id_pairs = zip(df_cus_ven_ratings_mean['customer_id'], df_cus_ven_ratings_mean['vendor_id'])
  y_pred = np.array([model(customer, vendor, neighbor_size) for (customer, vendor) in id_pairs])
  y_true = np.array(df_cus_ven_ratings_mean['rating'])
  return RMSE(y_true, y_pred)


# ------------------------- CF model (with an optional neighbor-size limit) ----------------
def cf_knn(customer_id, vendor_id, neighbor_size=0):
    if vendor_id in rating_full_matrix:
        # Similarity of the given customer to every other customer
        sim_scores = customer_similarity[customer_id].copy()
        # All customers' ratings of the given vendor (restaurant)
        vendor_ratings = rating_full_matrix[vendor_id].copy()
        # Index of customers who did not rate the given vendor
        none_rating_idx = vendor_ratings[vendor_ratings.isnull()].index
        # Drop the (null) ratings of customers who did not rate the given vendor
        vendor_ratings = vendor_ratings.drop(none_rating_idx)
        # Drop the similarities of customers who did not rate the given vendor
        sim_scores = sim_scores.drop(none_rating_idx)

        # Case: no neighbor size specified
        if neighbor_size == 0:
            # Similarity-weighted average of the ratings of customers who rated the vendor
            mean_rating = np.dot(sim_scores, vendor_ratings) / sim_scores.sum()

        # Case: neighbor size specified
        else:
            # Case: the vendor was rated by at least two customers
            if len(sim_scores) > 1:
                # Use the smaller of the requested neighbor size and the number of raters
                neighbor_size = min(neighbor_size, len(sim_scores))
                # Convert to NumPy arrays so argsort can be used
                sim_scores = np.array(sim_scores)
                vendor_ratings = np.array(vendor_ratings)
                # Sort by similarity
                customer_idx = np.argsort(sim_scores)
                # Keep the neighbor_size most similar customers
                sim_scores = sim_scores[customer_idx][-neighbor_size:]
                # ... and their ratings
                vendor_ratings = vendor_ratings[customer_idx][-neighbor_size:]
                # Compute the final prediction
                mean_rating = np.dot(sim_scores, vendor_ratings) / sim_scores.sum()
            else:
                # Otherwise fall back to the valid mean
                mean_rating = valid_rating_mean
    else:
        # Otherwise fall back to the valid mean
        mean_rating = valid_rating_mean
    return mean_rating

knn_score(cf_knn, neighbor_size=20)
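
# Note: knn_score predicts the very ratings that were used to build
# rating_full_matrix, so the figure above is in-sample. A minimal holdout
# sketch (our variable names; it rebuilds the matrices cf_knn reads from the
# training split only, then restores them):
train_r, test_r = train_test_split(df_cus_ven_ratings_mean, test_size=0.25, random_state=0)

def build_matrices(df):
    m = df.pivot(index='customer_id', columns='vendor_id', values='rating')
    s = pd.DataFrame(cosine_similarity(m.fillna(0), m.fillna(0)), index=m.index, columns=m.index)
    return m, s

rating_full_matrix, customer_similarity = build_matrices(train_r)
y_pred = np.array([cf_knn(c, v, 20) if c in rating_full_matrix.index else valid_rating_mean
                   for c, v in zip(test_r['customer_id'], test_r['vendor_id'])])
print('holdout RMSE:', RMSE(np.array(test_r['rating']), y_pred))

# Restore the full-data matrices for the recommendation demo below
rating_full_matrix, customer_similarity = build_matrices(df_cus_ven_ratings_mean)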


# --------- CF recommendations for a customer --------------
# Produce a recommendation list for a given customer via CF
def cf_recom_vendor(customer_id, n_items, neighbor_size=0):
    # Vendors rated by the given customer
    customer_vendor = rating_full_matrix.loc[customer_id].copy()

    for vendor in rating_full_matrix:
        # Exclude vendors the given customer has already rated
        if pd.notnull(customer_vendor.loc[vendor]):
            customer_vendor.loc[vendor] = 0
        # Predict ratings for the vendors the given customer has not rated
        else:
            customer_vendor.loc[vendor] = cf_knn(customer_id, vendor, neighbor_size)

    # Sort vendors by predicted rating and keep the top n_items
    vendor_sort = customer_vendor.sort_values(ascending=False)[:n_items]
    # Look up vendor info by vendor_id
    recom_vendors_temp = df1_train_for_anal[df1_train_for_anal['vendor_id'].isin(vendor_sort.index)]
    recom_vendors_temp2 = recom_vendors_temp[['vendor_id', 'mean_rating', 'vendor_tag_name']].drop_duplicates('vendor_id')
    recom_vendors = recom_vendors_temp2.reset_index(drop=True)
    return recom_vendors


# Example recommendation list
get_recom = cf_recom_vendor(customer_id='ZZV76GY', n_items=5, neighbor_size=30)
print(get_recom)

# --------------------------------------- Rating-based model: deep learning --------------------------------------
# ----- Matrix factorization implemented with deep learning -----
# Extract the variables needed for MF
ratings = cus_ven_ratings

# Collapse individual ratings to the per-(customer, vendor) mean
ratings = ratings.groupby(['customer_id', 'vendor_id']).mean().reset_index()

# Build a temporary preprocessed full matrix
R_temp = ratings.pivot(index='customer_id', columns='vendor_id', values='rating').fillna(0)

# Map customer ids to indices (consecutive integer row names)
customer_id_index = []

for i, one_id in enumerate(R_temp.index):
  customer_id_index.append([one_id, i])

df_customer_id_index = pd.DataFrame(customer_id_index)
df_customer_id_index.rename(columns={0:'customer_id', 1:'customer_idx'}, inplace=True)
# df_customer_id_index

# Map vendor ids to indices (consecutive integer column names)
vendor_id_index = []

for i, one_id in enumerate(R_temp.columns):
    vendor_id_index.append([one_id, i])

df_vendor_id_index = pd.DataFrame(vendor_id_index)
df_vendor_id_index.rename(columns={0: 'vendor_id', 1: 'vendor_idx'}, inplace=True)
# df_vendor_id_index

# Merge the ratings with both index mappings
ratings_with_index = pd.merge(ratings, df_customer_id_index, on='customer_id')
ratings_with_index = pd.merge(ratings_with_index, df_vendor_id_index, on='vendor_id')
ratings = ratings_with_index[['customer_idx', 'vendor_idx', 'rating']].astype(int)

# Build the full (sparse) matrix
rating_full_matrix_by_index_with_nan = ratings.pivot(index='customer_idx', columns='vendor_idx', values='rating')
# rating_full_matrix_by_index_with_nan

# The same matrix with None replaced by 0
rating_full_matrix_by_index = ratings.pivot(index='customer_idx', columns='vendor_idx', values='rating').fillna(0)
# rating_full_matrix_by_index
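
# The pivoted matrix is dense and mostly zeros; for a larger customer base a
# scipy.sparse representation keeps memory proportional to the number of
# observed ratings. A minimal sketch (assumes scipy is available):
from scipy.sparse import csr_matrix

R_sparse = csr_matrix(
    (ratings['rating'], (ratings['customer_idx'], ratings['vendor_idx'])),
    shape=(ratings['customer_idx'].max() + 1, ratings['vendor_idx'].max() + 1))
print(R_sparse.shape, R_sparse.nnz, 'stored ratings')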


# ------------------------------- Build the DNN model ----------------------------------
# Number of latent factors
K = 100
# Global mean rating
mu = ratings.rating.mean()
# Number of customers
M = ratings.customer_idx.unique().shape[0]
# Number of vendors
N = ratings.vendor_idx.unique().shape[0]
# Tolerance used by the accuracy check below
bias_pre = 0.15

# Loss function: RMSE (a TensorFlow version; shadows the NumPy RMSE used above)
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Embeddings handle the high-dimensional, sparse matrix
customer = Input(shape=(1, ))
vendor = Input(shape=(1, ))
P_embedding = Embedding(M, K, embeddings_regularizer=l2())(customer)
Q_embedding = Embedding(N, K, embeddings_regularizer=l2())(vendor)
customer_bias = Embedding(M, 1, embeddings_regularizer=l2())(customer)
vendor_bias = Embedding(N, 1, embeddings_regularizer=l2())(vendor)

# Keras layers
from tensorflow.keras.layers import Dense, Concatenate, Activation
P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
customer_bias = Flatten()(customer_bias)
vendor_bias = Flatten()(vendor_bias)
R = Concatenate()([P_embedding, Q_embedding, customer_bias, vendor_bias])

# Build the network
R = Dense(2048)(R)
R = Activation('linear')(R)
R = Dense(1024)(R)
R = Activation('linear')(R)
R = Dense(512)(R)
R = Activation('linear')(R)
R = Dense(256)(R)
R = Activation('linear')(R)
R = Dense(128)(R)
R = Activation('linear')(R)
R = Dense(64)(R)
R = Activation('linear')(R)
R = Dense(1)(R)
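
# Note: with 'linear' activations throughout, these stacked Dense layers
# compose to a single affine map, so the network is no more expressive than
# one Dense(1) on the concatenated embeddings. A hedged alternative (not
# what this post trains) would use nonlinear hidden layers, e.g.:
# for width in (2048, 1024, 512, 256, 128, 64):
#     R = Dense(width, activation='relu')(R)
# R = Dense(1)(R)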

# Compile the model
model = Model(inputs=[customer, vendor], outputs=R)
model.compile(
  loss=RMSE,
  optimizer=Adamax(),
  metrics=[RMSE])

# Model summary
model.summary()

# Train the model
result = model.fit(
  x=[ratings.customer_idx.values, ratings.vendor_idx.values],
  y=ratings.rating.values - mu,
  epochs=7,
  batch_size=512,
  validation_data=(
    [ratings.customer_idx.values, ratings.vendor_idx.values],
    ratings.rating.values - mu
  )
)
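
# Note: validation_data above is the training data itself, so the val metrics
# track training RMSE rather than generalization. A minimal sketch of a real
# split (train_test_split is already imported above):
tr, va = train_test_split(ratings, test_size=0.2, random_state=0)
result = model.fit(
  x=[tr.customer_idx.values, tr.vendor_idx.values],
  y=tr.rating.values - mu,
  epochs=7,
  batch_size=512,
  validation_data=(
    [va.customer_idx.values, va.vendor_idx.values],
    va.rating.values - mu))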

# Plot the training curve
plt.plot(result.history['RMSE'], label="RMSE")
plt.xlabel('epoch')
plt.ylabel('RMSE')
plt.legend()
plt.show()

# Fraction of predictions within bias_pre of the actual rating
def compute_sim(y_true, y_pred):
    hits = np.abs(np.array(y_true) - np.array(y_pred)) < bias_pre
    return hits.mean()


# Compare actual and predicted ratings
customer_ids = ratings.customer_idx.values[0:6]
vendor_ids = ratings.vendor_idx.values[0:6]
predictions = model.predict([customer_ids, vendor_ids]) + mu
print("Actuals:\n", ratings[0:6])
print("\n")
print("Predictions:\n", predictions)

predictions = np.array(predictions).reshape([-1])
input_ratings = ratings['rating'].values[0:6]
print("accuracy : ", compute_sim(input_ratings, predictions))


# MF-based recommendations for a customer
def recom_vendor(customer_idx, n_items):
    # Vendors rated by the given customer
    customer_vendor = rating_full_matrix_by_index_with_nan.loc[customer_idx].copy()

    for vendor in rating_full_matrix_by_index_with_nan:
        # Exclude vendors the given customer has already rated
        if pd.notnull(customer_vendor.loc[vendor]):
            customer_vendor.loc[vendor] = 0
        # Predict ratings for the vendors the given customer has not rated
        else:
            customer_vendor.loc[vendor] = round(
                min(model.predict([np.array([customer_idx]), np.array([vendor])])[0][0] + mu, 5), 3)

    # Sort vendors by predicted rating and keep the top n_items
    vendor_sort = customer_vendor.sort_values(ascending=False)[:n_items]
    # Turn the Series into a dataframe with vendor_idx as a column so it can be merged on below
    df_vendor_sort = pd.DataFrame(vendor_sort).reset_index()
    df_vendor_sort.rename(columns={customer_idx: 'predicted_rating'}, inplace=True)

    return df_vendor_sort

# Map customer ids back to customer indices
customer_id_idx = ratings_with_index[['customer_id', 'customer_idx']]
customer_id_idx = customer_id_idx.drop_duplicates()

# Map vendor ids back to vendor indices
vendor_id_idx = ratings_with_index[['vendor_id', 'vendor_idx']]
vendor_id_idx = vendor_id_idx.drop_duplicates()

# Prepare a dataframe for looking up vendor tags
mf_df1 = df1_train_for_anal[['vendor_id', 'mean_rating', 'vendor_tag_name']]
mf_df1 = mf_df1.drop_duplicates()

# Produce a recommendation list for a given customer via MF + DL
def mf_dl_recom_vendor_list(customer_id, n_items):
    df_specified_customer = customer_id_idx[customer_id_idx['customer_id'] == customer_id]
    specified_customer_idx = df_specified_customer.iloc[0, 1]
    mf_recom_list_temp = recom_vendor(customer_idx=specified_customer_idx, n_items=n_items)
    mf_recom_list_temp2 = pd.merge(mf_recom_list_temp, vendor_id_idx, how='inner', on='vendor_idx')
    mf_recom_list_temp3 = mf_recom_list_temp2[['vendor_id', 'predicted_rating']]
    mf_recom_list = pd.merge(mf_recom_list_temp3, mf_df1, how='inner', on='vendor_id')
    return mf_recom_list


# Example recommendation list for customer ZZV76GY
mf_list = mf_dl_recom_vendor_list(customer_id='ZZV76GY', n_items=5)
print(mf_list)

# --------------------------------------------- Content-based recommender -----------------------------------------------
# Check for similar words; when the similarity is above 0.8, normalize the word
df1_contents_for_anal = train_contents[:]
df1_contents_for_anal.head()
df1_contents_for_anal['vendor_tag'] = df1_contents_for_anal['vendor_tag_name'].str.split(',')

# Lowercase each tag and strip its spaces
df1_contents_for_anal['vendor_tag'] = df1_contents_for_anal['vendor_tag'].apply(
    lambda x: [str.lower(i.replace(" ", "")) for i in x])

# Check word similarity
def similar(a, b):
    ratio = SequenceMatcher(None, a, b).ratio()
    print("Similarity of {} and {} : {}".format(a, b, ratio))
    return ratio

similar('pasta','pastas')
similar('pasta','pastry')
similar('pizza','pizzas')
similar('soups','shuwa')
similar('shawarma','shuwa')
similar('thali','thai')
similar('milkshakes','mishkak')

# Normalize words whose similarity is above 0.8
df1_contents_for_anal['vendor_tag'] = df1_contents_for_anal['vendor_tag'].apply(lambda x: [i.replace("pastas", "pasta") for i in x])
df1_contents_for_anal['vendor_tag'] = df1_contents_for_anal['vendor_tag'].apply(lambda x: [i.replace("pizzas", "pizza") for i in x])
df1_contents_for_anal['vendor_tag'] = df1_contents_for_anal['vendor_tag'].apply(lambda x: [i.replace("thali", "thai") for i in x])

df1_contents_for_anal['vendor_tag1'] = df1_contents_for_anal['vendor_tag'].apply(lambda x: ' '.join(x))

df1_contents_for_anal.head()
df1_contents_for_anal['vendor_id'].value_counts()


# Keep one row per vendor
prac= df1_contents_for_anal.drop_duplicates("vendor_id", keep="first", inplace=False)
print(prac.shape)

prac['vendor_id'] = prac['vendor_id'].astype(str)
prac1 = prac[:]


# -------------------- TF-IDF model ------------------------------
prac.set_index('vendor_id', inplace=True)
prac.head(2)

vectorizer = TfidfVectorizer()
count_matrix = vectorizer.fit_transform(prac['vendor_tag1'])
print(vectorizer.get_feature_names_out())  # get_feature_names() on sklearn < 1.0
print(vectorizer.vocabulary_)

indices = pd.Series(prac.index)
# indices[:5]

cosine_sim = cosine_similarity(count_matrix,count_matrix)
print(cosine_sim)

prac=prac.reset_index()

def get_recommendations(vendor_id, cosine_sim=cosine_sim):
    indices = pd.Series(prac.index, index=prac['vendor_id']).drop_duplicates()

    # Get the row index for this vendor_id
    idx = indices[vendor_id]

    # Cosine similarity against every vendor
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # The ten most similar vendors (position 0 is the vendor itself)
    sim_scores = sim_scores[1:11]

    return sim_scores

get_r = get_recommendations('113')
print(get_r)
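
# get_recommendations returns (row index, score) pairs; a small hedged helper
# (our naming) to map them back to vendor ids and tags:
def pretty_recommendations(vendor_id):
    pairs = get_recommendations(vendor_id)
    rows = [(prac.loc[i, 'vendor_id'], round(score, 3), prac.loc[i, 'vendor_tag1'])
            for i, score in pairs]
    return pd.DataFrame(rows, columns=['vendor_id', 'similarity', 'tags'])

print(pretty_recommendations('113'))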












# ----------------------------------------------- Ensemble-learning-based recommendation (multi-model comparison) --------------------------------------------------
print("|===========================================================================================================|")
print("------------------------------------------- Ensemble-learning-based recommendation (multi-model comparison) ----------------------------------------------")

# Inspect the columns
print(data_ds.columns.values)
data_ds.info()

# Preview the first rows
data_ds.head()

# Preview the last rows
data_ds.tail()

print('_'*40)

data_ds.describe()
data_ds.describe(include=['O'])
d2 = data_ds[(data_ds['type'] == 'Movie')]
d2.describe(include=['O'])

# Convert the type column to numeric values
title_mapping = {"TV Show": 1, "Movie": 0}
data_ds['type'] = data_ds['type'].map(title_mapping)
data_ds.head()

# Convert duration to numeric data
data_ds['duration'] = data_ds.duration.str.extract('([0-9]+)', expand=False)
data_ds['duration'] = pd.to_numeric(data_ds['duration'])
data_ds.head()

# Convert show_id to numeric data
data_ds['show_id'] = data_ds.show_id.str.extract('([0-9]+)', expand=False)
data_ds['show_id'] = pd.to_numeric(data_ds['show_id'])
data_ds.head()

# Convert the rating column to numeric data
data_ds['rating'] = data_ds['rating'].astype(str)
r = {'TV-MA':1, 'R': 2, 'PG-13':3, 'TV-14':4, 'TV-PG':5, 'NR':6, 'TV-G':7,
     'TV-Y':8, 'TV-Y7':9, 'PG':10, 'G':11, 'NC-17': 12, 'TV-Y7-FV':13, 'UR':14}
data_ds['rating'] = data_ds['rating'].map(r)
data_ds['rating'] = data_ds['rating'].fillna(1)
data_ds['rating'] = data_ds['rating'].astype(int)
data_ds.head()

# Convert date_added to a numeric year
data_ds['date_added'] = pd.to_datetime(data_ds['date_added'], errors="coerce")
data_ds['date_added'] = pd.DatetimeIndex(data_ds['date_added']).year
mode_year = data_ds['date_added'].value_counts().idxmax()  # the most common year (2019)
print(data_ds.date_added.describe())
data_ds['date_added'] = data_ds['date_added'].fillna(mode_year)
data_ds['date_added'] = data_ds['date_added'].astype(int)
data_ds.head()

# Extract the genre from the listed_in column
data_ds['genre'] = data_ds.listed_in.str.extract(r'(Horror|Action & Adventure|Sci-Fi & Fantasy|Romantic|'
                                                 r'Comedies|Dramas|Sports|Thrillers|Classic|Cult|Children & '
                                                 r'Family|Science & Nature|Music)', expand=False)
g = {"Horror": 1, "Action & Adventure": 2, "Sci-Fi & Fantasy": 3,
     "Romantic": 4, "Comedies": 5, "Dramas": 6,
     "Sports": 7, "Thrillers": 8, "Classic": 9, "Cult": 10,
     "Children & Family": 11, "Science & Nature": 12}


# Map the extracted genre to numeric data
data_ds['genre'] = data_ds['genre'].map(g)
data_ds['genre'] = data_ds['genre'].fillna(0)
data_ds['genre'] = data_ds['genre'].astype(int)
pd.crosstab(data_ds['genre'], data_ds['type'])
data_ds[["genre", "type"]].groupby(['genre'], as_index=False).mean().sort_values(by='type', ascending=False)

# Countries grouped by continent (reference lists; the classification below uses regexes)
# Convert country to numeric data
con = data_ds.country.unique()
asia=['Russia', 'China', 'India', 'Kazakhstan','Saudi Arabia', 'Iran',
      'Mongolia', 'Indonesia',  'Pakistan',  'Turkey',  'Myanmar',
      'Afghanistan',  'Yemen',  'Thailand', 'Turkmenistan', 'Uzbekistan',
      'Iraq', 'Japan', 'Vietnam','Malaysia' ,'Oman', 'Philippines','Laos',
      'Kyrgyzstan', 'Nepal','Tajikistan','North Korea','South Korea',
      'Jordan', 'Azerbaijan','Syria', 'Cambodia' ,'Bangladesh',
      'United Arab Emirates','Georgia', 'Sri Lanka', 'Bhutan', 'Taiwan',
      'Armenia', 'Israel', 'Kuwait', 'Timor-Leste', 'Qatar', 'Lebanon',
      'Cyprus', 'Palestine','Brunei','Bahrain','Singapore', 'Maldives']
europe=['Germany','United Kingdom','France','Italy','Spain','Ukraine',
        'Poland','Romania','Netherlands','Belgium','Czech Republic',
        'Greece','Portugal','Sweden','Hungary','Belarus','Austria',
        'Serbia','Switzerland','Bulgaria','Denmark','Finland',
        'Slovakia','Norway','Ireland','Croatia','Moldova','Bosnia',
        'Albania','Lithuania','North Macedonia','Slovenia','Latvia',
        'Estonia','Montenegro','Luxembourg','Malta','Iceland','Andorra',
        'Monaco','Liechtenstein','San Marino','Holy See']
Africa=['Ethiopia', 'Nigeria','Egypt','DR Congo','Tanzania',
        'South Africa','Kenya','Uganda','Algeria','Sudan','Morocco',
        'Angola','Mozambique','Ghana','Madagascar','Cameroon',
        "Côte d'Ivoire",'Niger','Burkina Faso','Mali','Malawi','Zambia',
        'Senegal','Chad','Somalia','Zimbabwe','Guinea','Rwanda',
        'Benin','Burundi','Tunisia','South Sudan','Togo','Sierra Leone',
        'Libya','Congo','Liberia','Central African Republic',
        'Mauritania','Eritrea','Namibia','Gambia','Botswana','Gabon',
        'Lesotho','Guinea-Bissau','Equatorial Guinea','Mauritius',
        'Eswatini','Djibouti','Co','Cabo Verde','Sao Tome','Seychelles']
Australia=['Micronesia', 'Fiji', 'Kiribati', 'Marshall Islands', 'Nauru',
           'New Zealand', 'Palau', 'Papua New Guinea', 'Samoa',
           'Solomon Islands', 'Tonga', 'Tuvalu','Vanuatu']
America=['Anguilla','United Kingdom','Barbuda','Argentina','Aruba',
         'Netherlands','Bahamas','Barbados','Belize','Bermuda',
         'Bolivia','Bonaire','Norway','Brazil','British Virgin Islands',
         'Canada','Cayman Islands','Chile','Clipperton Island','Colombia',
         'Costa Rica','Cuba','Curaçao','Dominica','Dominican Republic',
         'Ecuador','El Salvador','Falkland Islands','French Guiana' ,
         'Greenland','Denmark','Grenada','Guadeloupe','Guatemala',
         'Guyana','Haiti','Honduras','Jamaica','Martinique','Mexico',
         'Montserrat','Navassa Island','United States','Nicaragua',
         'Panama','Paraguay','Peru','Puerto Rico','Saba','Saint Barthélemy',
         'Saint Kitts','Saint Lucia','Saint Martin','Saint Pierre',
         'Saint Vincent','Sint Eustatius','Sint Maarten', 'South Georgia',
         'South Sandwich Islands','Suriname','Trinidad','Tobago','Turks',
         'Caicos Islands','Virgin Islands','United States of America',
         'Uruguay','Venezuela']
data_ds['continenta']=data_ds.country.str.contains(r'(?:Russia|China|India|Kazakhstan|Saudi'
                                                         r' Arabia|Iran|Mongolia|Indonesia|'
                                                         r'Pakistan|Turkey|Myanmar|Afghanistan|Yemen|Thailand|'
                                                         r'Turkmenistan|Uzbekistan|Iraq|Japan|Vietnam|Malaysia|'
                                                         r'Oman|Philippines|Laos|Kyrgyzstan|Nepal|Tajikistan'
                                                         r'|North Korea|South Korea|Jordan|Azerbaijan|Syria|'
                                                         r'Cambodia|Bangladesh|United Arab Emirates|Georgia|'
                                                         r'Sri Lanka|Bhutan|Taiwan|Armenia|Israel|Kuwait|'
                                                         r'Timor-Leste|Qatar|Lebanon|Cyprus|Palestine|Brunei|'
                                                         r'Bahrain|Singapore|Maldives)')
data_ds.loc[data_ds.continenta == True, "continenta"] = "Asia"
data_ds['continente']=data_ds.country.str.contains(r'(?:Germany|United Kingdom|France|Italy|Spain|'
                                                         r'Ukraine|Poland|Romania|Netherlands|Belgium|'
                                                         r'Czech Republic|Greece|Portugal|Sweden|Hungary|'
                                                         r'Belarus|Austria|Serbia|Switzerland|Bulgaria|'
                                                         r'Denmark|Finland|Slovakia|Norway|Ireland|Croatia|'
                                                         r'Moldova|Bosnia|Albania|Lithuania|North Macedonia|'
                                                         r'Slovenia|Latvia|Estonia|Montenegro|Luxembourg|Malta|'
                                                         r'Iceland|Andorra|Monaco|Liechtenstein|San Marino|Holy See)')
data_ds.loc[data_ds.continente == True, "continente"] = "Europe"
data_ds['continentaf']=data_ds.country.str.contains(r'(?:Ethiopia|Nigeria|Egypt|DR Congo|Tanzania|'
                                                          r'South Africa|Kenya|Uganda|Algeria|Sudan|Morocco|'
                                                          r"Angola|Mozambique|Ghana|Madagascar|Cameroon|Côte d'Ivoire|"
                                                          r'Niger|Burkina Faso|Mali|Malawi|Zambia|Senegal|Chad|'
                                                          r'Somalia|Zimbabwe|Guinea|Rwanda|Benin|Burundi|Tunisia|'
                                                          r'South Sudan|Togo|Sierra Leone|Libya|Congo|Liberia|'
                                                          r'Central African Republic|Mauritania|Eritrea|Namibia|'
                                                          r'Gambia|Botswana|Gabon|Lesotho|Guinea-Bissau|'
                                                          r'Equatorial Guinea|Mauritius|Eswatini|Djibouti|'
                                                          r'Co|Cabo Verde|Sao Tome|Seychelles)')
data_ds.loc[data_ds.continentaf == True, "continentaf"] = "Africa"
data_ds['continentau']=data_ds.country.str.contains(r'(?:Micronesia|Fiji|Kiribati|Marshall Islands|Nauru|'
                                                          r'New Zealand|Palau|Papua New Guinea|Samoa|'
                                                          r'Solomon Islands|Tonga|Tuvalu|Vanuatu)')
data_ds.loc[data_ds.continentau == True, "continentau"] = "Australia"
data_ds['continentam']=data_ds.country.str.contains(r'(?:Anguilla|United Kingdom|Barbuda|Argentina|'
                                                          r'Aruba|Netherlands|Bahamas|Barbados|Belize|'
                                                          r'Bermuda|Bolivia|Bonaire|Norway|Brazil|'
                                                          r'British Virgin Islands|Canada|Cayman Islands|Chile|'
                                                          r'Clipperton Island|Colombia|Costa Rica|Cuba|Curaçao|'
                                                          r'Dominica|Dominican Republic|Ecuador|El Salvador|'
                                                          r'Falkland Islands|French Guiana|Greenland|Denmark|'
                                                          r'Grenada|Guadeloupe|Guatemala|Guyana|Haiti|Honduras|'
                                                          r'Jamaica|Martinique|Mexico|Montserrat|Navassa Island|'
                                                          r'United States|Nicaragua|Panama|Paraguay|Peru|Puerto Rico|'
                                                          r'Saba|Saint Barthélemy|Saint Kitts|Saint Lucia|'
                                                          r'Saint Martin|Saint Pierre|Saint Vincent|Sint Eustatius|'
                                                          r'Sint Maarten|South Georgia|South Sandwich Islands|'
                                                          r'Suriname|Trinidad|Tobago|Turks|Caicos Islands|'
                                                          r'Virgin Islands|United States of America|Uruguay|Venezuela)')
data_ds.loc[data_ds.continentam == True, "continentam"] = "America"



# Convert the continent columns to numeric data
contin = {'Asia':1, 'Europe': 2, 'Africa':3, 'Australia':4, 'America':5}
data_ds['continenta'] = data_ds['continenta'].map(contin)
data_ds['continente'] = data_ds['continente'].map(contin)
data_ds['continentaf'] = data_ds['continentaf'].map(contin)
data_ds['continentau'] = data_ds['continentau'].map(contin)
data_ds['continentam'] = data_ds['continentam'].map(contin)
data_ds['continenta'] = data_ds['continenta'].fillna(0)
data_ds['continente'] = data_ds['continente'].fillna(0)
data_ds['continentaf'] = data_ds['continentaf'].fillna(0)
data_ds['continentau'] = data_ds['continentau'].fillna(0)
data_ds['continentam'] = data_ds['continentam'].fillna(0)
# Sum the five one-continent columns into a single continent code
# (parentheses keep the sum on one logical line)
data_ds['continent'] = (data_ds['continenta'] + data_ds['continente']
                        + data_ds['continentaf'] + data_ds['continentau'] + data_ds['continentam'])
data_ds['continent'] = data_ds['continent'].astype(int)
data_ds.loc[(data_ds.continent > 5), 'continent'] = 5
data_ds = data_ds.drop(['continenta', 'continente', 'continentaf', 'continentau', 'continentam'], axis=1)
data_ds.head(10)

# date_added
data_ds[['date_added', 'type']].groupby(['date_added'], as_index=False).mean().sort_values(by='type', ascending=False)

# Release year
data_ds[['release_year', 'type']].groupby(['release_year'], as_index=False).mean().sort_values(by='type', ascending=False)

# Continent
data_ds[['continent', 'type']].groupby(['continent'], as_index=False).mean().sort_values(by='type', ascending=False)

# Rating
data_ds[['rating', 'type']].groupby(['rating'], as_index=False).mean().sort_values(by='type', ascending=False)

# Genre
data_ds[['genre', 'type']].groupby(['genre'], as_index=False).mean().sort_values(by='type', ascending=False)

# Duration
data_ds[['duration', 'type']].groupby(['duration'], as_index=False).mean().sort_values(by='type', ascending=False)

# Numeric feature correlation: type vs. date_added
h2 = sns.FacetGrid(data_ds, col='type')
h2.map(plt.hist, 'date_added', bins=5)

# Numeric feature correlation: type vs. release year and duration
h2 = sns.FacetGrid(data_ds, col='type')
h2.map(plt.hist, 'release_year', bins=5)
h2 = sns.FacetGrid(data_ds, col='type')
h2.map(plt.hist, 'duration', bins=5)

# Numeric feature correlation: type vs. rating and genre
h2 = sns.FacetGrid(data_ds, col='type')
h2.map(plt.hist, 'rating', bins=5)
h2 = sns.FacetGrid(data_ds, col='type')
h2.map(plt.hist, 'genre', bins=5)

# Drop unneeded columns
print("Before", data_ds.shape)
data_ds = data_ds.drop(['title', 'country', 'cast', 'director', 'listed_in', 'description'], axis=1)
print("After", data_ds.shape)

# Preview after the full conversion to numeric data
data_ds.head()

# Split the data into training and test sets
feature_cols = ['continent', 'date_added','release_year', 'rating','duration','genre']
X = data_ds[feature_cols]  # features
y = data_ds.type  # target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)  # 75/25 split
print(X_train.shape, y_train.shape, X_test.shape)

# Logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
print("acc_logistic = ", acc_log)

# Feature coefficients of the logistic regression
coeff_df = pd.DataFrame({'Feature': feature_cols, 'Correlation': logreg.coef_[0]})
coeff_df.sort_values(by='Correlation', ascending=True)

# Support vector machine (SVC; RBF kernel by default)
svc = SVC()
svc.fit(X_train, y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, y_train) * 100, 2)
print("acc_svc = ", acc_svc)

# K-nearest neighbors
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
print("acc_knn = ", acc_knn)

# Gaussian naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
print("acc_gaussian = ", acc_gaussian)

# Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, y_train) * 100, 2)
print("acc_perceptron = ", acc_perceptron)

# Linear SVM (LinearSVC)
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, y_train) * 100, 2)
print("acc_linear_svc = ", acc_linear_svc)

# Stochastic gradient descent
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, y_train) * 100, 2)
print("acc_sgd = ", acc_sgd)

# Decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print("acc_decision_tree = ", acc_decision_tree)

# Random forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print("acc_random_forest = ", acc_random_forest)

# Compare the models
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)
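
# The scores above are training-set accuracies, which favor models that
# overfit (an unpruned decision tree or random forest can approach 100%).
# A minimal sketch that re-scores each fitted model on the held-out test
# split instead:
from sklearn.metrics import accuracy_score

fitted = {'SVC': svc, 'KNN': knn, 'LogReg': logreg, 'RandomForest': random_forest,
          'GaussianNB': gaussian, 'Perceptron': perceptron, 'SGD': sgd,
          'LinearSVC': linear_svc, 'DecisionTree': decision_tree}
test_scores = {name: round(accuracy_score(y_test, m.predict(X_test)) * 100, 2)
               for name, m in fitted.items()}
print(pd.Series(test_scores).sort_values(ascending=False))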

# Build the submission
submission = pd.DataFrame({"type": Y_pred})
submission.sort_values(by='type', ascending=False)
print("submission:\n", submission)
Original post: https://www.cnblogs.com/niubidexiebiao/p/14788092.html