DataFountain: User Purchase Prediction in a Consumer Finance Scenario

Competition link: https://www.datafountain.cn/competitions/287/details

Task: using 招商银行 (China Merchants Bank) customers' personal attributes, credit-card consumption data, and one month of click logs from some of these customers on the 掌上生活 app, design a feature-engineering and modeling pipeline to predict whether a customer will purchase a coupon (meal vouchers, movie tickets, etc.) on the app in the following week (April 1-7). To protect customer privacy, the personal-attribute and credit-card consumption data are anonymized and standardized into numeric attributes V1, V2, ..., V30, and some fields in the app behavior logs are encrypted as well.

Time was tight towards the end of the competition and my ranking kept slipping; I finished 75th on the B leaderboard. This post records the process and some lessons learned along the way.

Competition data overview

The competition provides a training set and a test set; the data consists of the following parts:

(1) Personal attributes and credit-card consumption data: personal attributes and credit-card consumption data for 80,000 credit-card customers, containing both categorical and numeric features, all converted to numeric values and then anonymized and standardized.

(2) App behavior logs: for the subset of the above customers who have bound the 掌上生活 app, all click logs within roughly a one-month window.

(3) Labels: customer IDs plus a binary label indicating whether the customer will purchase a coupon on the 掌上生活 app in the following week.

Evaluation metric: AUC
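For concreteness, the sketch below shows what loading the raw files and scoring with AUC looks like. The file paths and tab separator are the ones used in the code later in this post; the prediction passed to roc_auc_score is just a random placeholder to illustrate the metric, not an actual model output.

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

train_agg = pd.read_csv('../data/train/train_agg.csv', sep='\t')   # USRID + anonymized features V1..V30
train_log = pd.read_csv('../data/train/train_log.csv', sep='\t')   # click logs: USRID, EVT_LBL, OCC_TIM, ...
train_flg = pd.read_csv('../data/train/train_flg.csv', sep='\t')   # labels: USRID, FLAG (0/1)
print(train_agg.shape, train_log.shape, train_flg.shape)

# submissions are ranked by the AUC of the predicted purchase probability
print(roc_auc_score(train_flg['FLAG'], np.random.rand(len(train_flg))))   # ~0.5 for a random baseline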

(1) Exploratory Data Analysis (EDA)

I did not spend much effort on this part: load the data, check missing values and dtypes, the usual routine, and run pandas_profiling.ProfileReport on each table to get an overview of everything at once, which is very convenient. After that comes merging the individual tables. Since I worked in a Jupyter Notebook, this code is rather scattered; the full code is on GitHub.
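The actual notebook is on GitHub; as a rough sketch of the checks mentioned above (assuming the three tables loaded in the earlier snippet), it looked roughly like this:

import missingno as msno
import pandas_profiling

# one report per table: dtypes, missing values, distributions, correlations
report = pandas_profiling.ProfileReport(train_agg)
report   # rendered inline when run in a Jupyter notebook

msno.matrix(train_log)   # visual overview of missing values

# merge everything onto USRID; only part of the users appear in the log table
data = train_flg.merge(train_agg, on='USRID', how='left')
log_cnt = train_log.groupby('USRID').size().rename('log_cnt').reset_index()
data = data.merge(log_cnt, on='USRID', how='left')
print(data['log_cnt'].isnull().mean())   # share of users with no app activity at all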

(2) Feature Engineering

This part covers the processing of each table, in particular mining the operation behavior logs in the log table, including some sliding-time-window operations.

import time
from datetime import datetime
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler  
import pandas as pd
import numpy as np
import pickle   # for serialization
import os
import missingno as msno
import pandas_profiling
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import math

if not os.path.exists('tmp'):
    os.mkdir('tmp')

# log-table features (app behavior logs)
train_log_path = '../data/train/train_log.csv'
test_log_path = '../data/test/test_log.csv'

def gen_log_feat():
    dump_path = './tmp/log_feat.pkl'
    if os.path.exists(dump_path):
        log_copy = pickle.load(open(dump_path,'rb'))
    else:
        train_log = pd.read_csv(train_log_path,sep='\t')
        test_log = pd.read_csv(test_log_path,sep='\t')
        log = pd.concat([train_log,test_log],copy=False)
        EVT_LBL = log[['USRID','EVT_LBL']]
        getdummies_EVT_LBL = pd.get_dummies(EVT_LBL)
        getdummies_EVT_LBL1 = getdummies_EVT_LBL.copy()
        L1 = getdummies_EVT_LBL1.groupby('USRID',as_index=False).sum()
        
        # USRID count 7-9
#         EVT_LBL2 = EVT_LBL.copy()
#         USRID_count = EVT_LBL2.groupby(['USRID'],as_index=False)['USRID'].agg({'cnt':'count'})

#         log['EVT_LBL_0'] = log['EVT_LBL'].apply(lambda x: x.split('-')[0])
#         log['EVT_LBL_1'] = log['EVT_LBL'].apply(lambda x: x.split('-')[1])
#         log['EVT_LBL_2'] = log['EVT_LBL'].apply(lambda x: x.split('-')[2])
#         del log['EVT_LBL']
        # convert timestamps to seconds and compute the gap to the user's next click
        log['OCC_TIM'] = log['OCC_TIM'].apply(lambda x:time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S")))
        log = log.sort_values(['USRID','OCC_TIM'])
        log['next_time'] = log.groupby(['USRID'])['OCC_TIM'].diff(-1).apply(np.abs)
        statistic_log = log.copy()
        log_copy = log.copy()
        stat_feat = ['min','mean','max','std','median']
        statistic_log = statistic_log.groupby('USRID')['next_time'].agg(stat_feat).reset_index()
        statistic_log.columns = ['USRID'] + ['next_time_' + col for col in stat_feat]
        log_copy = pd.merge(log_copy,statistic_log,how='outer',on='USRID')
        log_copy = log_copy.groupby(['USRID'],as_index=False).mean()
        log_copy = pd.merge(log_copy,L1,how='left',on='USRID')
#         log_copy = pd.merge(log_copy,USRID_count,how='left',on='USRID')
        pickle.dump(log_copy,open(dump_path,'wb'))
    return log_copy

log_copy = gen_log_feat()

# agg-table features (personal attributes and credit-card consumption data)
train_agg_path = '../data/train/train_agg.csv'
test_agg_path = '../data/test/test_agg.csv'

def get_stat_feat(df,values,action,days1,days2):  # days1: window start day, days2: window end day
    df = df[df['day'] > days1]
    df = df[df['day'] <= days2]
    stat_feat = ['min','mean','max','median','count','sum','std','var']
    df = df.groupby('USRID')[values].agg(stat_feat).reset_index()   # USRID is not unique before this aggregation
    df.columns = ['USRID'] + ['%s_%s_%s_' % (values,action,days2) + col for col in stat_feat]  # e.g. V28_agg_3_min, V28_agg_3_max
    return df

def gen_filter_agg_feat():
    dump_path = './tmp/filter_agg_feat.pkl'
    if os.path.exists(dump_path):
        filter_agg = pickle.load(open(dump_path,'rb'))
    else:
        train_agg = pd.read_csv(train_agg_path,sep='\t')
        test_agg = pd.read_csv(test_agg_path,sep='\t')
        agg = pd.concat([train_agg,test_agg],copy=False)
        
        # handle skewed features (added 7-9)
#         agg_columns = agg.columns
#         skewed_feats = agg[agg_columns].apply(lambda x: x.skew())
#         skewed_feats = skewed_feats[skewed_feats > 10 ]
#         skewed_feats = skewed_feats.index
#         agg[skewed_feats] = np.log1p(agg[skewed_feats])
        
        aggV28 = agg[['USRID','V28']]
        aggV25 = agg[['USRID','V25']]
        aggV20 = agg[['USRID','V20']]
#         aggV19 = agg[['USRID','V19']]
#         aggV18 = agg[['USRID','V18']]
        
        train_log = pd.read_csv(train_log_path,sep='\t')
        test_log = pd.read_csv(test_log_path,sep='\t')
        log = pd.concat([train_log,test_log],copy=False)
        
        log2 = log.copy()
        log2['day'] = log2['OCC_TIM'].map(lambda x:int(x.split('-')[2].split(' ')[0]))
        log1 = log2[['USRID','day']]
        aggV28_day = pd.merge(log1,aggV28,on=['USRID'],how='left',copy=False)
        aggV25_day = pd.merge(log1,aggV25,on=['USRID'],how='left',copy=False)
        aggV20_day = pd.merge(log1,aggV20,on=['USRID'],how='left',copy=False)
#         aggV19_day = pd.merge(log1,aggV19,on=['USRID'],how='left',copy=False)
#         aggV18_day = pd.merge(log1,aggV18,on=['USRID'],how='left',copy=False)
        USRID = aggV28_day['USRID'].unique()
        exclu1 = [1]*len(USRID)
        exclu2 = [1]*len(USRID)
        exclu3 = [1]*len(USRID)
#         exclu4 = [1]*len(USRID)
#         exclu5 = [1]*len(USRID)
        
        days_df1 = pd.DataFrame({'USRID':USRID,'exclu1':exclu1})
        days_df2 = pd.DataFrame({'USRID':USRID,'exclu2':exclu2})
        days_df3 = pd.DataFrame({'USRID':USRID,'exclu3':exclu3})
#         days_df4 = pd.DataFrame({'USRID':USRID,'exclu4':exclu4})
#         days_df5 = pd.DataFrame({'USRID':USRID,'exclu5':exclu5})
        
        day_list = [0,3,7,14,21,28,31]
        for i in range(len(day_list)-1):
            days1 = day_list[i]
            days2 = day_list[i+1]
            df_V28 = aggV28_day.copy()
            df_V25 = aggV25_day.copy()
            df_V20 = aggV20_day.copy()
#             df_V19 = aggV19_day.copy()
#             df_V18 = aggV18_day.copy()
#             VS = ['V28','V25']
#             for Vi in VS:
            day_dfV28 = get_stat_feat(df_V28,'V28','agg',days1,days2)
            day_dfV25 = get_stat_feat(df_V25,'V25','agg',days1,days2)
            day_dfV20 = get_stat_feat(df_V20,'V20','agg',days1,days2)
#             day_dfV19 = get_stat_feat(df_V19,'V19','agg',days1,days2)
#             day_dfV18 = get_stat_feat(df_V18,'V18','agg',days1,days2)

            days_df1 = pd.merge(days_df1,day_dfV28,how='left',on='USRID')
            days_df2 = pd.merge(days_df2,day_dfV25,how='left',on='USRID')
            days_df3 = pd.merge(days_df3,day_dfV20,how='left',on='USRID')
#             days_df4 = pd.merge(days_df4,day_dfV19,how='left',on='USRID')
#             days_df5 = pd.merge(days_df5,day_dfV18,how='left',on='USRID')
        days_df1 = days_df1.fillna(0.)
        days_df2 = days_df2.fillna(0.)
        days_df3 = days_df3.fillna(0.)
#         days_df4 = days_df4.fillna(0.)
#         days_df5 = days_df5.fillna(0.)
        del days_df1['exclu1']
        del days_df2['exclu2']
        del days_df3['exclu3']
#         del days_df4['exclu4']
#         del days_df5['exclu5']
        filter_agg1 = pd.merge(agg,days_df1,how='left',on='USRID')
        filter_agg2 = pd.merge(filter_agg1,days_df2,how='left',on='USRID')
#         filter_agg3 = pd.merge(filter_agg2,days_df3,how='left',on='USRID')
#         filter_agg4 = pd.merge(filter_agg3,days_df4,how='left',on='USRID')
        filter_agg = pd.merge(filter_agg2,days_df3,how='left',on='USRID')
        
#         agg_V3 = agg[['USRID','V3']]
#         agg_V3["VV3"]=agg_V3["V3"].astype(str).astype("str")
#         getdummies_agg_V3 = pd.get_dummies(agg_V3)
#         filter_agg = pd.merge(filter_agg,getdummies_agg_V3,how='left',on='USRID')
        #del filter_agg['V3']
        
    
        filter_agg = filter_agg.fillna(0.)
        pickle.dump(filter_agg,open(dump_path,'wb'))
    return filter_agg

# V20,V25,V28

filter_agg = gen_filter_agg_feat()
print("sucessful!!!")

# flg-table (label) processing
train_flg_path = '../data/train/train_flg.csv'
test_flg_path = '../data/submit_sample.csv'

def gen_flg():
    dump_path = './tmp/flg.pkl'
    if os.path.exists(dump_path):
        flg = pickle.load(open(dump_path,'rb'))
    else:
        train_flg = pd.read_csv(train_flg_path,sep='\t')
        test_flg = pd.read_csv(test_flg_path,sep='\t')
        test_flg['FLAG']=-1
        del test_flg['RST']
        flg = pd.concat([train_flg,test_flg],copy=False)
        pickle.dump(flg,open(dump_path,'wb'))
    return flg

flg = gen_flg()

# merge all tables
def make_data():
    dump_path = './tmp/data.pkl'
    if os.path.exists(dump_path):
        data = pickle.load(open(dump_path,'rb'))
    else:
        log_copy = gen_log_feat()
        filter_agg = gen_filter_agg_feat()
        flg = gen_flg()
        data = pd.merge(filter_agg,flg,how='left',on='USRID')
        data = pd.merge(data,log_copy,how='left',on='USRID')
        pickle.dump(data,open(dump_path,'wb'))
        
    return data

data = make_data()
#train = data[data['FLAG']!=-1]
#test = data[data['FLAG']==-1]
#test = test.drop(['FLAG'],axis=1)
#labels = train.pop('FLAG')
#labels = labels[:len(train)]
#target = np.zeros([len(labels), len(np.unique(labels))])
#target[:, 0] = labels == 0
#target[:, 1] = labels == 1

(3) Building the XGBoost Model

import pickle   # for serialization
import os
import time
import math
import numpy as np
import missingno as msno
import pandas_profiling
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

data_path = './tmp/data.pkl'

data = pickle.load(open(data_path,'rb'))
data = data.fillna(0.)
train = data[data['FLAG']!=-1]
test = data[data['FLAG']==-1]
y = train.pop('FLAG')
col = train.columns
X = train[col].values
test = test.drop(['FLAG'],axis=1)

folds = KFold(n_splits=6,shuffle=True,random_state=546799)
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
print("oof_preds.shape:",oof_preds.shape)
print("sub_preds.shape:",sub_preds.shape)

ignore_features = ['USRID']
features=[f for f in train.columns if f not in ignore_features]
for n_fold,(trn_idx,val_idx) in enumerate(folds.split(train)):
    print("trn_idx:",trn_idx)
    trn_x,trn_y = train[features].iloc[trn_idx],y.iloc[trn_idx]
    val_x,val_y = train[features].iloc[val_idx],y.iloc[val_idx]
    clf = XGBClassifier(
        objective = 'binary:logistic',
        booster = "gbtree",
        eval_metric = 'auc',
#         nthread = 8,    # leave unset to use all CPU cores; xgboost detects them automatically
        eta = 0.025,      # alias of learning_rate
        gamma = 0,        # minimum loss reduction to make a split; larger is more conservative, typically 0.1-0.2
        # lambda = 2,     # L2 regularization on leaf weights; larger values make the model less prone to overfitting
        max_depth = 6,
        subsample = 0.8,            # row subsampling ratio for training instances (originally 0.8)
        colsample_bytree = 0.632,   # column subsampling ratio per tree (originally 0.632)
        colsample_bylevel = 0.8,
        min_child_weight = 19,  # originally 19; minimum sum of instance hessian (h) in a leaf.
        # For imbalanced 0-1 classification with h around 0.01, min_child_weight = 1 means roughly
        # 100 samples per leaf; the smaller this value, the easier the model overfits.
        alpha = 0,
        #random_state = 42,
#         reg_alpha=100,
#         nrounds = 8000,   # not a parameter of the sklearn API; the number of trees is set via n_estimators
        scale_pos_weight = 1,
        seed = 4396,  # 2018, 4396
        n_estimators = 1000,
        learning_rate = 0.1
        # silent = 0,   # set to 1 to suppress training output
    )
    clf.fit(trn_x,trn_y,eval_set = [(trn_x,trn_y),(val_x,val_y)],verbose=10,early_stopping_rounds=30)
    oof_preds[val_idx] = clf.predict_proba(val_x)[:,1]
    sub_preds+=clf.predict_proba(test[features])[:,1] / folds.n_splits
    print('Fold %2d AUC: %.6f' % (n_fold + 1,roc_auc_score(val_y,oof_preds[val_idx])))
    del clf,trn_x,trn_y,val_x,val_y
print('Full AUC score %.6f' % roc_auc_score(y,oof_preds))

test['RST'] = sub_preds
time_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
test[['USRID','RST']].to_csv('../submit/%s_%s.csv'%(str(time_date),str(roc_auc_score(y,oof_preds)).split('.')[1]),index=False,sep='\t')

#test[['USRID','RST']].to_csv("F:/Jupyter_Notebook_dir/DataFountain_JN/submit/submission_03.csv",index = False,float_format = '%.8f',sep='	')

(4) Grid Search for Hyperparameter Tuning

import os
import gc
import time
import math
import pickle
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
from pandas.core.frame import DataFrame
import seaborn as sns
import missingno as msno
import pandas_profiling
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV

warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

data_path = 'G:/xjl_docunment/DataFountain/code/tmp/data.pkl'
data = pickle.load(open(data_path,'rb'))
train = data[data['FLAG']!=-1]
test = data[data['FLAG']==-1]
# y = train.pop('FLAG')
col = train.columns
X = train[col].values
test = test.drop(['FLAG'],axis=1)
print('Successful')

def modelfit(alg,dtrain,predictors,useTrainCV=True,cv_folds=5,early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgbtrain = xgb.DMatrix(dtrain[predictors].values,label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param,xgbtrain,num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,metrics='auc',early_stopping_rounds=early_stopping_rounds,
                          show_stdv=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors],dtrain['FLAG'],eval_metric='auc')
    
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Print model report:
    print("
Model Report") 
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['FLAG'].values, dtrain_predictions)) 
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['FLAG'], dtrain_predprob)) 
    
    print('successful')

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
        
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
target = 'FLAG'
IDcol = 'USRID'
#Choose all predictors except target & IDcols
predictors = [x for x in train.columns if x not in [target,IDcol]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
#  nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, train, predictors)
print('Successful')

1) Tuning max_depth and min_child_weight: tune these two first, because they have a big impact on the final result. Do a coarse search over a wide range first, then fine-tune within a narrower one.

param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,20,3)
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=5,
                                               min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,
                                               objective='binary:logistic',scale_pos_weight=1,seed=27),
                       param_grid=param_test1,scoring='roc_auc',n_jobs=8,iid=False,cv=5)
gsearch1.fit(train[predictors],train['FLAG'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

 

param_test11 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,20,3)
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=140,max_depth=5,
                                               min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,
                                               objective='binary:logistic',scale_pos_weight=1,seed=27),
                       param_grid=param_test11,scoring='roc_auc',n_jobs=8,iid=False,cv=5)
gsearch1.fit(train[predictors],train['FLAG'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

  

params_test2 = {
    'max_depth':[3,5,6],
    'min_child_weight':[1,5,6]
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=5,
                                              min_child_weight=2,gamma=0,subsample=0.8,
                                               colsample_bytree=0.632,objective = 'binary:logistic',
                                               scale_pos_weight=1,seed=27),param_grid = params_test2,
                        scoring='roc_auc',iid=False, cv=5)

gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_,gsearch2.best_score_

  

param_test2b = {
 'min_child_weight':[1,5,8,10,12,13,19,21]
 }
gsearch2b = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.632, objective= 'binary:logistic',
                                                   scale_pos_weight=1,seed=27),
                         param_grid = param_test2b, scoring='roc_auc',n_jobs=16,iid=False, cv=5)

gsearch2b.fit(train[predictors],train[target])
modelfit(gsearch2b.best_estimator_, train, predictors)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_

2) Tuning gamma: with the other parameters already tuned, we can tune gamma; its reasonable range of values can be quite wide.

param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=3,
                                                  min_child_weight=19, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective= 'binary:logistic', 
                                                  scale_pos_weight=1,seed=27), param_grid = param_test3, 
                        scoring='roc_auc',iid=False, cv=5)

gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

3) Tuning subsample and colsample_bytree: do this in two stages, starting both parameters from the candidate values 0.6, 0.7, 0.8 and 0.9.

 

param_test4 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=3,
                                                  min_child_weight=19, gamma=0.2, subsample=0.8, 
                                                  colsample_bytree=0.8, objective= 'binary:logistic', 
                                                  scale_pos_weight=1,seed=27), 
                        param_grid = param_test4, scoring='roc_auc',n_jobs=16,iid=False, cv=5)

gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

  

param_test5 = {
    'subsample':[i/100.0 for i in range(75,90,5)],
    'colsample_bytree':[i/100.0 for i in range(65,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, 
                                                  max_depth=3, min_child_weight=19, gamma=0, 
                                                  subsample=0.9, colsample_bytree=0.7, 
                                                  objective= 'binary:logistic', scale_pos_weight=1,
                                                  seed=27), 
                        param_grid = param_test5, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

4) Tuning the regularization parameters: apply regularization to further reduce overfitting, even though gamma already provides a more effective way of controlling it.

param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=4, 
                                                  min_child_weight=19, gamma=0.2, subsample=0.85, 
                                                  colsample_bytree=0.65, objective= 'binary:logistic', 
                                                  scale_pos_weight=1,seed=27), 
                        param_grid = param_test6, scoring='roc_auc',n_jobs=16,iid=False, cv=5)

gsearch6.fit(train[predictors],train[target])
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

  

param_test7 = {
    'reg_alpha':[100,150,200,500]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=4, 
                                                  min_child_weight=19, gamma=0.2, subsample=0.85, 
                                                  colsample_bytree=0.65, objective= 'binary:logistic', 
                                                  scale_pos_weight=1,seed=27), 
                        param_grid = param_test7, scoring='roc_auc',n_jobs=16,iid=False, cv=5)

gsearch7.fit(train[predictors],train[target])
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

5) Lowering the learning rate: finally, use a lower learning rate together with more trees; XGBoost's cv function (via modelfit above) can be used for this step.

 

xgb4 = XGBClassifier(learning_rate =0.01,n_estimators=1000,max_depth=4,min_child_weight=19,gamma=0.2,subsample=0.85,colsample_bytree=0.65,
                     objective= 'binary:logistic',reg_alpha=100,nthread=16,scale_pos_weight=1,seed=27)
modelfit(xgb4, train, predictors)

Honestly, grid search did not improve the score much here, nowhere near as much as feature engineering did, but I kept it in anyway.

(5) Model Ensembling

Here I use stacking to blend RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier and SVC.
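The original post does not show the imports and constants that the stacking code below relies on (the base-model imports plus SEED, NFOLDS, ntrain, ntest and kf). The snippet below is a minimal sketch of that setup, assuming the train/test DataFrames and the label series y from the previous sections, 5 folds and a seed of 0; kf is built as a plain list of index pairs so the loop inside get_oof can iterate over it directly.

import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold

SEED = 0       # random seed passed to every base model
NFOLDS = 5     # number of stacking folds
ntrain = train.shape[0]
ntest = test.shape[0]
# pre-computed list of (train_index, test_index) pairs for the out-of-fold loop below
kf = list(KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(np.arange(ntrain)))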

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self,clf,seed=0,params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self,x_train,y_train):
        self.clf.fit(x_train,y_train)
    
    def predict(self,x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

# Class to extend XGboost classifer

# generate out-of-fold (OOF) predictions for each base classifier via NFOLDS-fold cross-validation
def get_oof(clf,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))   # out-of-fold predictions on the training set
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_index,test_index) in enumerate(kf):  # kf is a list of (train_index, test_index) pairs
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        clf.train(x_tr,y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i,:] = clf.predict(x_test)  # this fold's model's predictions on the test set
        
    oof_test[:]=oof_test_skf.mean(axis=0)   # average the per-fold predictions on the test set
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)   # one stacked feature column each for train and test

# Random Forest parameters
rf_params = {
    'n_jobs':-1,
    'n_estimators':500,
    'warm_start':True,     # True: keep existing trees and add more on refit; False (default): retrain from scratch
    'max_depth':6,
    'min_samples_leaf':2,
    'max_features':'sqrt',
    'verbose':0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }
    
# Create 5 objects that represent our 5 base models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# Create numpy arrays of the train, test and target data to feed into our models
y_train = y.ravel()
# train = train.drop(['FLAG'], axis=1)
x_train = train.values  # convert the train DataFrame to a numpy array
x_test = test.values    # .values (or .to_numpy()) turns a DataFrame/Series into a plain numpy array
# train.as_matrix() also used to work, but it is deprecated

# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

gbm = xgb.XGBClassifier(
#learning_rate = 0.02,
n_estimators= 2000,
max_depth= 4,
min_child_weight= 2,
#gamma=1,
gamma=0.9,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread= -1,
scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict_proba(x_test)[:,1]
print("sucessful")

USRID = test['USRID']

import time
test['RST'] = predictions
time_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
test[['USRID','RST']].to_csv('../submit/StackingSubmission.csv',index=False,sep='\t')

  

Original post: https://www.cnblogs.com/xiaodongsuibi/p/9558634.html