XGBoost使用教程（进阶篇）三

一、Importing all the libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
二、Reading the file

还是蘑菇数据集，直接采用Kaggle竞赛中22维特征 https://www.kaggle.com/uciml/mushroom-classification

数据集下载地址：http://download.csdn.net/download/u011630575/10266626

# path to where the data lies
dpath = './data/'
data = pd.read_csv(dpath+"mushrooms.csv")
data.head(6)
三、Let us check if there is any null values
data.isnull().sum() #检查数据有没有空值
四、check if we have two claasification. Either the mushroom is poisonous or edible
data['class'].unique() #检查是否只有蘑菇的种类，有毒，可使用
print(data.dtypes)
五、check if 22 features(1st one is label) and 8124 instances
data.shape #22个特征 8124个样例第一个是标签
六、The dataset has values in strings. We need to convert all the unique values to integers. Thus we perform label encoding on the data 标准化标签

from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder() #标准化标签，将标签值统一转换成range(标签值个数-1)范围内
for col in data.columns:
data[col] = labelencoder.fit_transform(data[col])

data.head()
Separating features and label

X = data.iloc[:,1:23] # 获取1-23行特征
y = data.iloc[:, 0] # 获取0行标签
X.head()
y.head()
Splitting the data into training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
七、default Logistic Regression

from sklearn.linear_model import LogisticRegression
model_LR= LogisticRegression()
model_LR.fit(X_train,y_train)
y_prob = model_LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_LR.score(X_test, y_pred)
注：np.where(condition,x,y) 是三元运算符，conditon条件成立则结果为x，否则为y。

accuracy

auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
八、Logistic Regression(Tuned model) 调整模型

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

LR_model= LogisticRegression()

tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] ,
'penalty': ['l1','l2']
}
九、CV

from sklearn.model_selection import GridSearchCV

LR= GridSearchCV(LR_model, tuned_parameters,cv=10)
LR.fit(X_train,y_train)
print(LR.best_params_)
y_prob = LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
LR.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
十、Default Decision Tree model

from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_prob = model_tree.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_tree.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十一、Let us tune the hyperparameters of the Decision tree model

from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

tuned_parameters= { 'max_features': ["auto","sqrt","log2"],
'min_samples_leaf': range(1,100,1) , 'max_depth': range(1,50,1)
}
#tuned_parameters= { 'max_features': ["auto","sqrt","log2"] }

#If “auto”, then max_features=sqrt(n_features).
from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
DD.fit(X_train, y_train)
print(DD.grid_scores_)
print(DD.best_score_)
print(DD.best_params_)
y_prob = DD.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
DD.score(X_test, y_pred)
auc_roc=metrics.classification_report(y_test,y_pred)
print(auc_roc)
十二、Default Random Forest
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()
model_RR.fit(X_train,y_train)
y_prob = model_RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_RR.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十三、Let us tuned the parameters of Random Forest just for the purpose of knowledge
1) max_features

2) n_estimators 估计量

3) min_sample_leaf

from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

tuned_parameters = {'min_samples_leaf' range(10,100,10), 'n_estimators' : range(10,100,10),
'max_features':['auto','sqrt','log2']
}

from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters,cv=10)

RR.fit(X_train,y_train)

print(RR.grid_scores_)

print(RR.best_score_)

print(RR.best_params_)

y_prob = RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
RR_model.score(X_test, y_pred)

auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十四、Default XGBoost

from xgboost import XGBClassifier
model_XGB=XGBClassifier()
model_XGB.fit(X_train,y_train)
y_prob = model_XGB.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_XGB.score(X_test, y_pred)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
十五、特征重要性
在XGBoost中特征重要性已经自动算好，存放在featureimportances

print(model_XGB.feature_importances_)
from matplotlib import pyplot
pyplot.bar(range(len(model_XGB.feature_importances_)), model_XGB.feature_importances_)
pyplot.show()
# plot feature importance using built-in function
from xgboost import plot_importance
plot_importance(model_XGB)
pyplot.show()
可以根据特征重要性进行特征选择
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Fit model using each importance as a threshold
thresholds = sort(model_XGB.feature_importances_)
for thresh in thresholds:
# select features using threshold
selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# train model
selection_model = XGBClassifier()
selection_model.fit(select_X_train, y_train)
# eval model
select_X_test = selection.transform(X_test)
y_pred = selection_model.predict(select_X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1],
accuracy*100.0))