scikit-learn机器学习(四)使用决策树做分类,并画出决策树,随机森林对比

数据来自 UCI 数据集:皮马印第安人糖尿病数据集(Pima Indians Diabetes)

载入数据

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib
# Global matplotlib settings: use the SimHei typeface for sans-serif text
# (presumably so Chinese labels render) and turn off the Unicode minus glyph
# so negative numbers are drawn with a plain ASCII hyphen.
matplotlib.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_breast_cancer

# Read the diabetes table from the working directory; with pandas' default
# settings the first CSV row is taken as the column names.
data_set = pd.read_csv('pima-indians-diabetes.csv')
# Work on the underlying 2-D array instead of the DataFrame.
data = data_set.values

# Columns 0-7 are used as features, column 8 as the target.
X = data[:, :8]
y = data[:, 8]

# Random train/test partition with scikit-learn's default proportions.
# No random_state is given, so every run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y)

建立决策树,网格搜索微调模型

# In[1] Tune the decision tree with a grid search
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy')),
])
# Candidate values; the 'clf__' prefix routes each one to the pipeline step
# named 'clf'.
parameters = {
    'clf__max_depth': (3, 5, 10, 15, 20, 25, 30, 35, 40),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}
# GridSearchCV tries every parameter combination and keeps the one with the
# best cross-validated score (F1 here). n_jobs=-1 uses all available cores.
# verbose must be >= 0: negative values are rejected by the parameter
# validation of current scikit-learn releases; 0 keeps the search silent.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0, scoring='f1')
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the winning parameter values.
best_parameters = grid_search.best_estimator_.get_params()
print("最好的F1值为:", grid_search.best_score_)
print('最好的参数为:')
for param_name in sorted(parameters):
    # Tab-indent each line. NOTE: output recorded from an earlier version of
    # this snippet shows a literal "t" prefix because the backslash of '\t'
    # had been lost.
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# In[2] Predict on the held-out split and print the per-class metrics
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
最好的F1值为: 0.5573515325670498
最好的参数为:
tclf__max_depth: 5
tclf__min_samples_leaf: 1
tclf__min_samples_split: 2

评价模型

# In[2] Score the tuned model on the held-out split
# GridSearchCV.predict delegates to the best estimator found by the search.
predictions = grid_search.predict(X_test)
# Per-class precision / recall / F1 / support table.
print(classification_report(y_true=y_test, y_pred=predictions))
              precision    recall  f1-score   support

         0.0       0.74      0.89      0.81       124
         1.0       0.67      0.43      0.52        68

画出决策树

# In[3] Fit a decision tree and export it as a diagram
from sklearn import tree
# Every column name except the last one (the last column is used as the target).
feature_name = data_set.columns.values.tolist()[:-1]
# NOTE(review): min_samples_leaf=5 differs from the value the grid search
# reported as best (1) - confirm which one is intended.
DT = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                 min_samples_split=2, min_samples_leaf=5)
DT.fit(X_train, y_train)

# export_graphviz expects class_names in ascending order of the class labels.
# The labels here are 0.0 and 1.0; assuming 1 marks a diabetic sample (as in
# the Pima dataset - TODO confirm), "无病" has to come first. The reverse
# order would swap the labels shown in the rendered tree.

# Option 1 (disabled alternative): render through pydotplus.
# sklearn.externals.six no longer exists in scikit-learn, so StringIO is
# taken from the standard library instead.
#
# import pydotplus
# from io import StringIO
# dot_data = StringIO()
# tree.export_graphviz(DT, out_file=dot_data, feature_names=feature_name,
#                      class_names=["无病", "有糖尿病"], filled=True, rounded=True,
#                      special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("Tree.pdf")
# print('Visible tree plot saved as pdf.')

# Option 2: let the graphviz package render the DOT source.
import graphviz
# DT must already be fitted when it is exported.
dot_data = tree.export_graphviz(DT, out_file=None, feature_names=feature_name,
                                class_names=["无病", "有糖尿病"])
graph = graphviz.Source(dot_data)
# Saves the DOT source as "tree2" and the rendered diagram as "tree2.pdf",
# relative to the current working directory.
graph.render("tree2")

随机森林

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib
# Global matplotlib settings: use the SimHei typeface for sans-serif text
# (presumably so Chinese labels render) and turn off the Unicode minus glyph
# so negative numbers are drawn with a plain ASCII hyphen.
matplotlib.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import load_breast_cancer

# Read the diabetes table; the first CSV row becomes the column names
# (pandas default), then work on the underlying 2-D array.
data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values

# Columns 0-7 are used as features, column 8 as the target.
X = data[:, :8]
y = data[:, 8]
# Unseeded random split: the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Forest of ten trees. random_state fixes the forest's own randomness only;
# the train/test split above is still random.
RF = RandomForestClassifier(n_estimators=10, random_state=11)
RF.fit(X_train, y_train)

# Evaluate on the held-out split and print the per-class metrics table.
predictions = RF.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
              precision    recall  f1-score   support

         0.0       0.82      0.91      0.86       126
         1.0       0.78      0.61      0.68        66

   micro avg       0.81      0.81      0.81       192
   macro avg       0.80      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192
原文地址:https://www.cnblogs.com/caiyishuai/p/11192156.html