决策树实战

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Iris species name -> integer class label
def iris_type(s):
    """Map an iris species name to its integer class label.

    Accepts the name either as ``bytes`` or as ``str`` so the function can
    be used as a text-loader converter regardless of which of the two the
    loader hands over. Surrounding whitespace is ignored.

    Args:
        s: Species name, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the name is not one of the three known species.
    """
    if isinstance(s, bytes):
        # latin-1 can decode any byte sequence, so an unknown name still
        # ends in KeyError below rather than in a decoding error.
        s = s.decode('latin-1')
    labels = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return labels[s.strip()]


# Display names of the four feature columns, in column order:
# sepal length, sepal width, petal length, petal width.
iris_feature = ('花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度')

if __name__ == "__main__":
    # Fit a depth-3 decision tree on the first two iris features, export the
    # tree in Graphviz format, plot its decision regions, report the test
    # accuracy, and finally plot the test error rate against tree depth.

    # Select a font that contains CJK glyphs for the Chinese labels and keep
    # the minus sign renderable with it.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Forward slashes work on every platform; the backslash form relied on
    # the invalid escape sequence '\8'.
    path = './8.iris.data'
    # Column 4 holds the species name and is mapped to a numeric label.
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # The first four columns are the features, the remaining one the label.
    x, y = np.split(data, (4,), axis=1)
    # Only the first two features are used so the result can be drawn in 2-D.
    x = x[:, :2]
    # Flatten the label column once; everything below works on a 1-D vector.
    y = y.ravel()
    # 70 % training data, 30 % test data; the split is fixed by random_state.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

    # Standardize the features, then fit an entropy-based tree limited to
    # depth 3.
    model = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', DecisionTreeClassifier(criterion='entropy', max_depth=3))])
    model.fit(x_train, y_train)
    # Predictions for the held-out test samples.
    y_test_hat = model.predict(x_test)

    # Write the fitted tree in Graphviz format; the context manager closes
    # the file even if the export raises.
    with open('./iris_tree.dot', 'w') as f:
        tree.export_graphviz(model.named_steps['DTC'], out_file=f)

    # Plotting: sample an N x M grid that spans the range of both features.
    N, M = 100, 100
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)
    # One (feature 0, feature 1) row per grid point.
    x_show = np.stack((x1.ravel(), x2.ravel()), axis=1)

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    # Predict every grid point and reshape back to the grid for pcolormesh.
    y_show_hat = model.predict(x_show).reshape(x1.shape)
    plt.figure(facecolor='pink')
    # Decision regions as a coloured background.
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)
    # Test samples are drawn as large stars ...
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test, edgecolors='k', s=100, cmap=cm_dark, marker='*')
    # ... and all samples as smaller circles on top.
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=40, cmap=cm_dark)
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title(u'鸢尾花数据的决策树分类', fontsize=17)
    # This figure is displayed by the single plt.show() at the end.

    # Accuracy on the test set: fraction of predictions equal to the labels.
    print(y_test_hat)
    print(y_test)
    acc = np.mean(y_test_hat == y_test)
    print('准确度: %.2f%%' % (100 * acc))

    # Overfitting study: test error rate for tree depths 1..14, fitted on the
    # unscaled training features.
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf.fit(x_train, y_train)
        # Error rate = 1 - fraction of correct test predictions.
        err = 1 - np.mean(clf.predict(x_test) == y_test)
        err_list.append(err)
        # The printed value is the error rate, so it is labelled as such.
        print(d, '层决策树的 错误率: %.2f%%' % (100 * err))

    plt.figure(facecolor='pink')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel(u'决策树深度', fontsize=15)
    plt.ylabel(u'错误率', fontsize=15)
    plt.title(u'决策树深度与过拟合', fontsize=17)
    plt.grid(True)
    plt.show()

选不同特征建立决策树

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.tree import DecisionTreeClassifier


# Iris species name -> integer class label
def iris_type(s):
    """Map an iris species name to its integer class label.

    Accepts the name either as ``bytes`` or as ``str`` so the function can
    be used as a text-loader converter regardless of which of the two the
    loader hands over. Surrounding whitespace is ignored.

    Args:
        s: Species name, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the name is not one of the three known species.
    """
    if isinstance(s, bytes):
        # latin-1 can decode any byte sequence, so an unknown name still
        # ends in KeyError below rather than in a decoding error.
        s = s.decode('latin-1')
    labels = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return labels[s.strip()]

# Display names of the four feature columns, in column order:
# sepal length, sepal width, petal length, petal width.
iris_feature = ('花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度')

if __name__ == "__main__":
    # For every pair of iris features, fit a decision tree on all samples,
    # print its accuracy on those same samples and draw its decision regions
    # in one panel of a 2 x 3 figure.

    # Select a font that contains CJK glyphs (SimHei; FangSong or KaiTi are
    # alternatives) and keep the minus sign renderable with it.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Forward slashes work on every platform; the backslash form relied on
    # the invalid escape sequence '\8'.
    path = './8.iris.data'
    # Column 4 holds the species name and is mapped to a numeric label.
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # The first four columns are the features, the remaining one the label.
    x_prime, y = np.split(data, (4,), axis=1)
    # Flatten the label column once so every iteration fits, compares and
    # plots with the same 1-D vector.
    y = y.ravel()

    # All two-feature combinations of the four columns.
    feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    # Grid resolution and colour maps are the same for every panel.
    N, M = 500, 500
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
    for i, pair in enumerate(feature_pairs):
        # All rows, only the two columns of the current pair.
        x = x_prime[:, pair]

        # Entropy-based tree; every leaf must hold at least 3 samples.
        clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
        dt_clf = clf.fit(x, y)

        # N x M grid spanning the range of both selected features.
        x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
        x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
        t1 = np.linspace(x1_min, x1_max, N)
        t2 = np.linspace(x2_min, x2_max, M)
        x1, x2 = np.meshgrid(t1, t2)
        # One (feature 0, feature 1) row per grid point.
        x_test = np.stack((x1.ravel(), x2.ravel()), axis=1)

        # Accuracy on the training samples themselves.
        y_hat = dt_clf.predict(x)
        c = np.count_nonzero(y_hat == y)    # number of correct predictions
        print('特征：  ', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
        print('\t预测正确数目：', c)
        print('\t准确率: %.2f%%' % (100.0 * c / len(y)))

        # Predict every grid point and reshape back to the grid.
        y_hat = dt_clf.predict(x_test).reshape(x1.shape)
        plt.subplot(2, 3, i + 1)
        # Decision regions as background, samples on top.
        plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark)
        plt.xlabel(iris_feature[pair[0]], fontsize=14)
        plt.ylabel(iris_feature[pair[1]], fontsize=14)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.grid()
    plt.suptitle(u'决策树对鸢尾花数据的两特征组合的分类结果', fontsize=18)
    # pad must be passed by keyword; current Matplotlib rejects it as a
    # positional argument.
    plt.tight_layout(pad=2)
    # Leave room at the top for the figure title.
    plt.subplots_adjust(top=0.92)
    plt.show()

决策树回归

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor


if __name__ == "__main__":
    # Fit regression trees to a noisy sine curve: first a single tree of
    # depth 9, then trees of several depths drawn together for comparison.
    N = 100
    # N sample positions drawn uniformly from [-3, 3), sorted ascending.
    x = np.random.rand(N) * 6 - 3
    x.sort()
    # sin(x) plus Gaussian noise with mean 0 and standard deviation 0.05.
    y = np.sin(x) + np.random.randn(N) * 0.05
    print(y)
    # Column vector: N samples with a single feature each.
    x = x.reshape(-1, 1)
    print(x)

    # The criterion is left at its default, which is mean squared error.
    # Spelling it out as 'mse' is rejected by recent scikit-learn releases,
    # where the same criterion is named 'squared_error'.
    reg = DecisionTreeRegressor(max_depth=9)
    dt = reg.fit(x, y)
    # 50 evenly spaced evaluation points; reused by both figures.
    x_test = np.linspace(-3, 3, 50).reshape(-1, 1)
    y_hat = dt.predict(x_test)
    plt.plot(x, y, 'r*', linewidth=2, label='Actual')
    # Prediction of the depth-9 tree.
    plt.plot(x_test, y_hat, 'g-', linewidth=2, label='Predict')
    plt.legend(loc='upper left')
    plt.grid()
    plt.show()

    # Compare the effect of the tree depth: one regressor and one line
    # colour per depth.
    depth = [2, 4, 6, 8, 10]
    clr = 'rgbmy'
    reg = [DecisionTreeRegressor(max_depth=d) for d in depth]

    plt.plot(x, y, 'k^', linewidth=2, label='Actual')
    for i, r in enumerate(reg):
        dt = r.fit(x, y)
        y_hat = dt.predict(x_test)
        plt.plot(x_test, y_hat, '-', color=clr[i], linewidth=2, label='Depth=%d' % depth[i])
    plt.legend(loc='upper left')
    plt.grid()
    plt.show()

多输出的决策树回归

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

if __name__ == "__main__":
    # Multi-output regression: a single tree predicts the point (y1, y2) of
    # a parametric curve from the parameter x.
    N = 300
    # N parameter values drawn uniformly from [-4, 4), sorted ascending.
    x = np.random.rand(N) * 8 - 4
    x.sort()
    # Two targets per sample, each with a small amount of Gaussian noise.
    y1 = 16 * np.sin(x) ** 3 + np.random.randn(N) * 0.05
    y2 = 13 * np.cos(x) - 5 * np.cos(2*x) - 2 * np.cos(3*x) - np.cos(4*x) + 0.1*np.random.randn(N)
    # Stack the two targets as rows and transpose, giving one (y1, y2) row
    # per sample.
    y = np.vstack((y1, y2)).T
    # Column vector: N samples with a single feature each.
    x = x.reshape(-1, 1)

    # Tree of depth 5. The criterion is left at its default, which is mean
    # squared error; spelling it out as 'mse' is rejected by recent
    # scikit-learn releases, where the same criterion is named
    # 'squared_error'.
    deep = 5
    reg = DecisionTreeRegressor(max_depth=deep)
    # Fitting on a two-column y yields a multi-output regression tree.
    dt = reg.fit(x, y)

    # 1000 evenly spaced parameter values to evaluate the tree on.
    x_test = np.linspace(-4, 4, num=1000).reshape(-1, 1)
    print(x_test)
    y_hat = dt.predict(x_test)
    print(y_hat)
    # Plot in target space: actual points versus predicted points.
    plt.scatter(y[:, 0], y[:, 1], c='r', s=40, label='Actual')
    plt.scatter(y_hat[:, 0], y_hat[:, 1], c='g', marker='s', s=100, label='Depth=%d' % deep, alpha=1)
    plt.legend(loc='upper left')
    plt.xlabel('y1')
    plt.ylabel('y2')
    plt.grid()
    plt.show()

随机森林

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.ensemble import RandomForestClassifier


def iris_type(s):
    """Map an iris species name to its integer class label.

    Accepts the name either as ``bytes`` or as ``str`` so the function can
    be used as a text-loader converter regardless of which of the two the
    loader hands over. Surrounding whitespace is ignored.

    Args:
        s: Species name, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.

    Returns:
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises:
        KeyError: If the name is not one of the three known species.
    """
    if isinstance(s, bytes):
        # latin-1 can decode any byte sequence, so an unknown name still
        # ends in KeyError below rather than in a decoding error.
        s = s.decode('latin-1')
    labels = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return labels[s.strip()]

# Display names of the four feature columns, in column order:
# sepal length, sepal width, petal length, petal width.
iris_feature = ('花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度')

if __name__ == "__main__":
    # For every pair of iris features, fit a random forest on all samples,
    # print its accuracy on those same samples and draw its decision regions
    # in one panel of a 2 x 3 figure.

    # Select a font that contains CJK glyphs (SimHei; FangSong or KaiTi are
    # alternatives) and keep the minus sign renderable with it.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Forward slashes work on every platform; the backslash form relied on
    # the invalid escape sequence '\8'.
    path = './8.iris.data'
    # Column 4 holds the species name and is mapped to a numeric label.
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    # The first four columns are the features, the remaining one the label.
    x_prime, y = np.split(data, (4,), axis=1)
    # Flatten the label column once; fitting, comparing and plotting all use
    # the same 1-D vector.
    y = y.ravel()

    # All two-feature combinations of the four columns.
    feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    # Grid resolution and colour maps are the same for every panel.
    N, M = 500, 500
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    plt.figure(figsize=(10, 9), facecolor='#FFFFFF')
    # i is the zero-based index of the pair and selects the subplot.
    for i, pair in enumerate(feature_pairs):
        # All rows, only the two columns of the current pair.
        x = x_prime[:, pair]

        # Forest of 200 entropy-based trees, each limited to depth 4.
        clf = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=4)
        rf_clf = clf.fit(x, y)

        # N x M grid spanning the range of both selected features.
        x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
        x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
        t1 = np.linspace(x1_min, x1_max, N)
        t2 = np.linspace(x2_min, x2_max, M)
        x1, x2 = np.meshgrid(t1, t2)
        # One (feature 0, feature 1) row per grid point.
        x_test = np.stack((x1.ravel(), x2.ravel()), axis=1)

        # Accuracy on the training samples themselves.
        y_hat = rf_clf.predict(x)
        c = np.count_nonzero(y_hat == y)    # number of correct predictions
        print('特征：  ', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
        print('\t预测正确数目：', c)
        print('\t准确率: %.2f%%' % (100.0 * c / len(y)))

        # Predict every grid point and reshape back to the grid.
        y_hat = rf_clf.predict(x_test).reshape(x1.shape)
        plt.subplot(2, 3, i + 1)
        # Decision regions as background, samples on top.
        plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', cmap=cm_dark)
        plt.xlabel(iris_feature[pair[0]], fontsize=14)
        plt.ylabel(iris_feature[pair[1]], fontsize=14)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.grid()
    # pad must be passed by keyword; current Matplotlib rejects it as a
    # positional argument.
    plt.tight_layout(pad=2.5)
    # Leave room at the top for the figure title.
    plt.subplots_adjust(top=0.92)
    plt.suptitle(u'随机森林对鸢尾花数据的两特征组合的分类结果', fontsize=18)
    plt.show()