Tuning Hyperparameters with a Genetic Algorithm [Decision Tree]

Background

I recently took on a project that uses a genetic algorithm (GA) to tune the hyperparameters of a decision tree. I had always tuned with grid search and hadn't realized a GA could do this job too. Since I had already written quite a few GA implementations, I accepted. I expected it to take some time, but in practice, if you know the pieces, it can be done in 2-3 hours.

Algorithm

For a real project you obviously use a library (no way I was writing the tree myself), so the decision tree comes from sklearn. The GA workflow itself is straightforward, so I wrote it by hand. Below I focus on how each step of the GA is implemented.
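At a high level, each generation applies fitness evaluation, selection, crossover, and mutation in turn. This is the main loop from the complete listing at the end of the post:

for i in range(epochs):
    ada = adaption(X, Y, forest)        # fitness scores -> cumulative probabilities
    forest = choose_trees(forest, ada)  # roulette-wheel selection
    forest = cross(forest)              # single-point crossover on the parameters
    forest = variation(forest)          # +/-1 mutation on one random parameter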

Initialization

Three of the decision tree's more important parameters are chosen: "max_depth", "min_samples_split", and "max_leaf_nodes". The population is initialized by enumerating candidate values for all three:

def init():
    # initial population: one tree per combination of candidate parameter values
    forest = []
    for max_depth in range(5, 31, 3):
        for min_samples_split in range(5, 25, 5):
            for max_leaf_nodes in range(5, 25, 5):
                forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
    return forest
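With these ranges the initial population holds 9 × 4 × 4 = 144 trees. Conveniently, that is an even number, which matters later because cross() pairs up neighbours and silently drops the last tree of an odd-sized population.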

Selection

Mean cross-validated accuracy is used as the fitness score, and the scores are turned into cumulative probabilities for roulette-wheel selection:

def tree_score(X, Y, clf):
    # fitness: mean accuracy over 5-fold cross-validation
    kf = KFold(n_splits=5)
    score = []
    for train_index, valid_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        pred = clf.predict(X[valid_index])
        score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
    return np.mean(score)
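Note that KFold defaults to shuffle=False, so the folds follow row order; if the dataset is not already shuffled, KFold(n_splits=5, shuffle=True, random_state=0) would be the safer call (the random_state value here is just an example). The adaption function then converts the raw scores into cumulative selection probabilities and records the best individual for the elitist step: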
def adaption(X, Y, forest):
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    # remember the best individual of this generation (used for elitism later)
    best_pos = np.argmax(score)
    global BEST_TREE
    BEST_TREE = copy.deepcopy(forest[best_pos])
    # normalize the scores, then build the cumulative distribution
    sm = np.sum(score)
    ada = score / sm
    for i in range(1, len(ada)):
        ada[i] = ada[i] + ada[i - 1]
    return ada
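Selection itself draws individuals against these cumulative probabilities. For completeness, this is the choose_trees function from the full listing:

def choose_trees(forest, ada):
    # roulette wheel: draw a uniform r and copy the first individual
    # whose cumulative probability covers it
    sz = len(forest)
    result = []
    for i in range(sz):
        r = random.random()
        for j in range(len(ada)):
            if r <= ada[j]:
                result.append(copy.deepcopy(forest[j]))
                break
    return result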

One thing worth noting about selection: you can apply an elitist strategy, i.e. copy the best individual of the current generation straight into the next one. This is very useful for improving the stability of the algorithm; the excerpt below shows how the complete code does it.
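In the complete listing this happens in evulate_forest: after the new generation has been produced, the worst individual is replaced by the BEST_TREE that adaption saved. These are the relevant lines:

worse_pos = np.argmin(score)
forest[worse_pos] = BEST_TREE
score[worse_pos] = tree_score(X, Y, BEST_TREE)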

Crossover

Crossover operates on the parameter vectors: given two classifiers clf1 and clf2, a random cut position p is drawn and the parameter segments after it are swapped:

def _cross_2_tree(t1, t2):
    sz = len(param)

    # read the current parameter values off each estimator
    t1_param_value = _dict_get_value_list(t1.__dict__, param)
    t2_param_value = _dict_get_value_list(t2.__dict__, param)
    # single-point crossover: cut after position pos and swap the tails
    pos = random.randint(0, sz - 1)
    t1_left = t1_param_value[0:pos + 1]
    t1_right = t1_param_value[pos + 1:]

    t2_left = t2_param_value[0:pos + 1]
    t2_right = t2_param_value[pos + 1:]

    t1_left.extend(t2_right)
    t2_left.extend(t1_right)
    return [make_tree(t1_left), make_tree(t2_left)]


def cross(forest):
    # pair up neighbours; with an odd-sized population the last tree is dropped
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    return result
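A quick sanity check of the cut logic: with parent parameter vectors [5, 10, 15] and [8, 20, 5] and pos = 0, the children get the parameters [5, 20, 5] and [8, 10, 15]. When pos lands on the last position (sz - 1), both tails are empty and the children are simply clones of their parents.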

Mutation

This step uses a deliberately simple strategy: pick one parameter at random and apply +1 or -1 to it, clamped below at 2 so every parameter stays valid for sklearn. Note that an individual survives unchanged with probability VAR_P and is mutated otherwise, so VAR_P = 0.4 actually means a 60% mutation rate:

def variation(forest):
    result = []
    for t in forest:
        r = random.random()
        # keep the tree unchanged with probability VAR_P
        if r < VAR_P:
            result.append(t)
            continue

        # mutation: nudge one randomly chosen parameter by +/-1
        sz = len(param)
        pos = random.randint(0, sz - 1)
        val = t.__dict__[param[pos]]
        up = random.random()

        if up > 0.5:
            val = val + 1
        else:
            val = val - 1

        # clamp so the parameter stays valid
        if val < 2:
            val = 2
        t.__dict__[param[pos]] = val
        result.append(t)
    return result
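Writing to t.__dict__ works because sklearn estimators store their constructor parameters as plain attributes, but the public API is the cleaner route; an equivalent one-liner using set_params:

t.set_params(**{param[pos]: val})  # same effect as t.__dict__[param[pos]] = val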

Complete code

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import random
import copy
import matplotlib.pyplot as plt

# the three hyperparameters the GA evolves
param = ["max_depth", "min_samples_split", "max_leaf_nodes"]
epochs = 300
VAR_P = 0.4
BEST_TREE = None


def make_tree(param_value):
    # build a classifier from a parameter-value vector
    p = dict(zip(param, param_value))
    return DecisionTreeClassifier(**p)


def init():
    # initial population: one tree per combination of candidate parameter values
    forest = []
    for max_depth in range(5, 31, 3):
        for min_samples_split in range(5, 25, 5):
            for max_leaf_nodes in range(5, 25, 5):
                forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
    return forest


def tree_score(X, Y, clf):
    # fitness: mean accuracy over 5-fold cross-validation
    kf = KFold(n_splits=5)
    score = []
    for train_index, valid_index in kf.split(X):
        clf.fit(X[train_index], Y[train_index])
        pred = clf.predict(X[valid_index])
        score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
    return np.mean(score)


def evulate_forest(X, Y, forest):
    # elitism: replace the worst individual of the new generation with the saved best tree
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    worse_pos = np.argmin(score)
    global BEST_TREE
    forest[worse_pos] = BEST_TREE
    score[worse_pos] = tree_score(X, Y, BEST_TREE)

    score.sort(reverse=True)
    return score, np.mean(score)


def adaption(X, Y, forest):
    # score every tree, remember the best one, and build the
    # cumulative distribution for roulette-wheel selection
    score = []
    for t in forest:
        score.append(tree_score(X, Y, t))
    best_pos = np.argmax(score)
    global BEST_TREE
    BEST_TREE = copy.deepcopy(forest[best_pos])
    sm = np.sum(score)
    ada = score / sm
    for i in range(1, len(ada)):
        ada[i] = ada[i] + ada[i - 1]
    return ada


def choose_trees(forest, ada):
    # roulette-wheel selection against the cumulative probabilities
    sz = len(forest)
    result = []
    for i in range(sz):
        r = random.random()
        for j in range(len(ada)):
            if r <= ada[j]:
                result.append(copy.deepcopy(forest[j]))
                break
    return result


def _dict_get_value_list(mp, key_list):
    value_list = []
    for key in key_list:
        value_list.append(mp.get(key))
    return value_list


def _cross_2_tree(t1, t2):
    # single-point crossover on the parameter vectors of two trees
    sz = len(param)

    t1_param_value = _dict_get_value_list(t1.__dict__, param)
    t2_param_value = _dict_get_value_list(t2.__dict__, param)
    pos = random.randint(0, sz - 1)
    t1_left = t1_param_value[0:pos + 1]
    t1_right = t1_param_value[pos + 1:]

    t2_left = t2_param_value[0:pos + 1]
    t2_right = t2_param_value[pos + 1:]

    t1_left.extend(t2_right)
    t2_left.extend(t1_right)
    return [make_tree(t1_left), make_tree(t2_left)]


def cross(forest):
    # pair up neighbours; with an odd-sized population the last tree is dropped
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    return result


def variation(forest):
    # mutation: with probability 1 - VAR_P, nudge one random parameter by +/-1 (clamped at 2)
    result = []
    for t in forest:
        r = random.random()
        if r < VAR_P:
            result.append(t)
            continue

        sz = len(param)
        pos = random.randint(0, sz - 1)
        val = t.__dict__[param[pos]]
        up = random.random()

        if up > 0.5:
            val = val + 1
        else:
            val = val - 1

        if val < 2:
            val = 2
        t.__dict__[param[pos]] = val
        result.append(t)
    return result


df = pd.read_csv("../dataset/data.csv", index_col=0)
X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values
forest = init()

mean_score_arr = []

for i in range(epochs):
    ada = adaption(X, Y, forest)        # fitness and cumulative probabilities
    forest = choose_trees(forest, ada)  # selection
    forest = cross(forest)              # crossover
    forest = variation(forest)          # mutation
    score, mean = evulate_forest(X, Y, forest)  # elitism + bookkeeping
    mean_score_arr.append(mean)

    print(i, "/", epochs, ":")
    print("mean:", mean)

plt.plot(np.arange(len(mean_score_arr)), mean_score_arr)
plt.show()
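The loop only tracks and plots the mean fitness per generation; to see the winning configuration itself, the global BEST_TREE can be inspected once training ends, for example:

print(BEST_TREE.get_params())  # the best hyperparameter set the GA found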

Summary

Honestly, GA-based tuning feels underwhelming here; plain grid search gets there faster. Still, it is worth learning as an idea. For comparison, a grid-search equivalent is sketched below.
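A minimal grid-search sketch over the same three parameters using sklearn's GridSearchCV; the value ranges mirror init(), everything else is just one reasonable setup:

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid={
        "max_depth": list(range(5, 31, 3)),
        "min_samples_split": list(range(5, 25, 5)),
        "max_leaf_nodes": list(range(5, 25, 5)),
    },
    cv=5,
    scoring="accuracy",
)
grid.fit(X, Y)
print(grid.best_params_)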

Original post: https://www.cnblogs.com/oldBook/p/10656241.html