使用Graphlab参加Kaggle比赛(2017-08-20 发布于知乎)

之前用学生证在graphlab上申了一年的graphlab使用权（华盛顿大学机器学习课程需要）然后今天突然想到完全可以用这个东东来参加kaggle.

下午参考了一篇教程，把notebook上面的写好了

本文很多代码参考了turi官网的一个教程，有兴趣的同学可以去看原版 https://turi.com/learn/gallery/notebooks/who_survived_the_titanic.html

代码

import graphlab as gl
%matplotlib inline
import matplotlib.pyplot as mpl 
mpl.rcParams['figure.figsize']=(15.0,8.0) 
import numpy as np

第一步：数据探索

导入数据

train = graphlab.SFrame.read_csv('train.csv')

数据探索与数据可视化

#看看除了Survived这一列以外其他列的缺值情况
columns = ("Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")
not_null=[sum(1 for el in train[column] if el or el == 0)for column in columns]
null = [len(train) - el for el in not_null]
#数字指代第几列
indexes = np.arange(len(columns))
width = 0.5
#用柱形图表示缺值情况
not_null_bar = mpl.bar(indexes, not_null, width, color='green', edgecolor='white', alpha=0.8)#非空为绿，底色为白
null_bar = mpl.bar(indexes, null, width, color='red', edgecolor='white', bottom=not_null, alpha=0.8)#空值为红，底色为白
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)#横轴的范围
#柱形图标题
mpl.title('Null values for each column', fontsize=20, weight='bold')
#x轴单位长度
mpl.xticks(indexes + width/2., columns, fontsize=16)
#y轴单位长度
mpl.yticks(np.arange(0,1200,100))
#右上角为图例
mpl.legend( (not_null_bar[0], null_bar[0]), ('Not Null', 'Null') )

观察上图我们知道Age列有少量缺值，Cabin列有大量的缺值，于是我们需要补全Age缺值，但是把Cabin列整个忽略

直接用Age的均值补全空值

train = train.fillna('Age',train['Age'].mean())
#看看除了Survived这一列以外其他列的缺值情况
columns = ("Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")
not_null=[sum(1 for el in train[column] if el or el == 0)for column in columns]
null = [len(train) - el for el in not_null]
#数字指代第几列
indexes = np.arange(len(columns))
width = 0.5
#用柱形图表示缺值情况
not_null_bar = mpl.bar(indexes, not_null, width, color='green', edgecolor='white', alpha=0.8)#非空为绿，底色为白
null_bar = mpl.bar(indexes, null, width, color='red', edgecolor='white', bottom=not_null, alpha=0.8)#空值为红，底色为白
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)#横轴的范围
#柱形图标题
mpl.title('Null values for each column', fontsize=20, weight='bold')
#x轴单位长度
mpl.xticks(indexes + width/2., columns, fontsize=16)
#y轴单位长度
mpl.yticks(np.arange(0,1200,100))
#右上角为图例
mpl.legend( (not_null_bar[0], null_bar[0]), ('Not Null', 'Null') )

我们看看Pclass与生存率的关系

passenger_class = train["Pclass"].astype(str)
#观察每个Pclass的存活率
#用groupby方法
class_distribution = train.groupby(["Pclass", "Survived"], {'count':gl.aggregate.COUNT()})
#用0和1过滤出生存和死亡
survived = class_distribution.filter_by(1,'Survived').sort("Pclass")
died = class_distribution.filter_by(0,'Survived').sort("Pclass")

width = 0.5 
#柱形图的参数
survived_bar = mpl.bar(survived["Pclass"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(died["Pclass"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by Passengers Class', fontsize=20, weight='bold')
mpl.xticks(survived["Pclass"] + width/2., survived["Pclass"], fontsize=16)
mpl.xlim(0.5,4)
mpl.yticks(np.arange(0,600,50))
mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

for ind in np.arange(len(survived)):
    ind = int(ind)
    x = 1 + ind + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')

由此可见，Pclass的存活率从1到3逐次下降

我们看看性别与生存率的关系

sex_distribution = train.groupby(["Sex", "Survived"], {'count':gl.aggregate.COUNT()})

survived = sex_distribution.filter_by(1,'Survived').sort("Sex")
died = sex_distribution.filter_by(0,'Survived').sort("Sex")

indexes = np.arange(len(survived["Sex"]))


width = 0.5 

survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by Sex', fontsize=20, weight='bold')
survived["Sex"] = [sex.capitalize() for sex in survived["Sex"]]
mpl.xticks(indexes + width/2., survived["Sex"], fontsize=16)
mpl.xlim(-0.5,2)
mpl.yticks(np.arange(0,700, 50))
mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

for ind in indexes:
    ind = int(ind)
    x = ind + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')
mpl.show()

我们看看年龄与生存率的关系

为了更加直观的体现成人与小孩的区别，我再增加一个"Categorized_Age"列

我们使用apply方法来对每个元素进行作用，小于18岁称为小孩，其余均为大人。

#增加列，18以下称为child
train['Categorized_Age'] = train['Age'].apply(lambda x: "Child" if x <= 18 else "Adult")
#用groupby方法把二者关联
age_distribution = train.groupby(["Categorized_Age", "Survived"], {'count':gl.aggregate.COUNT()}).dropna()
#过滤数据
survived = age_distribution.filter_by(1,'Survived').sort("Categorized_Age")
died = age_distribution.filter_by(0,'Survived').sort("Categorized_Age")
#柱形图参数设置
indexes = np.arange(len(survived["Categorized_Age"]))


width = 0.5 

survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by Age Categorization', fontsize=20, weight='bold')
survived["Categorized_Age"] = [sex.capitalize() for sex in survived["Categorized_Age"]]
mpl.xticks(indexes + width/2., survived["Categorized_Age"], fontsize=16)
mpl.xlim(-0.5,2)
mpl.yticks(np.arange(0,700, 50))
mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

for ind in indexes:
    ind = int(ind)
    x = ind + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')


mpl.show()

由上图可知，未成年人的存活率远大于成人

我们看看家眷人数与生存率的关系

下面的代码算出了家眷人数与生存率的关系。第一个for循环(line 6)是画图需要，遍历分组完生存率的各个家庭，若某个规模的所有家庭没有人生存，还是要加上一列。事实上，bar方法(line 12,13) 希望在每一个家庭规模都要对应的生存率，但是有5或者8个家眷的家庭都gg了。因此，我们用append方法 (line 8) 增加了两列，生存率记为0。

sibling_spouses = train["SibSp"].astype(str)
sibsp_distribution = train.groupby(["SibSp", "Survived"], {'count':gl.aggregate.COUNT()}).sort(["SibSp"])

survived = sibsp_distribution.filter_by(1,"Survived")
died = sibsp_distribution.filter_by(0,"Survived")

for sibsp in sibsp_distribution["SibSp"]:
    if not survived.filter_by(sibsp, "SibSp"):
        survived = survived.append(gl.SFrame({'SibSp': [sibsp], 'Survived': [1], 'count':[0]}))

width = 0.5 

survived_bar = mpl.bar(survived["SibSp"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(died["SibSp"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by SibSp', fontsize=20, weight='bold')
mpl.xticks(survived["SibSp"] + width/2., survived["SibSp"], fontsize=16)
mpl.xlim(-0.5,9)
mpl.yticks(np.arange(0,750,50))
mpl.xlabel("SibSp", fontsize=16)
mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

for ind in np.arange(len(survived)):
    ind = int(ind)
    x = survived["SibSp"][ind] + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')

mpl.show()

由上图可知，有一个配偶的家庭生存率最高，三口之家次之，接下来才是单身狗，而家眷超过三人生存希望渺茫.

我们看看有没有孩子与生存率的关系

parents_children = train["Parch"].astype(str)
parch_distribution = train.groupby(["Parch", "Survived"], {'count':gl.aggregate.COUNT()})

survived = parch_distribution.filter_by(1,"Survived")
died = parch_distribution.filter_by(0,"Survived")

for parch in parch_distribution["Parch"]:
    if not survived.filter_by(parch, "Parch"):
        survived = survived.append(gl.SFrame({'Parch': [parch], 'Survived': [1], 'count':[0]}))

survived = survived.sort("Parch")
died = died.sort("Parch")

width = 0.5 

survived_bar = mpl.bar(survived["Parch"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(died["Parch"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by Parch', fontsize=20, weight='bold')
mpl.xticks(survived["Parch"] + width/2., survived["Parch"], fontsize=16)
mpl.xlim(-0.5,7)
mpl.yticks(np.arange(0,800,50))
mpl.xlabel("Parch", fontsize=16)
mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

for ind in np.arange(len(survived)):
    ind = int(ind)
    x = survived["Parch"][ind] + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')

mpl.show()

我们看看船费与生存率的关系（有钱人可能有特权

fare = train["Fare"]
survived = train.filter_by(1,'Survived')["Fare"]
died = train.filter_by(0,'Survived')["Fare"]

data_to_plot = [died, survived]

bp = mpl.boxplot(data_to_plot,patch_artist=True, vert=0)

## change outline color, fill color and linewidth of the boxes
for box in bp['boxes']:
    # change outline color
    box.set( color='#7570b3', linewidth=2)
    # change fill color
    box.set( facecolor = '#1b9e77' )

## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#7570b3', linewidth=2)

## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#7570b3', linewidth=2)

## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#b2df8a', linewidth=2)

## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='o', color='#e7298a', alpha=0.5)


mpl.yticks([1,2],['Died', 'Survived'], fontsize=20)
mpl.xticks(np.arange(0,700, 20))
mpl.xlim(-10,515)
mpl.title("Survived by Fare", fontsize=20, weight='bold')
mpl.show()

这个图是反着看的，活下来的人跟死去的人花的船费对比。活下来的人普遍花了较多的船费，均值在35刀。而死去的人花费均值才几美刀。（注意有个花500多刀的真·土豪

我们看看上船渡口与生存率的关系

port = train["Embarked"].apply(
                                    lambda el: el + " (S = Southampton)" if el == "S" 
                                        else ( el + " (C = Cherbourg)" if el == "C" 
                                         else (el + " (Q = Queenstown)" if el == "Q" else None))) 
port.tail(1) # force the lambda to materialize before .show() is processed
port.show()

embarked_distribution = train.groupby(["Embarked", "Survived"], {'count':gl.aggregate.COUNT()}).dropna()

survived = embarked_distribution.filter_by(1,'Survived').sort("Embarked")
survived = survived[1:]
died = embarked_distribution.filter_by(0,'Survived').sort("Embarked")

indexes = np.arange(len(survived["Embarked"]))

width = 0.5 

survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)

mpl.title('Survived by Port of Embarkation', fontsize=20)
labels = [ el + "
(S = Southampton)" if el == "S" else ( el + "
(C = Cherbourg)" if el == "C" else el + "
(Q = Queenstown)") for el in survived["Embarked"]] 
mpl.xticks(np.arange(len(survived["Embarked"])) + width/2.,labels, fontsize=16)


for ind in indexes:
    ind = int(ind)
    x = ind + width / 2.
    y = survived["count"][ind] + died["count"][ind] + 10 
    percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
    mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')


mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

mpl.show()

所以Cherbourg上船的人存活率巨高……我个人不太明白为什么

第二步：模型构建

在Embarked列中有一些缺值，我们补全一下

train["Embarked"] = train["Embarked"].apply(lambda x: x if x != '' else "S")
port_of_embarkation = train["Embarked"]
port_of_embarkation.tail(1)
port_of_embarkation.show()

在训练集中再取80%来训练模型，20%来验证模型。

train_set, test_set = train.random_split(0.8, seed=4)
print "Rows for training:", train_set.num_rows()
print "Rows for testing:", test_set.num_rows()

试一下 gradient boosted tree 这个模型

model_4 = gl.boosted_trees_regression.create(train_set,target='Survived', 
                                  features=['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Fare'])
result_4 = model_4.evaluate(test_set)

print result_4


下面是训练过程

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

Boosted trees regression:
--------------------------------------------------------
Number of examples          : 663
Number of features          : 7
Number of unpacked features : 7
+-----------+--------------+--------------------+----------------------+---------------+-----------------+
| Iteration | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
+-----------+--------------+--------------------+----------------------+---------------+-----------------+
| 1         | 0.094067     | 0.640132           | 0.640625             | 0.413718      | 0.452089        |
| 2         | 0.095066     | 0.736341           | 0.741699             | 0.361147      | 0.435120        |
| 3         | 0.097067     | 0.799792           | 0.795940             | 0.326181      | 0.414205        |
| 4         | 0.098068     | 0.843834           | 0.853179             | 0.300373      | 0.418672        |
| 5         | 0.099068     | 0.866550           | 0.875894             | 0.284084      | 0.414071        |
| 6         | 0.100069     | 0.886572           | 0.895917             | 0.268531      | 0.401603        |
+-----------+--------------+--------------------+----------------------+---------------+-----------------+
{'max_error': 0.9767722487449646, 'rmse': 0.3790493668897309}

三、导入测试集进行预测

test = graphlab.SFrame.read_csv('test.csv')
model_4.predict(test)
dtype: float
Rows: 418
[0.24605900049209595, 0.1579868197441101, 0.09492728114128113, 0.08076220750808716, 0.820347249507904, 0.13742545247077942, 0.46745458245277405, 0.08334535360336304, 0.6385629177093506, 0.053301453590393066, 0.7933655977249146, 0.10734456777572632, 0.9794546365737915, 0.0696893036365509, 0.9803991913795471, 0.9651352167129517, 0.08926722407341003, 0.32400867342948914, 0.8363758325576782, 0.1579868197441101, 0.4781973361968994, 0.6420668363571167, 0.4161583185195923, 0.28341546654701233, 0.9170076847076416, 0.0696893036365509, 0.9794546365737915, 0.17743894457817078, 0.5841416120529175, 0.7112432718276978, 0.0696893036365509, 0.09834089875221252, 0.7118383646011353, 0.36395323276519775, 0.47720423340797424, 0.2933708429336548, 0.4699748754501343, 0.16753268241882324, 0.0941736102104187, 0.5083406567573547, 0.2918650507926941, 0.7348397970199585, 0.10613331198692322, 0.9710206985473633, 0.9803991913795471, 0.14958679676055908, 0.42003297805786133, 0.5664023756980896, 0.9672679901123047, 0.7332731485366821, 0.5267215967178345, 0.1717779040336609, 0.9495010375976562, 0.9067643880844116, 0.8308284282684326, 0.05739110708236694, 0.08792659640312195, 0.11708483099937439, 0.8308284282684326, 0.9750292301177979, 0.06759494543075562, 0.13685157895088196, 0.10684752464294434, 0.7940642237663269, 0.1582772135734558, 0.7426018714904785, 0.7501979470252991, 0.1021573543548584, 0.2818759083747864, 0.8806270360946655, 0.7940642237663269, 0.06759494543075562, 0.7951251268386841, 0.2818759083747864, 0.9750292301177979, 0.28711044788360596, 0.8174170255661011, 0.9488879442214966, 0.13685157895088196, 0.7940642237663269, 0.893699049949646, 0.04857367277145386, 0.20609065890312195, 0.7933655977249146, 0.6543059349060059, 0.8308284282684326, 0.9047337770462036, 0.16753268241882324, 0.8481748104095459, 0.9108253717422485, 0.5572522878646851, 0.7125066518783569, 0.35652855038642883, 0.8174170255661011, 0.28670477867126465, 0.28864753246307373, 0.9726588726043701, 0.16057392954826355, 0.70356285572052, 0.1119779646396637, ... ]