# Machine learning projects in practice

#机器学习项目实战1-泰坦尼克号获救预测

# 1-1 Load the data with pandas
import pandas as pd

# Use the fully qualified "display.*" option names. The abbreviated patterns
# "max_columns" / "max_rows" also match the "styler.render.*" options in newer
# pandas releases, which makes set_option raise OptionError (pattern matched
# multiple keys).
pd.set_option("display.max_columns", 1000)  # show up to 1000 columns
pd.set_option("display.max_rows", 1000)     # show up to 1000 rows
data = pd.read_csv("train.csv")
print(data.head())  # print the first five rows

# 1-2 Initial data exploration
print(data.describe())  # summary statistics (numeric columns by default)

# 1-3 Data preprocessing
# Step 1: fill missing values (mean or median are the usual choices).
data["Age"] = data["Age"].fillna(data["Age"].median())  # fill with the median
print(data.describe())

# Step 2: encode string/categorical columns as numbers.
# Series.map rebuilds each column as a numeric column in a single pass,
# instead of writing ints piecewise into a string column (which leaves a
# mixed-type column behind). Any label missing from a mapping becomes NaN.
print(data["Sex"].unique())  # inspect the distinct categories
data["Sex"] = data["Sex"].map({"male": 0, "female": 1})
print(data["Sex"].unique())

print(data["Embarked"].unique())        # inspect the distinct categories
print(data["Embarked"].value_counts())  # count the rows per category
# Missing ports are filled with "S" -- presumably the most frequent value
# reported by value_counts() above; verify against the data.
data["Embarked"] = data["Embarked"].fillna("S")
data["Embarked"] = data["Embarked"].map({"S": 0, "C": 1, "Q": 2})
print(data["Embarked"].unique())

# 1-4 Build the machine learning model

from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.model_selection import KFold  # K-fold cross-validation splitter

# Feature columns used as model inputs
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# NOTE(review): kf is not passed to cross_val_score below, which uses cv=5.
kf = KFold(n_splits=3, random_state=1, shuffle=True)
x = data[predictors]  # input features
y = data["Survived"]  # prediction target
print(x)
print(y)

# Evaluate the model with cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# The features are not scaled, so the solver tends to hit the default
# iteration limit before converging; give it more iterations.
alg = LogisticRegression(max_iter=1000)
scores = cross_val_score(alg, x, y, cv=5)
score = np.mean(scores)  # mean score across the folds
print(score)

# Feature selection: univariate importance / relevance analysis
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

# Score every predictor against the target with f_classif.
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(x, y)

# Plot -log10(p-value) per feature: a taller bar means a smaller p-value.
s = -np.log10(selector.pvalues_)
tick_positions = range(len(predictors))
plt.bar(tick_positions, s)
plt.xticks(tick_positions, predictors)
plt.show()

# Method 2: feature importances from an ensemble (gradient boosting) model
from sklearn import ensemble

# "loss" is deliberately left at its default (least squares): the old spelling
# "ls" was removed in newer scikit-learn releases, while the new spelling
# "squared_error" is unknown to older ones. The default works on both.
params = {
    "n_estimators": 800,
    "max_depth": 4,
    "min_samples_split": 2,
    "learning_rate": 0.01,
}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(x, y)

# Rescale the importances so that the strongest feature is 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Horizontal bar chart, sorted from least to most important.
sort_index = np.argsort(feature_importance)
pos = np.arange(sort_index.shape[0]) + 0.5
plt.barh(pos, feature_importance[sort_index], align="center")
# Label every bar with its feature name (bars are in sorted order).
plt.yticks(pos, [predictors[j] for j in sort_index])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
print(sort_index)

# Vertical bar chart in the original predictor order.
plt.bar(range(len(predictors)), feature_importance)
plt.xticks(range(len(predictors)), predictors)
plt.show()

# One-hot encode the unordered categorical features.
# pandas approach -- simple and convenient.
f = ["Sex", "Embarked"]
for i in f:
    # Prefix the dummy columns with the source column name. Without it the
    # integer-coded categories produce columns named 0/1 and 0/1/2, so the
    # concatenated frame ends up with duplicate column labels.
    temp = pd.get_dummies(data[i], prefix=i)
    data = pd.concat([data, temp], axis=1)  # append the encoded columns
    data = data.drop([i], axis=1)  # drop the original column
print(data.shape)
print(data.head())
# Disabled alternative: the same one-hot encoding of "Embarked" written with
# scikit-learn's LabelEncoder and OneHotEncoder. It lives inside a bare string
# literal, so it is never executed.
# NOTE(review): the pandas loop above drops the "Embarked" column, so this
# snippet would need the original column back before it could be re-enabled.
'''
#sklearn方法
#先将文本信息进行分列编码
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
la1=LabelEncoder()
la1.fit(list(data["Embarked"].values))
data["Embarked"]=la1.transform(list(data["Embarked"].values))
print(data["Embarked"].head())
df=OneHotEncoder().fit_transform(data["Embarked"].values.reshape(-1,1)).toarray()
df=pd.DataFrame(df)
data=pd.concat([data,df],axis=1) #将转换的列合并
data=data.drop(["Embarked"],axis=1)  #删掉之前的变量
print(data.head())
'''