# pandas 学习之 DataFrame 结构：Titanic 示例

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
# Load the Titanic training data and take a first look at it.
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())
print(titanic_survival.shape)

# Inspect the missing values of the "Age" column.
age = titanic_survival["Age"]
# NOTE: .loc slices by label and includes both endpoints, so this prints the
# entries labelled 0 through 10 (eleven rows), not ten.
print(age.loc[0:10])
age_is_null = pd.isnull(age)  # boolean mask: True where the age is missing
print(age_is_null)
age_null_true = age[age_is_null]  # the entries whose age is missing
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)  # number of missing ages

# Computing the mean by hand requires removing the missing values first.
# `~mask` is the idiomatic negation of a boolean Series (instead of `== False`).
good_age = titanic_survival["Age"][~age_is_null]
print(good_age)  # only the ages that are present
# Series.sum() adds the values natively instead of iterating in Python.
print(good_age.sum() / len(good_age))  # mean age, computed manually
print(titanic_survival["Age"].mean())  # .mean() gives the mean age directly
# Mean ticket fare for each passenger class.
passenger_class = [1, 2, 3]
# Map every class number to the mean "Fare" of the rows belonging to it.
fares_by_class = {
    pclass: titanic_survival.loc[titanic_survival["Pclass"] == pclass, "Fare"].mean()
    for pclass in passenger_class
}
print(fares_by_class)
# Quick summary statistics with pivot_table.
# The aggregations are named with string aliases ("mean", "sum") rather than
# NumPy callables: recent pandas releases emit a FutureWarning when np.mean /
# np.sum are passed as aggfunc, and the aliases select the same aggregation.

# Mean of "Survived" per passenger class (the survival rate of each class,
# assuming "Survived" is a 0/1 indicator).
passenger_survival = titanic_survival.pivot_table(
    index="Pclass",
    values="Survived",
    aggfunc="mean",
)
print(passenger_survival)

# Mean age per passenger class; aggfunc is omitted, so the default (mean) is used.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

# Per port of embarkation: the summed "Fare" and the summed "Survived" values.
port_starts = titanic_survival.pivot_table(
    index="Embarked",
    values=["Fare", "Survived"],
    aggfunc="sum",
)
print(port_starts)
# Drop missing values with dropna(): every row lacking "Age" or "Sex" is removed.
new_titanic_survival = titanic_survival.dropna(subset=["Age", "Sex"])
print(new_titanic_survival)

# Look up a single value by its row label (83) and column name ("Age").
row_index_83_age = titanic_survival["Age"].loc[83]
print(row_index_83_age)

# Order the passengers by age, oldest first.
age_sort = titanic_survival.sort_values(by="Age", ascending=False)
print(age_sort[:10])  # first ten rows of the sorted frame; original labels are kept
# Discard the old labels so the index is renumbered in the sorted order.
titanic_reindexed = age_sort.reset_index(drop=True)
print(titanic_reindexed.loc[:10])  # label slice is inclusive: labels 0 through 10

# Sorting and re-indexing can also be chained into a single expression.
age_sort = (
    titanic_survival
    .sort_values(by="Age", ascending=False)
    .reset_index(drop=True)
)
# Custom functions used with DataFrame.apply().
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    With a default 0-based integer index this is the 100th entry. The lookup
    is label-based (``.loc``), so a column without label 99 raises ``KeyError``.
    """
    return column.loc[99]
# Apply hundredth_row to every column of the DataFrame, collecting the value
# stored under label 99 from each one. The result is bound to its own name so
# that the hundredth_row function is not overwritten by its own output, which
# would make this statement fail if it were executed a second time.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)  # the row labelled 99, one value per column

def not_null_count(column):
    """Return the number of missing (null/NaN) entries in ``column``.

    NOTE: despite its name, this function counts the entries that ARE null,
    not the ones that are present.
    """
    missing_mask = pd.isnull(column)
    # Each True in the mask counts as 1, so the sum is the number of nulls.
    return int(missing_mask.sum())
# Run not_null_count on each column (axis=0, the default for apply) to get the
# number of missing values per column.
column_null_coumt = titanic_survival.apply(not_null_count, axis=0)
print(column_null_coumt)

def generate_age_label(row):
    """Classify a passenger row by its "Age" value.

    Returns "unknown" when the age is missing, "minor" when it is below 18,
    and "adult" otherwise.
    """
    passenger_age = row["Age"]
    # A missing age must be handled first: it cannot be compared against 18.
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"
# Label every passenger: with axis="columns" (same as axis=1) apply hands
# generate_age_label one row at a time.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
# Store the labels as a new column, then average "Survived" within each label
# group (pivot_table aggregates with the mean when aggfunc is not given).
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(
    values="Survived",
    index="age_labels",
)
print(age_group_survival)