十分钟Pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a Series from a list; np.nan marks a missing entry.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# Build a DataFrame from a random 6x4 array, indexed by six consecutive days.
dates = pd.date_range('20200101', periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)

# Build a DataFrame from a dict of column-like objects. Scalars ('A', 'B',
# 'F') are broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

查看

# The three building blocks of a DataFrame.
df.index    # row labels
df.columns  # column labels
df.values   # underlying data as a NumPy array

df.describe()  # quick summary statistics

df.transpose()  # swap rows and columns

# Sorting
df.sort_index(axis="columns", ascending=False)  # order by labels along an axis
df.sort_values("B")                             # order rows by the values in column B

选择

df['A']        # select a single column
df[['A', 'B']] # select several columns

# Slice rows, by position or by index label
df[0: 3]
df['20200101': '20200103']

# Select by label
df.loc[dates[0]]

# Select on several axes by label
df.loc[:, ['A', 'B']]

# Label slicing (both endpoints are included)
df.loc['20200101': '20200103', ['A', 'B']]
df.loc['20200101', ['A', 'B']]

# Fetch a single scalar value
df.loc[dates[0], 'A']
df.at[dates[0], 'A']

# Select by integer position
df.iloc[3]               # one row
df.iloc[3:5,0:2]         # row/column slices
df.iloc[[1,2,4],[0,2]]   # explicit position lists
df.iloc[1: 3, :]         # row slice
df.iloc[:, 1: 3]         # column slice
df.iloc[1, 1]            # one specific value
df.iat[1, 1]             # same as above

# Boolean indexing
df[df.A > 0]
df[df > 0]

# Filtering with isin().
# The string column is added to a copy rather than to df itself: putting
# text into df would make the later numeric operations on df (such as
# `df > 0`) raise TypeError.
df_labeled = df.copy()
df_labeled['E'] = ['one', 'one','two','three','four','three']
df_labeled[df_labeled['E'].isin(['one', 'two'])]

增加

# Assigning a Series as a new column aligns it on the index.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range('20200102', periods=6),
)
df['F'] = s1

# Set a value by label
df.at[dates[0], 'A'] = 0
# Set a value by position
df.iat[0, 1] = 0
# Set a whole column from a NumPy array
df.loc[:, 'D'] = np.full(len(df), 5)

# Set through a boolean mask: every positive entry is replaced by its negation
df[df.gt(0)] = -df

缺失值的处理

# reindex() changes/adds/removes the labels along the given axes and returns
# a copy of the original data.
# Fix: the attribute is `df.columns`; the misspelled `df.coulmns` raised
# AttributeError. 'E' is appended exactly once, so the reindexed frame never
# ends up with a duplicated 'E' label even if df already has that column.
df1 = df.reindex(
    index=dates[0:4],
    columns=[col for col in df.columns if col != "E"] + ["E"],
)
df1.loc[dates[0]: dates[1], 'E'] = 1


df1.dropna(how='any')    # drop every row that contains a missing value
df1.fillna(value=5)      # replace missing values with 5
pd.isnull(df1)           # boolean mask marking the missing values

统计

# Descriptive statistics
df.mean()        # mean of every column
df.mean(axis=1)  # mean of every row

# A date-indexed Series shifted down by two positions
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

# Apply a function to the data, column by column
df.apply(np.cumsum)
df.apply(lambda col: col.max() - col.min())

# Concatenate: cut the frame into row slices and stitch them back together
pieces = [df[start:stop] for start, stop in ((None, 3), (3, 7), (7, None))]
pd.concat(pieces)

# Append a row to a DataFrame.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
s = df.iloc[3]
# DataFrame.append() was removed in pandas 2.0. The replacement is to turn
# the row Series into a one-row frame and concatenate it; ignore_index=True
# renumbers the result 0..n-1 just as append(..., ignore_index=True) did.
pd.concat([df, s.to_frame().T], ignore_index=True)

Join

# Two small frames that share a 'key' column.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

# SQL-style join on 'key': every left row is paired with every matching right row.
left.merge(right, on='key')

分组合并

# Sample data: two label columns (A, B) and two random numeric columns (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

# Group by one key and sum each group
df.groupby(by='A').sum()
# Group by several keys; the result carries a hierarchical index
df.groupby(by=['A', 'B']).sum()

透视表

# Sample data: three label columns (A, B, C) and two random numeric columns (D, E).
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})

# Pivot: rows are (A, B) pairs, columns are the values of C, cells aggregate D.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

时间序列

# Resampling: 100 one-second samples summed into 5-minute bins.
# The canonical lowercase aliases 's' and '5min' are used; the uppercase
# 'S' alias is deprecated in recent pandas releases.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5min').sum()

# Time zone representation: attach UTC to a naive daily series
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
# Time zone conversion: express the same instants in another zone
ts_utc.tz_convert('US/Eastern')

Categorical

画图

# Make the charts look nicer.
# The pandas option 'display.mpl_style' no longer exists (set_option raises
# OptionError for it); matplotlib style sheets are the replacement.
# NOTE(review): 'ggplot' is chosen as the closest built-in look — confirm the preferred style.
plt.style.use('ggplot')

持久化数据

# Options passed to read_csv:
#   sep         - field delimiter
#   encoding    - text encoding of the file
#   parse_dates - columns to parse as dates
#   dayfirst    - read ambiguous dates as day/month rather than month/day
#   index_col   - column to use as the index
csv_options = dict(
    sep=";",
    encoding="utf-8",
    parse_dates=["Date"],
    dayfirst=True,
    index_col="Date",
)
df = pd.read_csv("data.csv", **csv_options)