pandas Basics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Create a Series and let pandas generate the index automatically

s = pd.Series([1,3,5,np.nan,6,8])
# inspect s
s
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
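If you want labels other than the default 0..n-1, you can pass an explicit index. A minimal sketch (the letter labels here are just for illustration):
s2 = pd.Series([1,3,5,np.nan,6,8], index=list('abcdef'))
s2['c']   # label-based access; returns 5.0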

Create a DataFrame

### Create a DataFrame by passing a NumPy array, optionally supplying your own index and column labels
dates = pd.date_range('2018-11-01',periods=7)
#### e.g. generate a time series of 7 dates starting at 2018-11-01; its dtype is datetime64[ns]
dates
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
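date_range also accepts a freq argument for other step sizes; a small sketch using two standard offset aliases:
pd.date_range('2018-11-01', periods=5, freq='B')   # business days only
pd.date_range('2018-11-01', periods=4, freq='W')   # weekly dates (week ending on Sunday)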
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns=list('ABCD'))
# standard-normal random data: 7 rows by 4 columns, matching the lengths of the index and the columns
df
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314
### A DataFrame can also be created from a dict
df2 = pd.DataFrame({"A":1,
                   "B":"20181101",
                   'C':np.array([3]*4,dtype='int32'),
                   'D':pd.Categorical(['test','train','test','train']),
                   "E":1.5},
                  )
df2
   A         B  C      D    E
0  1  20181101  3   test  1.5
1  1  20181101  3  train  1.5
2  1  20181101  3   test  1.5
3  1  20181101  3  train  1.5
df2.dtypes
### check the dtype of each column; other common dtypes include datetime and float
A       int64
B      object
C       int32
D    category
E     float64
dtype: object
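If a column was created with an unwanted dtype, it can be converted with astype; a minimal sketch on df2 (the target dtypes here are only examples):
df2['C'].astype('int64')          # convert a single column
df2.astype({'A': 'float64'})      # convert selected columns via a dict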

Viewing Data


# e.g. the first 5 rows
df.head()
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
# the last 4 rows
df.tail(4)
                   A         B         C         D
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314
# the DataFrame's row index
df.index
DatetimeIndex(['2018-11-01', '2018-11-02', '2018-11-03', '2018-11-04',
               '2018-11-05', '2018-11-06', '2018-11-07'],
              dtype='datetime64[ns]', freq='D')
# the DataFrame's column index
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')
# the underlying data: convert the DataFrame to a NumPy array
df.values
array([[ 2.19709382,  0.90891281, -0.64802911, -1.32554721],
       [ 0.35466158, -1.22424591, -0.50120854, -1.49017025],
       [-0.24583358, -1.04959585,  2.36622453,  0.6373212 ],
       [-0.6899396 ,  0.47128154, -1.41740143,  0.26890482],
       [-0.54804068, -0.84193368,  0.57312781, -1.05517487],
       [-0.6910726 ,  0.93301611,  1.85764662,  0.77552552],
       [ 0.46707509,  0.36240665,  2.31937488, -0.721314  ]])
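In newer pandas versions (0.24+), DataFrame.to_numpy() is the recommended way to get the underlying array; df.values still works, but to_numpy() makes the copy and dtype behaviour more explicit:
df.to_numpy()   # same data as df.values, returned as a single NumPy array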

Quick Statistics

# describe() computes summary statistics for the numeric columns of the DataFrame
df.describe()
              A         B         C         D
count  7.000000  7.000000  7.000000  7.000000
mean   0.120563 -0.062880  0.649962 -0.415779
std    1.031487  0.942664  1.553537  0.955789
min   -0.691073 -1.224246 -1.417401 -1.490170
25%   -0.618990 -0.945765 -0.574619 -1.190361
50%   -0.245834  0.362407  0.573128 -0.721314
75%    0.410868  0.690097  2.088511  0.453113
max    2.197094  0.933016  2.366225  0.775526
df2.describe()
### columns with other (non-numeric) data types are automatically excluded by describe()
         A    C    E
count  4.0  4.0  4.0
mean   1.0  3.0  1.5
std    0.0  0.0  0.0
min    1.0  3.0  1.5
25%    1.0  3.0  1.5
50%    1.0  3.0  1.5
75%    1.0  3.0  1.5
max    1.0  3.0  1.5
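To include the non-numeric columns as well, describe can take an include argument; a small sketch:
df2.describe(include='all')          # numeric and non-numeric columns together
df2.describe(include=['object'])     # only the object (string) columns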
### Transposing a DataFrame swaps the row and column indexes (and the data along with them)
df.T
2018-11-01 00:00:00 2018-11-02 00:00:00 2018-11-03 00:00:00 2018-11-04 00:00:00 2018-11-05 00:00:00 2018-11-06 00:00:00 2018-11-07 00:00:00
A 2.197094 0.354662 -0.245834 -0.689940 -0.548041 -0.691073 0.467075
B 0.908913 -1.224246 -1.049596 0.471282 -0.841934 0.933016 0.362407
C -0.648029 -0.501209 2.366225 -1.417401 0.573128 1.857647 2.319375
D -1.325547 -1.490170 0.637321 0.268905 -1.055175 0.775526 -0.721314
df
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375 -0.721314

Sorting

df.sort_index(ascending=False)
### sort the rows by the index, in descending order
                   A         B         C         D
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-06 -0.691073  0.933016  1.857647  0.775526
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
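sort_index can also reorder the columns instead of the rows by passing axis=1; a minimal sketch:
df.sort_index(axis=1, ascending=False)   # columns in the order D, C, B, A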

print(df.sort_values(by=['B','A']))
#  ascending by default; multiple columns can be given: sort by B first, and where B ties, break the tie on A
df.sort_values(by='B')
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-06 -0.691073  0.933016  1.857647  0.775526
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175
2018-11-07  0.467075  0.362407  2.319375 -0.721314
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-06 -0.691073  0.933016  1.857647  0.775526

Selecting Data (similar to SQL SELECT)

df['A']
# select a single column; equivalent to df.A
2018-11-01    2.197094
2018-11-02    0.354662
2018-11-03   -0.245834
2018-11-04   -0.689940
2018-11-05   -0.548041
2018-11-06   -0.691073
2018-11-07    0.467075
Freq: D, Name: A, dtype: float64
# slice rows with []
df[0:3]
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
# with a DatetimeIndex you can also slice directly with date strings, e.g.
df['2018-11-01':'2018-11-04']
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
2018-11-04 -0.689940  0.471282 -1.417401  0.268905
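With a DatetimeIndex, partial-string indexing also works, i.e. selecting by a coarser period such as a whole month; a small sketch:
df.loc['2018-11']   # every row whose index falls in November 2018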

Selection by Label


df.loc['2018-11-01']
A    2.197094
B    0.908913
C   -0.648029
D   -1.325547
Name: 2018-11-01 00:00:00, dtype: float64
#### select on multiple axes by label
df.loc[:,["A","B"]] # 等价于df[["A","B"]]
                   A         B
2018-11-01  2.197094  0.908913
2018-11-02  0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
2018-11-04 -0.689940  0.471282
2018-11-05 -0.548041 -0.841934
2018-11-06 -0.691073  0.933016
2018-11-07  0.467075  0.362407
df.loc["2018-11-01":"2018-11-03",["A","B"]]
                   A         B
2018-11-01  2.197094  0.908913
2018-11-02  0.354662 -1.224246
2018-11-03 -0.245834 -1.049596
#### get a single scalar value
df.loc['2018-11-01','A']
2.1970938156943904
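For repeated scalar access by label, at is a faster equivalent of loc; a minimal sketch (using an explicit Timestamp to match the index exactly):
df.at[pd.Timestamp('2018-11-01'), 'A']   # same value as df.loc['2018-11-01','A']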

Selection by Position

df.iloc[3]  # the fourth row
A   -0.689940
B    0.471282
C   -1.417401
D    0.268905
Name: 2018-11-04 00:00:00, dtype: float64
df.iloc[1:3,1:4]  # slicing works like a NumPy ndarray
                   B         C         D
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596  2.366225  0.637321
# non-contiguous rows and columns can be selected with position lists
df.iloc[[1,3],[1,3]]
                   B         D
2018-11-02 -1.224246 -1.490170
2018-11-04  0.471282  0.268905
#  slice rows only
df.iloc[1:3,:]
                   A         B         C         D
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-03 -0.245834 -1.049596  2.366225  0.637321
# slice columns only
df.iloc[:,1:4]
                   B         C         D
2018-11-01  0.908913 -0.648029 -1.325547
2018-11-02 -1.224246 -0.501209 -1.490170
2018-11-03 -1.049596  2.366225  0.637321
2018-11-04  0.471282 -1.417401  0.268905
2018-11-05 -0.841934  0.573128 -1.055175
2018-11-06  0.933016  1.857647  0.775526
2018-11-07  0.362407  2.319375 -0.721314
# get a specific scalar value
df.iloc[1,3]
-1.4901702546027098
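The positional counterpart for fast scalar access is iat:
df.iat[1, 3]   # same value as df.iloc[1, 3]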

Boolean Indexing

# filter rows using a single column as the condition
df[df.A>0]
                   A         B         C         D
2018-11-01  2.197094  0.908913 -0.648029 -1.325547
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170
2018-11-07  0.467075  0.362407  2.319375 -0.721314
# rarely used: a condition on the whole DataFrame keeps its shape and turns non-matching values into NaN
df[df>0]
                   A         B         C         D
2018-11-01  2.197094  0.908913       NaN       NaN
2018-11-02  0.354662       NaN       NaN       NaN
2018-11-03       NaN       NaN  2.366225  0.637321
2018-11-04       NaN  0.471282       NaN  0.268905
2018-11-05       NaN       NaN  0.573128       NaN
2018-11-06       NaN  0.933016  1.857647  0.775526
2018-11-07  0.467075  0.362407  2.319375       NaN
# filter with the isin() method
df2.head()
   A         B  C      D    E
0  1  20181101  3   test  1.5
1  1  20181101  3  train  1.5
2  1  20181101  3   test  1.5
3  1  20181101  3  train  1.5
df2[df2['D'].isin(['test'])]
   A         B  C     D    E
0  1  20181101  3  test  1.5
2  1  20181101  3  test  1.5
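Several boolean conditions can be combined with & (and), | (or) and ~ (not); each condition needs its own parentheses. A small sketch:
df[(df['A'] > 0) & (df['B'] > 0)]    # rows where both A and B are positive
df2[~df2['D'].isin(['test'])]        # rows whose D value is NOT 'test'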

Setting Values (similar to SQL UPDATE or INSERT)

  • Add a new column
df['E'] = [1,2,3,4,5,6,7]
df
                   A         B         C         D  E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547  1
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  2
2018-11-03 -0.245834 -1.049596  2.366225  0.637321  3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  5
2018-11-06 -0.691073  0.933016  1.857647  0.775526  6
2018-11-07  0.467075  0.362407  2.319375 -0.721314  7
  • Set a value by label
df.loc['2018-11-01','E'] = 10  # set column E of the first row to 10
df
                   A         B         C         D   E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547  10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170   2
2018-11-03 -0.245834 -1.049596  2.366225  0.637321   3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905   4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175   5
2018-11-06 -0.691073  0.933016  1.857647  0.775526   6
2018-11-07  0.467075  0.362407  2.319375 -0.721314   7
df.iloc[1,4] = 5000  # set the value in row 2, column 5 to 5000
df
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547    10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  5000
2018-11-03 -0.245834 -1.049596  2.366225  0.637321     3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905     4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175     5
2018-11-06 -0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375 -0.721314     7
df3 = df.copy()
df3[df3<0] = -df3
df3  # every value is now non-negative
                   A         B         C         D     E
2018-11-01  2.197094  0.908913  0.648029  1.325547    10
2018-11-02  0.354662  1.224246  0.501209  1.490170  5000
2018-11-03  0.245834  1.049596  2.366225  0.637321     3
2018-11-04  0.689940  0.471282  1.417401  0.268905     4
2018-11-05  0.548041  0.841934  0.573128  1.055175     5
2018-11-06  0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375  0.721314     7
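The same result (all values made non-negative) can also be obtained directly with abs():
df.abs()   # element-wise absolute value, equivalent to the assignment above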

Handling Missing Data

df
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029 -1.325547    10
2018-11-02  0.354662 -1.224246 -0.501209 -1.490170  5000
2018-11-03 -0.245834 -1.049596  2.366225  0.637321     3
2018-11-04 -0.689940  0.471282 -1.417401  0.268905     4
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175     5
2018-11-06 -0.691073  0.933016  1.857647  0.775526     6
2018-11-07  0.467075  0.362407  2.319375 -0.721314     7
df['E']=[1,np.nan,2,np.nan,4,np.nan,6]
df.loc['2018-11-01':'2018-11-03','D']=np.nan
df
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
  • Drop rows containing missing values
df4 = df.copy()
df4.dropna(how='any')
                   A         B         C         D    E
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
df4.dropna(how='all')
# """DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)""" 
# aixs 轴0或者1 index或者columns
# how 方式
# thresh 超过阈值个数的缺失值
# subset 那些字段的处理
# inplace 是否直接在原数据框中的替换
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
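A small sketch of the thresh and subset parameters mentioned above (the threshold and column chosen here are only examples):
df4.dropna(thresh=4)        # keep rows that have at least 4 non-missing values
df4.dropna(subset=['E'])    # drop rows only when column E is missing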
  • Fill in missing values
df4.fillna(1000)
                   A         B         C            D       E
2018-11-01  2.197094  0.908913 -0.648029  1000.000000     1.0
2018-11-02  0.354662 -1.224246 -0.501209  1000.000000  1000.0
2018-11-03 -0.245834 -1.049596  2.366225  1000.000000     2.0
2018-11-04 -0.689940  0.471282 -1.417401     0.268905  1000.0
2018-11-05 -0.548041 -0.841934  0.573128    -1.055175     4.0
2018-11-06 -0.691073  0.933016  1.857647     0.775526  1000.0
2018-11-07  0.467075  0.362407  2.319375    -0.721314     6.0
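fillna also accepts a per-column dict, and forward filling is available as well; a minimal sketch (the fill values are arbitrary examples):
df4.fillna({'D': 0, 'E': df4['E'].mean()})   # a different fill value per column
df4.ffill()                                  # propagate the last valid value forward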
  • Get a boolean mask marking where values are missing
pd.isnull(df4)
                A      B      C      D      E
2018-11-01  False  False  False   True  False
2018-11-02  False  False  False   True   True
2018-11-03  False  False  False   True  False
2018-11-04  False  False  False  False   True
2018-11-05  False  False  False  False  False
2018-11-06  False  False  False  False   True
2018-11-07  False  False  False  False  False
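A common follow-up is counting the missing values per column by summing the boolean mask:
df4.isnull().sum()   # number of NaN values in each column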

Operations

# statistics generally exclude missing values
df4.mean()
# by default the mean is computed per column, i.e. down the rows (axis=0)
A    0.120563
B   -0.062880
C    0.649962
D   -0.183015
E    3.250000
dtype: float64
df4.mean(axis=1)
# the mean of each row, computed across the columns (axis=1)
2018-11-01    0.864494
2018-11-02   -0.456931
2018-11-03    0.767699
2018-11-04   -0.341789
2018-11-05    0.425596
2018-11-06    0.718779
2018-11-07    1.685509
Freq: D, dtype: float64
# Operating on objects of different dimensionality that need to be aligned:
# pandas automatically broadcasts along the specified dimension.
s = pd.Series([1,3,4,np.nan,6,7,8],index=dates)
s
2018-11-01    1.0
2018-11-02    3.0
2018-11-03    4.0
2018-11-04    NaN
2018-11-05    6.0
2018-11-06    7.0
2018-11-07    8.0
Freq: D, dtype: float64
df4.sub(s,axis='index')
                   A         B         C         D     E
2018-11-01  1.197094 -0.091087 -1.648029       NaN   0.0
2018-11-02 -2.645338 -4.224246 -3.501209       NaN   NaN
2018-11-03 -4.245834 -5.049596 -1.633775       NaN  -2.0
2018-11-04       NaN       NaN       NaN       NaN   NaN
2018-11-05 -6.548041 -6.841934 -5.426872 -7.055175  -2.0
2018-11-06 -7.691073 -6.066984 -5.142353 -6.224474   NaN
2018-11-07 -7.532925 -7.637593 -5.680625 -8.721314  -2.0
df4
                   A         B         C         D    E
2018-11-01  2.197094  0.908913 -0.648029       NaN  1.0
2018-11-02  0.354662 -1.224246 -0.501209       NaN  NaN
2018-11-03 -0.245834 -1.049596  2.366225       NaN  2.0
2018-11-04 -0.689940  0.471282 -1.417401  0.268905  NaN
2018-11-05 -0.548041 -0.841934  0.573128 -1.055175  4.0
2018-11-06 -0.691073  0.933016  1.857647  0.775526  NaN
2018-11-07  0.467075  0.362407  2.319375 -0.721314  6.0
df4.apply(np.cumsum)
                   A         B         C         D     E
2018-11-01  2.197094  0.908913 -0.648029       NaN   1.0
2018-11-02  2.551755 -0.315333 -1.149238       NaN   NaN
2018-11-03  2.305922 -1.364929  1.216987       NaN   3.0
2018-11-04  1.615982 -0.893647 -0.200415  0.268905   NaN
2018-11-05  1.067942 -1.735581  0.372713 -0.786270   7.0
2018-11-06  0.376869 -0.802565  2.230360 -0.010745   NaN
2018-11-07  0.843944 -0.440158  4.549735 -0.732059  13.0
df4.apply(lambda x: x.max()-x.min())
A    2.888166
B    2.157262
C    3.783626
D    1.830700
E    5.000000
dtype: float64
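Closely related to apply: several aggregations can be computed at once with agg; a small sketch:
df4.agg(['mean', 'max', 'min'])   # one row of results per aggregation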

Value Counts and Discretization

s = pd.Series(np.random.randint(0,7,size=15))
s
0     1
1     6
2     3
3     1
4     1
5     0
6     4
7     1
8     3
9     4
10    6
11    1
12    4
13    3
14    5
dtype: int32
s.value_counts()
# count how many times each value occurs, sorted by count; values that never appear are not listed
1    5
4    3
3    3
6    2
5    1
0    1
dtype: int64
s.reindex(range(0,7))
# note: reindex returns the values of s at labels 0-6, not counts; to list the counts in a fixed order, reindex the result of value_counts() instead, e.g. s.value_counts().reindex(range(0,7))
0    1
1    6
2    3
3    1
4    1
5    0
6    4
dtype: int32
s.mode()
#  the mode (most frequent value)
0    1
dtype: int32
  • Discretization
# Convert continuous values into discrete bins: cut() builds bins based on the values themselves,
# while qcut() builds bins based on sample quantiles.
arr = np.random.randint(0,20,size=15)  # uniformly distributed random integers in [0, 20)
arr
array([ 3, 14, 10,  2,  2,  0, 17, 13,  7,  0, 15, 14,  4, 19,  9])
factor = pd.cut(arr,3)
factor
[(-0.019, 6.333], (12.667, 19.0], (6.333, 12.667], (-0.019, 6.333], (-0.019, 6.333], ..., (12.667, 19.0], (12.667, 19.0], (-0.019, 6.333], (12.667, 19.0], (6.333, 12.667]]
Length: 15
Categories (3, interval[float64]): [(-0.019, 6.333] < (6.333, 12.667] < (12.667, 19.0]]
pd.value_counts(factor)
(12.667, 19.0]     6
(-0.019, 6.333]    6
(6.333, 12.667]    3
dtype: int64
factor1 = pd.cut(arr,[-1,5,10,15,20])
pd.value_counts(factor1)
(-1, 5]     6
(10, 15]    4
(5, 10]     3
(15, 20]    2
dtype: int64
factor2 = pd.qcut(arr,[0,0.25,0.5,0.75,1])
pd.value_counts(factor2)
(9.0, 14.0]      4
(2.5, 9.0]       4
(-0.001, 2.5]    4
(14.0, 19.0]     3
dtype: int64
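qcut can likewise return labels or plain integer bin codes; a small sketch:
pd.qcut(arr, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])   # quartile labels
pd.qcut(arr, 4, labels=False)                      # integer codes 0-3 instead of intervals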
Original article (in Chinese): https://www.cnblogs.com/onemorepoint/p/9979725.html