数据清洗、合并、转化和重构

数据清洗

  • 数据清洗是数据分析关键的一步,直接影响之后的处理工作

  • 数据需要修改吗?有什么需要修改的吗?数据应该怎么调整才能适用于接下来的分析和挖掘?

  • 是一个迭代的过程,实际项目中可能需要不止一次地执行这些清洗操作

  • 处理缺失数据:pd.fillna(),pd.dropna()

数据连接(pd.merge)

  • pd.merge

  • 根据单个或多个键将不同DataFrame的行连接起来

  • 类似数据库的连接操作

import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

print(df_obj1)
print(df_obj2)

1. 默认将重叠列的列名作为“外键”进行连接

# 默认将重叠列的列名作为“外键”进行连接
print(pd.merge(df_obj1, df_obj2))

2. on显示指定“外键”

# on显示指定“外键”
print(pd.merge(df_obj1, df_obj2, on='key'))

3. left_on,左侧数据的“外键”,right_on,右侧数据的“外键”

# left_on,right_on分别指定左侧数据和右侧数据的“外键”

# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})

print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2'))

默认是“内连接”(inner),即结果中的键是交集

how指定连接方式

4. “外连接”(outer),结果中的键是并集

# “外连接”
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer'))

5. “左连接”(left)

# 左连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left'))

6. “右连接”(right)

 右连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))

整体代码:

import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2': np.random.randint(0, 10, 3)})

print(df_obj1)
print(df_obj2)
# 默认将重叠列的列名作为“外键”进行连接
print(pd.merge(df_obj1, df_obj2))
# on显示指定“外键”
print(pd.merge(df_obj1, df_obj2, on='key'))
# left_on,right_on分别指定左侧数据和右侧数据的“外键”

# 更改列名
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})

print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2'))
# “外连接”
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer'))
# 左连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left'))
# 右连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))

效果

  key  data1
0   b      0
1   b      3
2   a      8
3   c      9
4   a      4
5   a      4
6   b      2
  key  data2
0   a      7
1   b      0
2   d      1
  key  data1  data2
0   b      0      0
1   b      3      0
2   b      2      0
3   a      8      7
4   a      4      7
5   a      4      7
  key  data1  data2
0   b      0      0
1   b      3      0
2   b      2      0
3   a      8      7
4   a      4      7
5   a      4      7
  key1  data1 key2  data2
0    b      0    b      0
1    b      3    b      0
2    b      2    b      0
3    a      8    a      7
4    a      4    a      7
5    a      4    a      7
  key1  data1 key2  data2
0    b    0.0    b    0.0
1    b    3.0    b    0.0
2    b    2.0    b    0.0
3    a    8.0    a    7.0
4    a    4.0    a    7.0
5    a    4.0    a    7.0
6    c    9.0  NaN    NaN
7  NaN    NaN    d    1.0
  key1  data1 key2  data2
0    b      0    b    0.0
1    b      3    b    0.0
2    a      8    a    7.0
3    c      9  NaN    NaN
4    a      4    a    7.0
5    a      4    a    7.0
6    b      2    b    0.0
  key1  data1 key2  data2
0    b    0.0    b      0
1    b    3.0    b      0
2    b    2.0    b      0
3    a    8.0    a      7
4    a    4.0    a      7
5    a    4.0    a      7
6  NaN    NaN    d      1

7. 处理重复列名

suffixes,默认为_x, _y

import pandas as pd
import numpy as np

# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

print(pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right')))

效果:

  key  data_left  data_right
0   b          0           4
1   b          2           4
2   b          6           4
3   a          2           4
4   a          1           4
5   a          7           4

8. 按索引连接

left_index=True或right_index=True

# 按索引连接
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])

print(pd.merge(df_obj1, df_obj2, left_on='key', right_index=True))

数据合并(pd.concat)

import numpy as np
import pandas as pd

arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))

print(arr1)
print(arr2)

print(np.concatenate([arr1, arr2]))
print(np.concatenate([arr1, arr2], axis=1))

效果:

  key  data1  data2
0   b      2      1
1   b      1      1
6   b      9      1
2   a      9      0
4   a      1      0
5   a      5      0

 

 

2. pd.concat

  • 注意指定轴方向,默认axis=0

  • join指定合并方式,默认为outer

  • Series合并时查看行索引有无重复

1) index 没有重复的情况
# index 没有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))

print(ser_obj1)
print(ser_obj2)
print(ser_obj3)

print(pd.concat([ser_obj1, ser_obj2, ser_obj3]))
print(pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1))

效果:

0    5
1    9
2    1
3    9
4    8
dtype: int32
5    2
6    1
7    0
8    8
dtype: int32
9     8
10    3
11    4
dtype: int32
0     5
1     9
2     1
3     9
4     8
5     2
6     1
7     0
8     8
9     8
10    3
11    4
dtype: int32
      0    1    2
0   5.0  NaN  NaN
1   9.0  NaN  NaN
2   1.0  NaN  NaN
3   9.0  NaN  NaN
4   8.0  NaN  NaN
5   NaN  2.0  NaN
6   NaN  1.0  NaN
7   NaN  0.0  NaN
8   NaN  8.0  NaN
9   NaN  NaN  8.0
10  NaN  NaN  3.0
11  NaN  NaN  4.0
2) index 有重复的情况
# index 有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))

print(ser_obj1)
print(ser_obj2)
print(ser_obj3)

print(pd.concat([ser_obj1, ser_obj2, ser_obj3]))

效果:

0    5
1    5
2    5
3    7
4    2
dtype: int32
0    0
1    7
2    5
3    0
dtype: int32
0    3
1    5
2    9
dtype: int32
0    5
1    5
2    5
3    7
4    2
0    0
1    7
2    5
3    0
0    3
1    5
2    9
dtype: int32
3) DataFrame合并时同时查看行索引和列索引有无重复
import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
                       columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
                       columns=['C', 'D'])
print(df_obj1)
print(df_obj2)

print(pd.concat([df_obj1, df_obj2],sort=False))
print(pd.concat([df_obj1, df_obj2], axis=1, join='inner'))

效果

   A  B
a  8  4
b  5  2
c  5  0
   C  D
a  2  8
b  8  4
     A    B    C    D
a  8.0  4.0  NaN  NaN
b  5.0  2.0  NaN  NaN
c  5.0  0.0  NaN  NaN
a  NaN  NaN  2.0  8.0
b  NaN  NaN  8.0  4.0
   A  B  C  D
a  8  4  2  8
b  5  2  8  4

数据重构

1. stack

  • 将列索引旋转为行索引,完成层级索引

  • DataFrame->Series

  • 沿轴方向将多个对象合并到一起

1. NumPy的concat

np.concatenate

import numpy as np
import pandas as pd

df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
print(df_obj)

stacked = df_obj.stack()
print(stacked)

效果:

   data1  data2
0      5      2
1      0      4
2      0      4
3      3      2
4      6      0
0  data1    5
   data2    2
1  data1    0
   data2    4
2  data1    0
   data2    4
3  data1    3
   data2    2
4  data1    6
   data2    0
dtype: int32

2. unstack

  • 将层级索引展开

  • Series->DataFrame

认操作内层索引,即level=-1

import numpy as np
import pandas as pd

df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
print(df_obj)

stacked = df_obj.stack()

# 默认操作内层索引
print(stacked.unstack())

# 通过level指定操作索引的级别
print(stacked.unstack(level=0))

效果:

   data1  data2
0      9      8
1      8      5
2      1      7
3      5      8
4      9      9
   data1  data2
0      9      8
1      8      5
2      1      7
3      5      8
4      9      9
       0  1  2  3  4
data1  9  8  1  5  9
data2  8  5  7  8  9

数据转换

一、 处理重复数据

1 duplicated() 返回布尔型Series表示每行是否为重复行

import numpy as np
import pandas as pd

df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
                       'data2' : np.random.randint(0, 4, 8)})
print(df_obj)

print(df_obj.duplicated())

2 drop_duplicates() 过滤重复行

默认判断全部列

可指定按某些列判断

print(df_obj.drop_duplicates())
print(df_obj.drop_duplicates('data2'))

3. 根据map传入的函数对每行或每列进行转换

  • Series根据map传入的函数对每行或每列进行转换
ser_obj = pd.Series(np.random.randint(0,10,10))
print(ser_obj)

print(ser_obj.map(lambda x : x ** 2))

二、数据替换

replace根据值的内容进行替换

# 单个值替换单个值
print(ser_obj.replace(1, -100))

# 多个值替换一个值
print(ser_obj.replace([6, 8], -100))

# 多个值替换多个值
print(ser_obj.replace([4, 7], [-100, -200]))

 整体代码

import numpy as np
import pandas as pd

df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
                       'data2' : np.random.randint(0, 4, 8)})

print("df_obj")
print(df_obj)
print("df_obj.duplicated()")
print(df_obj.duplicated())
print("df_obj.drop_duplicates()")
print(df_obj.drop_duplicates())
print("df_obj.drop_duplicates('data2')")

print(df_obj.drop_duplicates('data2'))

ser_obj = pd.Series(np.random.randint(0,10,10))
print("ser_obj")
print(ser_obj)
print("ser_obj.map(lambda x : x ** 2")
print(ser_obj.map(lambda x : x ** 2))
print("ser_obj.replace(1, -100)")
# 单个值替换单个值
print(ser_obj.replace(1, -100))
print("ser_obj.replace([6, 8], -100)")
# 多个值替换一个值
print(ser_obj.replace([6, 8], -100))
print("ser_obj.replace([4, 7], [-100, -200])")
# 多个值替换多个值
print(ser_obj.replace([4, 7], [-100, -200]))

效果

df_obj
  data1  data2
0     a      2
1     a      2
2     a      2
3     a      1
4     b      3
5     b      3
6     b      0
7     b      0
df_obj.duplicated()
0    False
1     True
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool
df_obj.drop_duplicates()
  data1  data2
0     a      2
3     a      1
4     b      3
6     b      0
df_obj.drop_duplicates('data2')
  data1  data2
0     a      2
3     a      1
4     b      3
6     b      0
ser_obj
0    1
1    2
2    3
3    8
4    8
5    9
6    5
7    2
8    6
9    6
dtype: int32
ser_obj.map(lambda x : x ** 2
0     1
1     4
2     9
3    64
4    64
5    81
6    25
7     4
8    36
9    36
dtype: int64
ser_obj.replace(1, -100)
0   -100
1      2
2      3
3      8
4      8
5      9
6      5
7      2
8      6
9      6
dtype: int32
ser_obj.replace([6, 8], -100)
0      1
1      2
2      3
3   -100
4   -100
5      9
6      5
7      2
8   -100
9   -100
dtype: int32
ser_obj.replace([4, 7], [-100, -200])
0    1
1    2
2    3
3    8
4    8
5    9
6    5
7    2
8    6
9    6
dtype: int32
原文地址:https://www.cnblogs.com/loaderman/p/11967380.html