DataFrame合并数据集 pandas.merge

from pandas import DataFrame
import pandas as pd

df1 = DataFrame({'key':['b','b','a','c','a','a','b'],
                 'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],
                 'data2':range(3)})
print(df1)
'''
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
'''
print(df2)
'''
   data2 key
0      0   a
1      1   b
2      2   d
'''

# merge 根据一个或多个键将不同的dataframe中的行连接起来,类似数据库的连接
# 没有指定列名时,默认用重叠的列名做键
pd.merge(df1,df2)

# inner:交集,merge默认做的inner连接
print(pd.merge(df1,df2,how='inner',on='key'))
'''
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
'''
# outer:并集
print(pd.merge(df1,df2,how='outer',on='key'))
'''
   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0
'''

# left:左连接
print(pd.merge(df1,df2,how='left',on='key'))
'''
   data1 key  data2
0      0   b    1.0
1      1   b    1.0
2      2   a    0.0
3      3   c    NaN
4      4   a    0.0
5      5   a    0.0
6      6   b    1.0
'''

# right:右连接
print(pd.merge(df1,df2,how='right',on='key'))
'''
   data1 key  data2
0    0.0   b      1
1    1.0   b      1
2    6.0   b      1
3    2.0   a      0
4    4.0   a      0
5    5.0   a      0
6    NaN   d      2
'''

# 如果两个对象列名不同也可以分别进行指定
df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],
                 'data1':range(7)})
df4 = DataFrame({'key2':['a','b','d'],
                 'data2':range(3)})
df_merge2 = pd.merge(df3,df4,left_on='key1',right_on='key2')
print(df_merge2)
'''
   data1 key1  data2 key2
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
'''

# 多对多的合并
df5 = DataFrame({'key':['b','a','a','c'],
                 'data1':range(4)})
df6 = DataFrame({'key':['a','a','a','b','d'],
                 'data2':range(5)})
print(df5)
'''
   data1 key
0      0   b
1      1   a
2      2   a
3      3   c
'''
print(df6)
'''
   data2 key
0      0   a
1      1   a
2      2   a
3      3   b
4      4   d
'''
print(pd.merge(df5,df6,on='key',how='inner')) # 多对多连接产生的行是笛卡尔积,左边2个a,右边3个a,最终6个a。
'''
   data1 key  data2
0      0   b      3
1      1   a      0
2      1   a      1
3      1   a      2
4      2   a      0
5      2   a      1
6      2   a      2
'''

# 要根据多个键进行合并,传入一个由列名组成的列表
left = DataFrame({'key1':['foo','foo','bar'],
                  'key2':['one','two','one'],
                  'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],
                  'key2':['one','one','one','two'],
                  'rval':[4,5,6,7]})

print(left)
'''
  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
'''
print(right)
'''
  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
'''
print(pd.merge(left,right,on=['key1','key2'],how='outer'))
'''
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
'''
# 对于合并运算后需要处理重复列名,suffixes用于指定附加到左右两个dataframe对象的重叠列名上的字符串
print(pd.merge(left,right,on='key1'))
'''
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
'''
print(pd.merge(left,right,on='key1',suffixes=['_left','_right']))
'''
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7
'''
原文地址:https://www.cnblogs.com/nicole-zhang/p/14446789.html