# DataFrame 去重:drop_duplicates

from pandas import DataFrame,Series
import pandas as pd
import numpy as np

# Removing duplicate rows from a DataFrame.
# Sample frame: seven rows, several of which repeat an earlier (k1, k2) pair.
_sample_columns = {
    "k1": ["one", "one", "one", "two", "two", "two", "two"],
    "k2": [1, 1, 2, 3, 3, 4, 4],
}
data = DataFrame(_sample_columns)
print(data)
'''
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
'''
# duplicated() yields a boolean Series: True marks a row that repeats an
# earlier row.
repeat_flags = data.duplicated()
print(repeat_flags)
'''
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
'''
# drop_duplicates() returns the frame with the repeated rows removed.
unique_rows = data.drop_duplicates()
print(unique_rows)
'''
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
'''
# All columns are compared by default; pass a subset of columns to judge
# duplicates on those columns only.
# The first occurrence of each value combination is kept by default;
# keep='last' keeps the last occurrence instead.
first_per_k1 = data.drop_duplicates(subset=["k1"])
print(first_per_k1)
'''
    k1  k2
0  one   1
3  two   3
'''
last_per_k1 = data.drop_duplicates(subset=["k1"], keep="last")
print(last_per_k1)
'''
    k1  k2
2  one   2
6  two   4
'''
# 原文地址: https://www.cnblogs.com/nicole-zhang/p/14434303.html