pandas使用

import pandas as pd
import numpy as np

# 读取数据，并以表格的形式显示
df1 = pd.DataFrame(pd.read_excel('a.xlsx'))

# 判断是否为空，每个表格都会去判断，并返回True/False
a1 = df1.isnull().values
"""
[[False False False False False False False False False]
 [False False  True False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False  True False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]]

"""

# 判断是否为非空，每个表格都会去判断，并返回True/False，会以规则的表格形式返回
a2 = df1.notnull()
"""
      1     2      3     4     5      6  open_acc  weight  data
0  True  True   True  True  True   True      True    True  True
1  True  True  False  True  True   True      True    True  True
2  True  True   True  True  True   True      True    True  True
3  True  True   True  True  True   True      True    True  True
4  True  True   True  True  True  False      True    True  True
5  True  True   True  True  True   True      True    True  True
6  True  True   True  True  True   True      True    True  True
"""

# 判断列是否重复,比对每一行，并返回True/False
b = df1.duplicated()
"""
0    False
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool
"""

# 返回去掉重复行后的表
c = df1.drop_duplicates()
"""
     1    2    3    4    5    6  open_acc   weight        data
0   12   12   23   34   56   67         2     12kg  2018/02/26
1  asd  asd  NaN  aas  qwe  qwe        10   23000g   2018/02/6
2  123   qw    q    q  qwe  qwe         7  45000mg   2018/5/26
3    1    2    3    4    5    6        15     123t      2017/9
4    1    2    3    4    5  NaN        20     456t      18/2/2
6   12  123    4    3   er   32        12      34g   2015/3/01
"""

# 空值以。。。。方式填充
d1 = df1.fillna('\')
"""
     1    2   3    4    5    6  open_acc   weight        data
0   12   12  23   34   56   67         2     12kg  2018/02/26
1  asd  asd     aas  qwe  qwe        10   23000g   2018/02/6
2  123   qw   q    q  qwe  qwe         7  45000mg   2018/5/26
3    1    2   3    4    5    6        15     123t      2017/9
4    1    2   3    4    5            20     456t      18/2/2
5    1    2   3    4    5    6        15     123t      2017/9
6   12  123   4    3   er   32        12      34g   2015/3/01
"""

# 包含空值的行删除
d2 = df1.dropna()
"""
     1    2   3   4    5    6  open_acc   weight        data
0   12   12  23  34   56   67         2     12kg  2018/02/26
2  123   qw   q   q  qwe  qwe         7  45000mg   2018/5/26
3    1    2   3   4    5    6        15     123t      2017/9
5    1    2   3   4    5    6        15     123t      2017/9
6   12  123   4   3   er   32        12      34g   2015/3/01
"""

# 给表更换新的列名，注意字段列表的长度一定要和列的长度相同
column_names = ['a', 'b', 'c', 'd', 'e', 'f','g','h','i']
df = pd.read_excel('a.xlsx', names=column_names)
"""
     a    b    c    d    e    f   g        h           i
0   12   12   23   34   56   67   2     12kg  2018/02/26
1  asd  asd  NaN  aas  qwe  qwe  10   23000g   2018/02/6
2  123   qw    q    q  qwe  qwe   7  45000mg   2018/5/26
3    1    2    3    4    5    6  15     123t      2017/9
4    1    2    3    4    5  NaN  20     456t      18/2/2
5    1    2    3    4    5    6  15     123t      2017/9
6   12  123    4    3   er   32  12      34g   2015/3/01
"""

# 打印第一列
# print(df1[1])
"""
0     12
1    asd
2    123
3      1
4      1
5      1
6     12
Name: 1, dtype: object
"""

# 将列中的12，1替换成'qwe'
a = df1[1].replace([12, 1], 'qwe')
"""
0    qwe
1    qwe
2    123
3    qwe
4    qwe
5    qwe
6    qwe
Name: 1, dtype: object
"""

# 打印数据的前n行，默认为5
# print(df1.head())

# 给数据分组
"""
0-5     A  不包含0
6-10    B
11-15   C
16-20   D
"""
bins = [0, 5, 10, 15, 20]
group_names = ['A', 'B', 'C', 'D']
df1['categories'] = pd.cut(df1['open_acc'], bins, labels=group_names)
"""
     1    2    3    4    ...     open_acc   weight        data categories
0   12   12   23   34    ...            2     12kg  2018/02/26          A
1    1  asd  NaN  aas    ...           10   23000g   2018/02/6          B
2  123   qw    q    q    ...            7  45000mg   2018/5/26          B
3    1    2    3    4    ...           15     123t      2017/9          C
4    1    2    3    4    ...           20     456t      18/2/2          D
5    1    2    3    4    ...           15     123t      2017/9          C
6   12  123    4    3    ...           12      34g   2015/3/01          C
"""

# 获取到需要换算单位的行

rows_with_lbs = df1['weight'].str.contains('mg').fillna(False)
"""
     1   2  3  4    5    6  open_acc   weight       data categories
2  123  qw  q  q  qwe  qwe         7  45000mg  2018/5/26          B
"""

# 换算单位
for i, lbs_row in df1[rows_with_lbs].iterrows():
    # print(i) # 获取到值的行
    # print(lbs_row['weight'][-2:])
    weight = int(float(lbs_row['weight'][:-2]) / 1000)
    df1.at[i, 'weight'] = '{}g'.format(weight)
# print(df1)
"""
     1    2    3    4    ...     open_acc  weight        data categories
0   12   12   23   34    ...            2    12kg  2018/02/26          A
1    1  asd  NaN  aas    ...           10  23000g   2018/02/6          B
2  123   qw    q    q    ...            7     45g   2018/5/26          B
3    1    2    3    4    ...           15    123t      2017/9          C
4    1    2    3    4    ...           20    456t      18/2/2          D
5    1    2    3    4    ...           15    123t      2017/9          C
6   12  123    4    3    ...           12     34g   2015/3/01          C
"""

# rows_with_lbs = df1['weight'].str.contains('t').fillna(False)
# for i,lbs_row in df1[rows_with_lbs].iterrows():
#     # print(i) # 获取到值的行
#     # print(lbs_row['weight'][:-2])
#     weight = int(float(lbs_row['weight'][:-2]) * 10000000)
#     df1.at[i, 'weight'] = '{}g'.format(weight)

# print(df1)


# 换算单位封装
def conversion(DataFrame, column, unit, num, new_unit='', null=False):
    """

    :param DataFrame: pandas对象
    :param column: 字段
    :param unit: 当前单位
    :param new_unit: 新单位
    :param num: 当前单位换算成新单位所需要乘的数
    :param null: 是否为空
    :return:pandas对象

         1    2    3    4    ...     open_acc     weight        data categories
    0   12   12   23   34    ...            2  1.20e+04g  2018/02/26          A
    1  asd  asd  NaN  aas    ...           10        23g   2018/02/6          B
    2  123   qw    q    q    ...            7        45g   2018/5/26          B
    3    1    2    3    4    ...           15  1.23e+08g      2017/9          C
    4    1    2    3    4    ...           20  4.56e+08g      18/2/2          D
    """


    rows_with_lbs = DataFrame[column].str.contains(unit).fillna(null)
    column_len = len(unit)

    for i, lbs_row in DataFrame[rows_with_lbs].iterrows():

        # print(lbs_row['weight'][-column_len:])
        weight = int(float(lbs_row[column][:-column_len]) * num)

        # 默认4位，超出长度变为科学计数法
        if len(str(weight)) > 4:

            DataFrame.at[i, column] = ('%0.2e' + new_unit) % weight
            """
                 1    2    3    4    ...     open_acc     weight        data categories
            0   12   12   23   34    ...            2  1.20e+04g  2018/02/26          A
            1  asd  asd  NaN  aas    ...           10  2.30e+01g   2018/02/6          B
            2  123   qw    q    q    ...            7  4.50e+01g   2018/5/26          B
            3    1    2    3    4    ...           15  1.23e+08g      2017/9          C
            4    1    2    3    4    ...           20  4.56e+08g      18/2/2          D
            """
        else:
            DataFrame.at[i, column] = str(weight) + new_unit
            """
                 1    2    3    4    ...     open_acc      weight        data categories
            0   12   12   23   34    ...            2      12000g  2018/02/26          A
            1  asd  asd  NaN  aas    ...           10         23g   2018/02/6          B
            2  123   qw    q    q    ...            7         45g   2018/5/26          B
            3    1    2    3    4    ...           15  123000000g      2017/9          C
            4    1    2    3    4    ...           20  456000000g      18/2/2          D
            """

    return DataFrame


df1 = conversion(df1, 'weight', 'kg', 1000, 'g')  # kg - g

df2 = conversion(df1, 'weight', 'mg', 0.001, 'g')  # mg -g

df3 = conversion(df2, 'weight', 't', 1000000, 'g')  # t - g

print(df3)