# pandas高级多表操作 (advanced multi-table operations with pandas)

import pandas as pd
import numpy as np
# Load the two source tables (GBK-encoded CSV files).
# NOTE(review): the paths contain no directory separators after "D:" —
# presumably they were lost when this script was copied; confirm the real
# location before running.
position = pd.read_csv("D:mycode用pandasdataposition.csv", encoding="gbk")
company = pd.read_csv("D:mycode用pandasdatacompany_sql.csv", encoding="gbk")

# Rename the first column of `company` to "id" by rebuilding the full list of
# column labels and assigning it back.
col = ["id", *company.columns[1:]]
company.columns = col
print("1,------", company)

# Inner join: position.companyId matched against company.id.
# NOTE(review): switch to how="right" if a right join was intended.
inner_joined = pd.merge(
    position,
    company,
    how="inner",
    left_on="companyId",
    right_on="id",
)
print("2,-----", inner_joined)

# Index-based join: rows are matched on the row index.
index_joined = company.join(position)
print("3,----", index_joined)

# Side-by-side stacking: columns of both frames placed next to each other,
# aligned on the row index.
side_by_side = pd.concat([company, position], axis=1)
print("4,--------", side_by_side)
# Build df1: two string columns, "A" and "B", on the default row index.
df1 = pd.DataFrame({"A": ["a", "b", "c", "d"], "B": ["e", "f", "g", "h"]})
print("5,--------", df1)
# Build df2: same values as df1 but under the column names "C" and "D".
df2 = pd.DataFrame({"C": ["a", "b", "c", "d"], "D": ["e", "f", "g", "h"]})
print("6,--------", df2)

# Concatenation demos.
frames = [df1, df2]
# Row-wise (default axis): the frames share no column names, so the result
# carries all four columns with missing values where a frame lacks a column.
# sort=False keeps the columns in the order they were encountered.
rows_unsorted = pd.concat(frames, sort=False)
print("7,----", rows_unsorted)
# Same row-wise concat, but with the column labels sorted.
rows_sorted = pd.concat(frames, sort=True)
print("8,----", rows_sorted)
# Column-wise: the frames are placed side by side, aligned on the row index.
columns_joined = pd.concat(frames, axis=1)
print("9,----", columns_joined)
# Looking up values in a MultiIndex result.
# Group by (city, education) and average the numeric columns once, then reuse
# the result. numeric_only=True is required on pandas >= 2.0, where mean() no
# longer silently drops non-numeric columns and raises TypeError instead.
grouped_mean = position.groupby(by=["city", "education"]).mean(numeric_only=True)
print(grouped_mean)

# The "avg" column as a Series indexed by (city, education).
avg_by_group = grouped_mean["avg"]
print(avg_by_group)
# Select the outer index level: every education level for one city.
print(avg_by_group["上海"])
# Then the inner level: a single (city, education) value.
print(avg_by_group["上海"]["博士"])

# The same lookups on the full frame via .loc.
print(grouped_mean.loc["上海"])
print(grouped_mean.loc["上海", "博士"])
print("10,-------")
# Build the same two-level (city, education) index with set_index: sort by
# both keys first, then promote them to the row index.
sorted_positions = position.sort_values(by=["city", "education"])
print(sorted_positions.set_index(["city", "education"]))
print("11,------")

# The label column used by the string demos below.
labels = position["positionLables"]
print(labels)
print("12,------")

# Count occurrences of a substring in each value.
label_hits = labels.str.count("分析师")
print(label_hits)
print("13,--------")

# Position of the first occurrence of a substring in each value
# (-1 where it does not occur).
first_offsets = labels.str.find("数据")
print(first_offsets)
print("14,-------")

# String slicing: drop the first and last character of each value
# (presumably enclosing brackets — confirm against the data).
trimmed_labels = labels.str.slice(1, -1)
print(trimmed_labels)
print("15,--------")

# Whole-value replacement across the frame: cells equal to 80307 become "".
blanked = position.replace(80307, "")
print(blanked)
print("16,-------")

# Substring replacement inside each value: strip single quotes from the
# trimmed labels.
unquoted_labels = trimmed_labels.str.replace("'", "")
print(unquoted_labels)
print("17,------------")
# Introduce missing values: blank out the city for every row whose city is
# "深圳". np.nan is used because the np.NaN alias was removed in NumPy 2.0.
position.loc[position.city == "深圳", "city"] = np.nan
print(position)
print("18,----------")

# Fill every missing value in the frame with 1. fillna returns a new frame;
# `position` itself is left unchanged.
print(position.fillna(1))
print("19,--------")

# Fill only the "city" column. The filled frame is printed without being
# assigned back, so `position` keeps its missing cities for the dropna demo
# below.
print(position.fillna({"city": "abc"}))
print("20,----------")

# Drop every row that contains at least one missing value.
print(position.dropna())
print("21,--------------")
# String concatenation: keep only rows that have an "avg" value, then append
# a "k" suffix to it.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
position = position[position["avg"].notna()].copy()
# Keep a handle on the un-suffixed values for the lambda variant below.
avg_values = position["avg"]
position["avg"] = avg_values.astype("str") + "k"
print(position)
print(position.avg)
print("22,----------")

# Same concatenation written with a lambda. It is applied to the un-suffixed
# values (applying it to the already-suffixed column would append a second
# "k") and shown through assign(), which returns a new frame and leaves
# `position` as it is.
print(position.assign(avg=avg_values.apply(lambda x: str(x) + "k")))
print("23,----------")
# Salary ranking helper: top-n rows by "avg".
def func(x: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the first ``n`` rows of ``x`` ordered by ``avg``, highest first.

    Args:
        x: DataFrame that contains an ``avg`` column.
        n: Number of rows to keep.

    Returns:
        A new DataFrame with at most ``n`` rows; ``x`` is not modified.
    """
    ranked = x.sort_values("avg", ascending=False)
    return ranked.iloc[:n]
# Top 5 rows by "avg" within each city. groupby(...).apply returns a new
# frame, so the result is captured and printed; `position` is not modified.
# NOTE(review): "avg" is converted to strings with a "k" suffix earlier in
# this script, so the ordering here is lexicographic rather than numeric —
# confirm whether the ranking should run on the numeric values instead.
top5_by_city = position.groupby("city").apply(func, n=5)
print(top5_by_city)
print("24,-------------")

# 原文地址 (original post): https://www.cnblogs.com/zhang-da/p/14243583.html