pandas连接MySQL和impala
SQL连接
read_sql(sql,
con,
index_col=None,
coerce_float=True,
params=None,
parse_dates=None,
columns=None,
chunksize=None)
参数的意义:
- sql: 为可执行的sql语句
- con: 数据库的连接
- index_col: 选择某一列作为index
- coerce_float: 将数字形式的字符串直接以float型读入
- params: 返回传递参数的查询字符串
- parse_dates: 将某一列日期型字符串转换为datetime型数据,与pd.to_datetime函数功能类似。可以直接提供需要转换的列名以默认的日期形式转换,也可以用字典的格式提供列名和转换的日期格式,比如{column_name: format string}(format string:"%Y:%m:%H:%M:%S")
- colunms: 要选取的列。一般没啥用,因为在sql命令里面一般就指定要选择的列了
- chunksize: 如果提供了一个整数值,那么就会返回一个generator,每次输出的行数就是提供的值的大小。
import pandas as pd
# 方法一:用DBAPI构建数据库链接engine
import pymysql
conn = pymysql.connect(host='localhost',
user='root',
password='',
database='database_name')
df = pd.read_sql("select * from table_name",con=conn)
# 方法二:用sqlalchemy构建数据库链接engine
import sqlalchemy
from sqlalchemy import create_engine
#connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT, DATABASE)
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}'.format('root',
'',
'localhost',
3306,
'nowcoderdb')
engine=create_engine(connect_info)
df2=pd.read_sql("select * from table_name",con=engine)
impala连接
# Impala 配置
import pandas as pd
from impala.dbapi import connect
import decimal
def impala_data_get(isql, db_name, table_name):
conn = connect(host=HOST,
port=PORT,
user=USER,
password=PASSWOED,
auth_mechanism=AUTH_MECHANISM
)
cur = conn.cursor()
# cur.execute("use snap")
# cur.execute("show tables")
# INVALIDATE METADATA 表名
frech_isql = "refresh `" + db_name + "`.`" + table_name + "`"
cur.execute(frech_isql)
cur.execute(isql)
# 获取字段名
des = cur.description
cols = [item[0] for item in des]
# 获取数据
data_list=cur.fetchall()
# DataFrame
data = pd.DataFrame(data_list, columns=cols)
return data, cols
pandas 基础设置
pd.set_option('display.max_rows',xxx) # 最大行数
pd.set_option('display.min_rows',xxx) # 最小显示行数
pd.set_option('display.max_columns',xxx) # 最大显示列数
pd.set_option ('display.max_colwidth',xxx) #最大列字符数
pd.set_option( 'display.precision',2) # 浮点型精度
pd.set_option('display.float_format','{:,}'.format) #逗号分隔数字
pd.set_option('display.float_format', '{:,.2f}'.format) #设置浮点精度
pd.set_option('display.float_format', '{:.2f}%'.format) #百分号格式化
pd.set_option('plotting.backend', 'altair') # 更改后端绘图方式
pd.set_option('display.max_info_columns', 200) # info输出最大列数
pd.set_option('display.max_info_rows', 5) # info计数null时的阈值
pd.describe_option() #展示所有设置和描述
pd.reset_option('all') #重置所有设置选项
csv 以日期存储
import datetime
import os
# date_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S').split(' ')[0]
save_time = datetime.datetime.now().strftime('%Y_%m_%d').split(' ')[0]
# print(csv_name)
file_path = './data/' + str(save_time) +'/'
file_path
outpath = ''
if not os.path.exists(file_path):
os.mkdir(file_path)
绘图
# 添加转入转出指标
def spec_advertiser_id_cbti_time_series(id_from_df, all_dfs, min_data=0, max_data=2000000):
top25_advertiser = list(id_from_df[:25].id)
food_drinks_datas = all_dfs
i = 0
fig = plt.figure(figsize=(40,30))
# fig = plt.figure(figsize=(16,14))
print(top25_advertiser)
required_datas = pd.DataFrame()
for advertiser_id in top25_advertiser:
# food_drinks_datas
spec_advertiser_datas = food_drinks_datas[food_drinks_datas.id==str(advertiser_id)].sort_values(
by='etl_date')
# plt.subplot(5, 5, i+1)
# print(spec_advertiser_datas['cost'].max(), spec_advertiser_datas['balance'].max())
# display(spec_advertiser_datas)
ax = fig.add_subplot(5, 5, i+1)
lns1 = ax.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['cost'], color='green', label='cost')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
ax2 = ax.twinx()
lns2 = ax2.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['balance'], color='red', label='balance')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
# plt.xlabel("time")
# plt.ylabel("cost")
# print(11, help(ax.set_xticks))
# ax3 = ax.twinx()
lns3 = ax2.plot(spec_advertiser_datas['etl_date'],
spec_advertiser_datas['transfer_in'],
# marker='+',
color='orangered',
label='transfer_in',
alpha=0.3)
# plt.xticks(rotation=45)
# plt.yticks(rotation=45)
# ax4 = ax.twinx()
lns4 = ax2.plot(spec_advertiser_datas['etl_date'],
spec_advertiser_datas['transfer_out'],
# marker='v',
color='blue',
label='transfer_out',
alpha=0.3)
# plt.xticks(rotation=45)
# plt.yticks(rotation=45)
# ax5 = ax.twinx()
# lns5 = ax5.plot(spec_advertiser_datas['etl_date'],
# spec_advertiser_datas['income'],
# marker='o',
# color='y',
# label='income',
# alpha=0.7)
ax.set_xlabel("etl_date",
# rotation=45
)
ax.set_ylabel(r"cost",
# rotation=45
)
# ax2.set_ylabel(r"balance",
# # rotation=45
# )
# ax2.set_ylabel(r"balance",
# # rotation=45
# )
# ax2.set_ylabel(r"balance",
# # rotation=45
# )
# ax2.set_ylabel(r"balance",
# # rotation=45
# )
# ax.set_xticks(rotation=45)
# ax.set_yticks(rotation=45)
# ax2.set_yticks(rotation=45)
lns = lns1+lns2+lns3+lns4
labs = [l.get_label() for l in lns]
ax.legend(lns, labs, loc=2)
# ax.set_ylim(0,1000000)
# ax2.set_ylim(0, 5300000)
ax.set_ylim(min_data, max_data)
ax2.set_ylim(min_data, max_data)
# ax3.set_ylim(0, food_drinks_datas.balance.max())
# ax4.set_ylim(0, food_drinks_datas.balance.max())
# ax5.set_ylim(0, food_drinks_datas.balance.max())
# ax.legend(loc=0)
# ax2.legend(loc=0)
ax.set_title('id' + str(i) + ": " + str(advertiser_id))
# plt.xticks(rotation=45)
# plt.yticks(rotation=45)
# plt.title(str(i) + ":" + str(advertiser_id), loc='right')
# plt.legend(['cost', 'balance'])
ax_ = plt.gca()
x_major_locator=MultipleLocator(15)
ax_.xaxis.set_major_locator(x_major_locator)
i += 1
required_datas = required_datas.append(spec_advertiser_datas)
plt.subplots_adjust(wspace=0.4, hspace=0.9)
plt.show()
return required_datas