pandas连接MySQL和impala

pandas连接MySQL和impala

SQL连接

read_sql(sql, 
		 con, 
		 index_col=None, 
		 coerce_float=True, 
		 params=None, 
		 parse_dates=None, 
		 columns=None, 
		 chunksize=None)

参数的意义:

  • sql: 为可执行的sql语句
  • con: 数据库的连接
  • index_col: 选择某一列作为index
  • coerce_float: 将数字形式的字符串直接以float型读入
  • params: 返回传递参数的查询字符串
  • parse_dates: 将某一列日期型字符串转换为datetime型数据,与pd.to_datetime函数功能类似。可以直接提供需要转换的列名以默认的日期形式转换,也可以用字典的格式提供列名和转换的日期格式,比如{column_name: format string}(format string:"%Y:%m:%H:%M:%S")
  • colunms: 要选取的列。一般没啥用,因为在sql命令里面一般就指定要选择的列了
  • chunksize: 如果提供了一个整数值,那么就会返回一个generator,每次输出的行数就是提供的值的大小。
import pandas as pd

# 方法一:用DBAPI构建数据库链接engine
import pymysql
conn = pymysql.connect(host='localhost',
                       user='root',
                       password='',
                       database='database_name')
df = pd.read_sql("select * from table_name",con=conn)


# 方法二:用sqlalchemy构建数据库链接engine
import sqlalchemy
from sqlalchemy import create_engine
#connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(DB_USER, DB_PASS, DB_HOST, DB_PORT, DATABASE)
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}'.format('root',
													   '', 
													   'localhost', 
													   3306, 
													   'nowcoderdb')
engine=create_engine(connect_info)
df2=pd.read_sql("select * from table_name",con=engine)

impala连接

# Impala 配置
import pandas as pd
from impala.dbapi import connect 
import decimal



def impala_data_get(isql, db_name, table_name):
    conn = connect(host=HOST, 
                    port=PORT,
                    user=USER,
                    password=PASSWOED,
                    auth_mechanism=AUTH_MECHANISM
                    )
    cur = conn.cursor()
    # cur.execute("use snap")
    # cur.execute("show tables")
    #     INVALIDATE METADATA 表名

    frech_isql = "refresh `" + db_name + "`.`" + table_name + "`"
    cur.execute(frech_isql)
    cur.execute(isql)
    # 获取字段名
    des = cur.description
    cols = [item[0] for item in des]
    # 获取数据
    data_list=cur.fetchall()
    # DataFrame    
    data = pd.DataFrame(data_list, columns=cols)
    return data, cols 

pandas 基础设置

详见

pd.set_option('display.max_rows',xxx) # 最大行数
pd.set_option('display.min_rows',xxx) # 最小显示行数
pd.set_option('display.max_columns',xxx) # 最大显示列数
pd.set_option ('display.max_colwidth',xxx) #最大列字符数
pd.set_option( 'display.precision',2) # 浮点型精度
pd.set_option('display.float_format','{:,}'.format) #逗号分隔数字
pd.set_option('display.float_format',  '{:,.2f}'.format) #设置浮点精度
pd.set_option('display.float_format', '{:.2f}%'.format) #百分号格式化
pd.set_option('plotting.backend', 'altair') # 更改后端绘图方式
pd.set_option('display.max_info_columns', 200) # info输出最大列数
pd.set_option('display.max_info_rows', 5) # info计数null时的阈值
pd.describe_option() #展示所有设置和描述
pd.reset_option('all') #重置所有设置选项

csv 以日期存储

import datetime 
import os 
# date_time = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S').split(' ')[0]
save_time = datetime.datetime.now().strftime('%Y_%m_%d').split(' ')[0]
# print(csv_name)
file_path = './data/' + str(save_time) +'/'
file_path 


outpath = ''
if not os.path.exists(file_path):
    os.mkdir(file_path)

绘图

# 添加转入转出指标
def spec_advertiser_id_cbti_time_series(id_from_df, all_dfs, min_data=0, max_data=2000000):
    top25_advertiser = list(id_from_df[:25].id)
    food_drinks_datas = all_dfs
    i = 0
    fig = plt.figure(figsize=(40,30))
    # fig = plt.figure(figsize=(16,14))
    print(top25_advertiser)
    required_datas = pd.DataFrame()
    for advertiser_id in top25_advertiser:
#         food_drinks_datas
        spec_advertiser_datas = food_drinks_datas[food_drinks_datas.id==str(advertiser_id)].sort_values(
            by='etl_date')
    #     plt.subplot(5, 5, i+1) 
#         print(spec_advertiser_datas['cost'].max(), spec_advertiser_datas['balance'].max())
#         display(spec_advertiser_datas)

        
        ax = fig.add_subplot(5, 5, i+1)
        lns1 = ax.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['cost'], color='green', label='cost')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)



        ax2 = ax.twinx()
        lns2 = ax2.plot(spec_advertiser_datas['etl_date'], spec_advertiser_datas['balance'], color='red', label='balance')
        plt.xticks(rotation=45)
        plt.yticks(rotation=45)
    #     plt.xlabel("time")
    #     plt.ylabel("cost")    
    #     print(11, help(ax.set_xticks))
    
#         ax3 = ax.twinx()
        lns3 = ax2.plot(spec_advertiser_datas['etl_date'], 
                           spec_advertiser_datas['transfer_in'], 
#                            marker='+',
                           color='orangered', 
                           label='transfer_in',
                       alpha=0.3)
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=45)
        
#         ax4 = ax.twinx()
        lns4 = ax2.plot(spec_advertiser_datas['etl_date'], 
                        spec_advertiser_datas['transfer_out'], 
#                         marker='v',
                        color='blue', 
                        label='transfer_out',
                       alpha=0.3)
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=45)        
#         ax5 = ax.twinx()
#         lns5 = ax5.plot(spec_advertiser_datas['etl_date'], 
#                         spec_advertiser_datas['income'], 
#                            marker='o',
#                         color='y', 
#                         label='income',
#                        alpha=0.7)
        
        ax.set_xlabel("etl_date", 
    #                   rotation=45
                     )
        ax.set_ylabel(r"cost", 
    #                   rotation=45
                     )
#         ax2.set_ylabel(r"balance",
#     #                    rotation=45
#                       )
#         ax2.set_ylabel(r"balance",
# #                         rotation=45
#               )
#         ax2.set_ylabel(r"balance",
# #                         rotation=45
#               )
#         ax2.set_ylabel(r"balance",
# #                         rotation=45
#               )
    #     ax.set_xticks(rotation=45)
    #     ax.set_yticks(rotation=45)
    #     ax2.set_yticks(rotation=45)
        lns = lns1+lns2+lns3+lns4
        labs = [l.get_label() for l in lns]
        ax.legend(lns, labs, loc=2)
        
#         ax.set_ylim(0,1000000)
#         ax2.set_ylim(0, 5300000)
        ax.set_ylim(min_data, max_data)
        ax2.set_ylim(min_data, max_data)
#         ax3.set_ylim(0, food_drinks_datas.balance.max())
#         ax4.set_ylim(0, food_drinks_datas.balance.max())
#         ax5.set_ylim(0, food_drinks_datas.balance.max())
    #     ax.legend(loc=0)
    #     ax2.legend(loc=0)

        ax.set_title('id' + str(i) + ": " + str(advertiser_id))

#         plt.xticks(rotation=45)
#         plt.yticks(rotation=45) 
    #     plt.title(str(i) + ":" + str(advertiser_id), loc='right')
    #     plt.legend(['cost', 'balance'])
        ax_ = plt.gca()
        x_major_locator=MultipleLocator(15)
        ax_.xaxis.set_major_locator(x_major_locator)

        i += 1 
        
        required_datas = required_datas.append(spec_advertiser_datas)

    plt.subplots_adjust(wspace=0.4, hspace=0.9)
    plt.show()
    return required_datas
原文地址:https://www.cnblogs.com/geoffreygao/p/14976902.html