Python/Jupyter小技巧

# Use a white figure background; set it before the figure is created so the
# new figure picks it up.
plt.rcParams['figure.facecolor'] = 'white'
# Tall, narrow canvas (7 x 10) to leave room for up to 40 bars.
plt.figure(figsize=(7,10))
# Bar chart of the first 40 rows of feat_imp, importance on x, feature on y.
# NOTE(review): presumably feat_imp is already sorted by importance -- confirm
# where it is built.
sns.barplot(x='importance', y='feature', data=feat_imp[:40])
plt.title('LGB Features Importance')

pandas.to_csv 中文乱码问题

# "utf_8_sig" writes UTF-8 with a BOM, which lets Excel recognise the encoding
# and display Chinese text correctly.
df.to_csv("predict_result.csv",encoding="utf_8_sig")

UTF-8以字节为编码单元，它的字节顺序在所有系统中都是一样的，没有字节序的问题，也因此它实际上并不需要BOM(“ByteOrder Mark”)。但是UTF-8 with BOM即utf-8-sig需要提供BOM。

在程序中能够正常输出中文，但是导出到文件后使用excel打开时出现中文乱码，是因为excel只能正确识别用gb2312、gbk、gb18030或utf_8 with BOM编码的中文；如果是utf_8 no BOM编码的中文文件，excel打开会乱码。

打印所在机器的内存大小

import psutil

def print_memory_size():
    """Print the total system memory reported by psutil, rounded to whole GB."""
    bytes_per_gb = 1024 ** 3
    total_gb = round(psutil.virtual_memory().total / bytes_per_gb)
    print('系统内存为 {} GB'.format(str(total_gb)))
    
print_memory_size()

让图片说中文

import matplotlib.pyplot as plt
# Use the SimHei font for sans-serif text so Chinese labels can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False # keeps the minus sign displaying correctly

图片边缘背景色改为白色

# Make the figure background (the border area around the axes) white.
plt.rcParams['figure.facecolor'] = 'white'

figsize

# Default (width, height) applied to every newly created figure.
plt.rcParams['figure.figsize'] = (14.0, 14.0)

Pandas.DataFrame内存占用

def print_mem_usage(df, deep=False):
    """Print the total memory footprint of a DataFrame in MB.

    Args:
        df: Object exposing ``memory_usage()`` whose result has ``sum()``,
            e.g. a pandas DataFrame.
        deep: Forwarded to ``memory_usage``. With ``True`` pandas inspects
            object-dtype columns (such as strings) for their real size.
            Defaults to ``False``, which matches the previous behaviour.
    """
    mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    # '{:.3}' means "3 significant digits" and switches to scientific notation
    # for large values (1500 MB -> '1.5e+03'); '{:.3f}' always prints
    # fixed-point with three decimals.
    print('The DataFrame takes up {:.3f} MB'.format(mem))

# Raise the column-width limit so that very long cell contents are not
# truncated when a DataFrame is displayed; adjust the value as needed.
pd.set_option('display.max_colwidth',300)

解析JSON

import json
 
# Sample record mixing str, int, bool and a nested dict.
data = dict(
    name='pengjunlee',
    age=32,
    vip=True,
    address=dict(province='GuangDong', city='ShenZhen'),
)

# Serialize the Python dict into a JSON-formatted string.
json_str = json.dumps(data)
print(json_str)
# {"name": "pengjunlee", "age": 32, "vip": true, "address": {"province": "GuangDong", "city": "ShenZhen"}}

# Parse the JSON string back into a Python dict.
user_dic = json.loads(json_str)
print(user_dic['address'])
# {'province': 'GuangDong', 'city': 'ShenZhen'}

ROC曲线(二分类)

来源

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc  # 计算roc和auc

def auc_curve(y, prob):
    """Plot the ROC curve for labels ``y`` and scores ``prob`` and show it.

    The area under the curve is computed with ``auc`` and written into the
    legend with three decimals. Nothing is returned.
    """
    # roc_curve also returns the thresholds, which are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, prob)
    area = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure(figsize=(6, 4.5))
    # ROC curve: false positive rate on x, true positive rate on y.
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=line_width,
        label='ROC curve (area = %0.3f)' % area,
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()

通过命令行运行Jupyter Notebook

如何自动执行Jupyter的ipynb文件?

jupyter nbconvert --execute my_notebook.ipynb

# 设置每个cell单元格的超时时间，在运行耗时时间长的任务时能够避免TimeoutError问题
jupyter nbconvert --ExecutePreprocessor.timeout=600 --execute my_notebook.ipynb

# 执行笔记本，捕获新输出，并将结果保存为 my_notebook.nbconvert.ipynb
jupyter nbconvert --to notebook --execute my_notebook.ipynb

#指定--inplace将覆盖输入文件而不是写入新文件
jupyter nbconvert --to notebook --inplace --execute my_notebook.ipynb

# 将jupyter notebook转为Python脚本
jupyter nbconvert --to python my_notebook.ipynb

以上两种方法都能够运行Notebook，区别是第一种方法会生成一个对应的html文件。而第二种方法转为脚本后需要使用Python运行。但如果原本的notebook中有任何特定的魔术命令或是shell命令，将无法执行。

Jupyter Notebook魔术命令

# Line magics receive the rest of the line verbatim, so a trailing "# ..."
# comment is passed to the magic as arguments. Comments therefore go on
# their own lines.

# Render figures inline in the notebook output.
%matplotlib inline
# Set the output format of inline figures.
%config InlineBackend.figure_format = 'svg'
# List the variables defined in the interactive namespace.
%who
# Same as %who, but returned as a list.
%who_ls
# Like %who, with more detail about each variable.
%whos
# Start the interactive debugger.
%debug

Jupyter Notebook修改背景主题

jt -t oceans16 -f oxygen -fs 12 -T -N -cellw 80% -nfs 13 -tfs 13 -ofs 12

jt -t monokai -f oxygen -fs 12 -T -N -cellw 80% -nfs 13 -tfs 13 -ofs 12

# jt -t monokai -f fira -fs 17 -cellw 90% -ofs 15 -dfs 15 -T -T

Python 时间函数

import datetime
from datetime import timedelta
  
# Reference point for every range below. Values derived with timedelta keep
# the time of day of `now`; values built with datetime.datetime(y, m, d) are
# at midnight.
now = datetime.datetime.now()
# For a date without a time component, datetime.date.today() can be used.

# Today
today = now

# Yesterday
yesterday = now - timedelta(days=1)

# Tomorrow
tomorrow = now + timedelta(days=1)

# Current quarter as an int in 1..4. Floor division is required: "/" is true
# division in Python 3 and would yield a float such as 2.33 for April.
now_quarter = (now.month - 1) // 3 + 1

# First and last day of this week (weekday() is 0 on Monday, 6 on Sunday)
this_week_start = now - timedelta(days=now.weekday())
this_week_end = now + timedelta(days=6-now.weekday())

# First and last day of last week
last_week_start = now - timedelta(days=now.weekday()+7)
last_week_end = now - timedelta(days=now.weekday()+1)

# First and last day of this month.
this_month_start = datetime.datetime(now.year, now.month, 1)
# 32 days after the 1st always falls in the following month, including the
# December -> January rollover where datetime(year, 13, 1) would raise
# ValueError.
next_month_start = (this_month_start + timedelta(days=32)).replace(day=1)
this_month_end = next_month_start - timedelta(days=1)

# First and last day of last month
last_month_end = this_month_start - timedelta(days=1)
last_month_start = datetime.datetime(last_month_end.year, last_month_end.month, 1)

# First and last day of this quarter; `month` is the quarter's first month
# (1, 4, 7 or 10).
month = (now.month - 1) - (now.month - 1) % 3 + 1
this_quarter_start = datetime.datetime(now.year, month, 1)
# A quarter has at most 92 days, so 93 days after its start falls in the first
# month of the next quarter, also across the year boundary (Q4), where
# datetime(year, 13, 1) would raise ValueError.
next_quarter_start = (this_quarter_start + timedelta(days=93)).replace(day=1)
this_quarter_end = next_quarter_start - timedelta(days=1)

# First and last day of last quarter
last_quarter_end = this_quarter_start - timedelta(days=1)
last_quarter_start = datetime.datetime(last_quarter_end.year, last_quarter_end.month - 2, 1)

# First and last day of this year
this_year_start = datetime.datetime(now.year, 1, 1)
this_year_end = datetime.datetime(now.year + 1, 1, 1) - timedelta(days=1)

# First and last day of last year
last_year_end = this_year_start - timedelta(days=1)
last_year_start = datetime.datetime(last_year_end.year, 1, 1)

扫描文件夹中所有的文件，并修改文件名

import os
# For every file below the current directory, collect the path it would have
# after prefixing its file name with 'doris---'. Nothing is renamed on disk
# here; the list only holds the new names.
dirlist = []

for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        # os.path.join inserts the path separator. Plain string concatenation
        # only works for the top-level './' and turns './sub' + name into
        # './subdoris---name' for files in subdirectories.
        dirlist.append(os.path.join(dirname, 'doris---' + filename))
dirlist

个人公众号：ApocalypseNow。分享互联网数据分析行业经验。