#处理7Z分笔数据 py7zr

#!/usr/bin/env python
import os
import numpy as np
import py7zr
import shutil
import pandas as pd
import time
#处理7Z分笔数据

# Folder that holds one sub-folder per month of tick-by-tick trade archives.
path = r'G:\datas of status\tick-by-tick trade'
# Scratch/output folder. The trailing separator is required because other
# code builds paths with plain string concatenation (pathsave + name).
pathsave = 'G:\\datas of status\\python codes\\'

listM = np.array(os.listdir(path))  # names of the month folders
print(listM)
# Turn the bare folder names into full paths of the month folders.
listM = np.array([os.path.join(path, name) for name in listM])

def read_files(filename):
    """Aggregate one tab-separated tick file into volume/amount totals.

    Each non-blank line is expected to hold three tab-separated fields:
    time, price and vol, where price and vol are integers. The amount of a
    tick is price * vol, so it carries the sign of vol.

    Amounts are bucketed by magnitude into xiao (< 40000), zhong
    (40000 <= x < 200000), da (200000 <= x < 1000000) and te_da
    (>= 1000000). The ``z_`` buckets sum positive amounts and the ``f_``
    buckets sum negative amounts; ticks with a zero amount fall into no
    bucket.

    Args:
        filename: Path of the tick file to read.

    Returns:
        A list of ten integers:
        [vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da,
         f_xiao, f_zhong, f_da, f_te_da]
        where vol_t and amount_t are sums of absolute values. A file with
        no non-blank lines yields ten zeros.
    """
    small_limit, medium_limit, large_limit = 40000, 200000, 1000000

    with open(filename, "r") as f:
        # Skip blank lines so a trailing newline does not break the int cast.
        rows = [line.strip().split("\t") for line in f if line.strip()]

    if not rows:
        return [0] * 10

    ticks = pd.DataFrame(rows)
    # Convert once, explicitly to 64 bits: the platform default integer can
    # be 32 bits wide, which price * vol and the sums may overflow.
    price = ticks[1].astype("int64")
    vol = ticks[2].astype("int64")
    amount = price * vol

    vol_t = vol.abs().sum()
    amount_t = amount.abs().sum()

    magnitude = amount.abs()
    size_masks = [
        magnitude < small_limit,
        (magnitude >= small_limit) & (magnitude < medium_limit),
        (magnitude >= medium_limit) & (magnitude < large_limit),
        magnitude >= large_limit,
    ]
    positive = amount > 0
    negative = amount < 0

    z_xiao, z_zhong, z_da, z_te_da = (
        amount[positive & mask].sum() for mask in size_masks
    )
    f_xiao, f_zhong, f_da, f_te_da = (
        amount[negative & mask].sum() for mask in size_masks
    )

    return [vol_t, amount_t, z_xiao, z_zhong, z_da, z_te_da,
            f_xiao, f_zhong, f_da, f_te_da]





#tempname=r'G:\datas of status\python codes\20200428\SH600000.txt'
#read_files(tempname)

def read_dirs(savedir):
    """Aggregate every non-empty file in a folder into one DataFrame.

    Each entry of ``savedir`` with a non-zero size is passed to
    ``read_files``; zero-size files are reported on stdout and skipped.

    Args:
        savedir: Folder containing the tick files of one day.

    Returns:
        A DataFrame with one row per processed file and the columns
        name, vol, amount, z_xiao, z_zhong, z_da, z_te_da,
        f_xiao, f_zhong, f_da, f_te_da. ``name`` is the file name without
        its extension. The frame has these columns but no rows when nothing
        was processed.
    """
    columns = ["name", "vol", "amount", "z_xiao", "z_zhong", "z_da", "z_te_da",
               "f_xiao", "f_zhong", "f_da", "f_te_da"]
    listdir_return = []

    for entry in os.listdir(savedir):
        file = os.path.join(savedir, entry)
        filename = os.path.splitext(entry)[0]

        if not os.path.getsize(file):  # nothing to parse in an empty file
            print("file siz = 0")
            print(file)
            continue

        list_t = read_files(file)
        list_t.insert(0, filename)
        listdir_return.append(list_t)

    # Passing the columns to the constructor keeps the schema intact even
    # when no file produced a row.
    return pd.DataFrame(listdir_return, columns=columns)

def extract_files(filename):
    """Extract one 7z archive, aggregate its files and remove the extracted folder.

    The first path component of the first archive member is taken as the
    name of the archive's top-level folder; that name is also used as the
    value of the ``date`` column.

    Args:
        filename: Path of the 7z archive.

    Returns:
        The DataFrame produced by ``read_dirs`` for the extracted folder,
        with a ``date`` column inserted at position 1.

    Raises:
        ValueError: If the archive contains no members.
    """
    with py7zr.SevenZipFile(filename, 'r') as archive:
        allfiles = archive.getnames()  # names of the archive members
        if not allfiles:
            raise ValueError("archive contains no files: " + str(filename))

        tempdir = allfiles[0].split("/")[0]  # top-level folder inside the archive
        # pathsave is expected to end with a path separator.
        savedir = pathsave + str(tempdir)

        if os.path.exists(savedir):
            shutil.rmtree(savedir)  # drop leftovers from a previous run
        os.mkdir(savedir)

        try:
            archive.extractall(pathsave)
            pdM2 = read_dirs(savedir)
        finally:
            # Always remove the extracted files, even when parsing fails.
            shutil.rmtree(savedir)

        pdM2.insert(1, "date", tempdir, allow_duplicates=False)
        return pdM2





def do_work(listD):
    """Process the daily archives of one month and write the CSV summaries.

    For every archive a per-day CSV named ``<date>.csv`` is written into the
    ``everyday_data`` folder under ``pathsave``. All days are then combined
    into ``<first six characters of the first date>.csv`` directly under
    ``pathsave``.

    Args:
        listD: Iterable of paths to the daily 7z archives.

    Returns:
        None. Nothing is written when ``listD`` yields no data.
    """
    columns = ["name", "date", "vol", "amount", "z_xiao", "z_zhong", "z_da",
               "z_te_da", "f_xiao", "f_zhong", "f_da", "f_te_da"]
    daily_frames = []

    for filename in listD:
        print("=========")
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        pdD_t = extract_files(filename)

        if pdD_t.empty:
            # No rows means no date value to name the daily file after.
            print(filename)
            continue

        daily_dir = os.path.join(pathsave, "everyday_data")
        os.makedirs(daily_dir, exist_ok=True)
        save_dfile = os.path.join(daily_dir, str(pdD_t["date"].iloc[0]) + ".csv")
        pdD_t.to_csv(save_dfile, sep=",", index=False, header=True)
        daily_frames.append(pdD_t)

        print(filename)

    if not daily_frames:
        print(pd.DataFrame(columns=columns))
        return

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    pdM_all = pd.concat(daily_frames, ignore_index=True)
    print(pdM_all)

    month = str(pdM_all["date"].iloc[0])[0:6]
    save_file = os.path.join(pathsave, month + ".csv")
    print(save_file)

    pdM_all.to_csv(save_file, sep=",", index=False, header=True)




def start_work(m=0, do_num=1):
    """Process one or more month folders from the module-level ``listM``.

    Iteration ``n`` handles ``listM[m - n]``, so folders are visited going
    backwards from index ``m``; a negative index counts from the end of
    ``listM``.

    Args:
        m: Index in ``listM`` of the first month folder to process.
        do_num: Number of month folders to process.
    """
    for n in range(do_num):
        i = m - n  # index of the month folder handled in this pass
        month_dir = listM[i]
        print(month_dir)

        day_names = np.array(os.listdir(month_dir))  # daily archive names
        print(day_names)

        # Full path of every daily archive in the month folder.
        listD = np.array([os.path.join(month_dir, name) for name in day_names])
        print(listD)

        do_work(listD)
        print(i)
# Run the batch job as soon as this file is executed.
start_work()
#以下为单位处理一天的数据
def do_one_day(tempdir="20190325"):
    """Aggregate one already-extracted day folder and write its CSV.

    Args:
        tempdir: Name of the extracted day folder under ``pathsave``; it is
            also used as the ``date`` column value and the CSV file name.
    """
    # pathsave is expected to end with a path separator.
    savedir = pathsave + tempdir

    pdM2 = read_dirs(savedir)
    pdM2.insert(1, "date", tempdir, allow_duplicates=False)

    daily_dir = os.path.join(pathsave, "everyday_data")
    os.makedirs(daily_dir, exist_ok=True)
    save_dfile = os.path.join(daily_dir, tempdir + ".csv")
    pdM2.to_csv(save_dfile, sep=",", index=False, header=True)





#do_one_day()
# Original article: https://www.cnblogs.com/rongye/p/12862152.html