数据处理

1.数据处理之前先进行数据清洗,把自己所需要的好的数据提取出来

import codecs
# Step 1 - data cleaning: split the raw dump into well-formed ("good") and
# malformed ("bad") records so later steps only see usable lines.
filepath=r"G:我的pythonpython基础大数据相关数据1E~001.txt"

# Output files for the split. The good-file path must match the path that the
# later extraction/classification steps read from (the original had a
# "pyhon" typo here, which sent the good records to a different location).
savegoodfilepath = r"G:我的pythonpython基础day12QQQQGood.txt"
savebadfilepath = r"G:我的pythonpython基础day12QQQQbed.txt"

# codecs.open decodes the input as GBK; undecodable bytes are dropped
# ("ignore"). The with-statement guarantees all three files are closed even
# if an error occurs part-way through.
with codecs.open(filepath,"rb","gbk","ignore") as file, \
        open(savegoodfilepath,"wb") as filegood, \
        open(savebadfilepath,"wb") as filebad:
    mylist=file.readlines()  # whole file as a list of decoded lines
    for line in mylist:
        # A good record is 16..35 characters long (as returned by
        # readlines, i.e. including any line terminator) and consists of
        # exactly two fields separated by "----".
        QQlist = line.split("----")
        if 15 < len(line) <= 35 and len(QQlist) == 2:
            filegood.write(line.encode("utf-8","ignore"))
        else:
            filebad.write(line.encode("utf-8"))

2.数据提取,数据清洗之后就要从中提取出自己所需要的部分

# Step 2 - data extraction: from every cleaned record keep only the second
# "----"-separated field and write it to its own file.
filepath=r"G:我的pythonpython基础day12QQQQGood.txt"
savefilepath = r"G:我的pythonpython基础day12QQQQGoodpass.txt"

# Both files are managed by one with-statement so they are closed even when
# decoding or indexing fails on an unexpected line.
with open(filepath,"rb") as file, open(savefilepath,"wb") as save:
    for line in file:
        linestr = line.decode("utf-8","ignore")
        mylist = linestr.split("----")
        # The line terminator stays attached to the second field, so each
        # extracted value is written on its own line.
        save.write(mylist[1].encode("utf-8","ignore"))

3.数据排序:对提取出的数据按照自己的需要进行排序

# Step 3 - sorting: load every extracted line, sort, and write the result.
filepath=r"G:我的pythonpython基础day12QQQQGoodpass.txt"
with open(filepath,"rb") as file:
    mylist=file.readlines()  # list of raw byte lines

# Lines are compared as bytes, so the order is plain byte-wise ordering.
mylist.sort()

savefilepath = r"G:我的pythonpython基础day12QQQQGoodpasssort.txt"
with open(savefilepath,"wb") as save:
    # The lines are already UTF-8 bytes; write them back unchanged instead of
    # decoding and re-encoding each one.
    save.writelines(mylist)

4.排序计数:统计排序后每条数据重复出现的次数

# Step 4 - counting duplicates: for each run of identical adjacent lines,
# write "<count> <line>" to the output file.
import itertools

filepath=r"G:我的pythonpython基础day12QQQQGoodpasssort.txt"
with open(filepath,"rb") as file:
    mylist=file.readlines()

savefilepath = r"G:我的pythonpython基础day12QQQQGoodpasssorttimes.txt"

# Precondition: the input is already sorted, so equal lines are adjacent.
# groupby only merges neighbouring equal items, which is exactly the
# run-length count wanted here.
with open(savefilepath,"wb") as save:
    for passwordstr, run in itertools.groupby(mylist):
        count = sum(1 for _ in run)
        # The line keeps its own terminator, so no newline is appended.
        save.write(str(count).encode("utf-8") + b" " + passwordstr)

5.数据分类

# Step 5 - classification: distribute the cleaned records into one file per
# account length (5..11 characters); everything else goes to a catch-all file.
QQlist=[5,6,7,8,9,10,11,"小垃圾"]
filepath = r"G:我的pythonpython基础day12QQQQGood.txt"
with open(filepath,"rb") as file:
    mylist = file.readlines()

# One output file per entry in QQlist, kept in the same order.
filelist=[]
try:
    for i in QQlist:
        # Backslashes are escaped explicitly: the original literal ended in
        # \" which escaped the closing quote and made the line a SyntaxError.
        QQfilepath = "G:\\我的python\\python基础\\day12\\QQ\\QQ位数分类\\"+str(i)+"位QQ.txt"
        filelist.append(open(QQfilepath,"wb"))

    # Map each category to its file. The last entry is the catch-all; its
    # string key can never equal an integer length, so it is reached only
    # through the .get() default below.
    file_by_length = dict(zip(QQlist, filelist))
    fallback = filelist[-1]

    for line in mylist:
        # The account is the text before the first "----" separator.
        account = line.decode("utf-8").split("----")[0]
        # Write the original bytes, not the decoded copy.
        file_by_length.get(len(account), fallback).write(line)
finally:
    # Close every file that was opened, even if an error interrupted the loop.
    for  QQfile  in filelist:
        QQfile.close()
原文地址:https://www.cnblogs.com/wang102030/p/9353593.html