# 工作中用到的小脚本2 (Small scripts used at work, part 2)

import os
import shutil
from urllib.parse import *

import openpyxl
import xlrd
import xlwt

# NOTE(review): `pq` is called in searchdata() and chooseInfo() but is never
# imported in this file - presumably `from pyquery import PyQuery as pq`;
# confirm and add the import.
def eq(l):
    """Print the rows of ``data.xlsx`` whose second column is not in *l*.

    The first sheet of ``data.xlsx`` (resolved against the current working
    directory) is scanned row by row. Whenever the value in column 1 is not a
    member of *l*, that value and the value in column 0 are printed on one
    line with a tab between them.

    Args:
        l: Container of values the second column is checked against.
    """
    # NOTE(review): xlrd 2.0 and later read only the legacy .xls format, so
    # opening this .xlsx needs an older xlrd - confirm the installed version.
    sheet = xlrd.open_workbook('data.xlsx').sheet_by_index(0)
    for row in range(sheet.nrows):
        # Presumably column 1 holds a URL and column 0 an IP address.
        second = sheet.cell(row, 1).value
        first = sheet.cell(row, 0).value
        if second in l:
            continue
        print(second, "	", first)
# File-move helper
def moveFile(srcfile,dstfile):
    """Move *srcfile* to *dstfile*, creating the destination directory first.

    When *srcfile* is not an existing regular file a message is printed and
    nothing is moved.

    Args:
        srcfile: Path of the file to move.
        dstfile: Destination path. Everything before its last path separator
            is treated as the directory and is created when missing, so a
            destination that ends with a separator names a directory to move
            the file into.
    """
    if not os.path.isfile(srcfile):
        print ("%s 该文件不存在!请检查您的输入"%(srcfile))
        return

    # Split the destination into its directory part and its file name.
    fpath, _fname = os.path.split(dstfile)
    # A bare file name has an empty directory part; os.makedirs('') raises,
    # so only create the directory when there is one to create.
    if fpath and not os.path.exists(fpath):
        os.makedirs(fpath)
    shutil.move(srcfile, dstfile)

def searchdata(l, dir, info_xls=None, site_list_xlsx=None, dest_root=None):
    """Move the report files in *dir* into one folder per owning unit.

    Two workbooks are read to build lookup tables, every key in *l* is
    resolved to a unit through them, and then each regular file in *dir* is
    parsed as HTML: the text of its third ``<h1>`` decides which unit folder
    under *dest_root* the file is moved to. Files whose heading matches no
    resolved key are left where they are.

    Args:
        l: Keys (URLs, host names or IP addresses as strings) to resolve; the
            script at the bottom of this file passes the result of
            ``chooseInfo``.
        dir: Directory holding the report files.
        info_xls: Path of the first workbook. Defaults to the path that was
            previously hard-coded.
        site_list_xlsx: Path of the second workbook. Defaults to the path that
            was previously hard-coded.
        dest_root: Folder prefix (ending in a path separator) under which the
            per-unit folders are created. Defaults to the previously
            hard-coded prefix.
    """
    # Raw strings: in a normal literal "\U" starts a Unicode escape and the
    # original paths did not compile.
    if info_xls is None:
        info_xls = r'C:\Users\yxb\Downloads\汇总高危\网站基本信息20200424(1).xls'
    if site_list_xlsx is None:
        site_list_xlsx = r'C:\Users\yxb\Downloads\汇总高危\网站群网站清单_20200312入库(1).xlsx'
    if dest_root is None:
        dest_root = 'F:\\scrapy\\819\\'

    # NOTE(review): xlrd 2.0 and later read only the legacy .xls format, so
    # the second workbook (.xlsx) needs an older xlrd - confirm the version.
    workbook = xlrd.open_workbook(info_xls)
    workbook2 = xlrd.open_workbook(site_list_xlsx)
    table1 = workbook.sheet_by_index(0)
    table2 = workbook2.sheet_by_index(0)

    # First occurrence wins in both tables, matching a first-match list scan.
    unit_by_url = {}
    row_by_ip = {}
    for row in range(table1.nrows):
        # Presumably column 4 is the site URL, column 5 its IP address and
        # the last column / column 16 the owning unit - verify against the
        # workbook layout.
        site = table1.cell(row, 4).value
        unit = table1.cell(row, -1).value
        address = table1.cell(row, 5).value
        label = table1.cell(row, 16).value
        # Keep the unit next to the IP of the same row. The previous code
        # looked the unit up by row position in a list that had extra entries
        # for '*'/'无' rows, so later rows resolved to the wrong unit.
        row_by_ip.setdefault(address, (label, unit))
        if site == '*' or site == '无':
            # Placeholder URL: index the row by its column-5 value instead.
            site = address
        unit_by_url.setdefault(site, unit)
    for row in range(table2.nrows):
        unit_by_url.setdefault(table2.cell(row, 0).value, table2.cell(row, 3).value)

    # Resolve every requested key, URL table first, IP table as fallback.
    target_unit = {}
    for key in l:
        if key in unit_by_url:
            unit = unit_by_url[key]
            print(key, "	", unit)
        elif key in row_by_ip:
            label, unit = row_by_ip[key]
            # NOTE(review): the printed value comes from column 16 while the
            # folder name comes from the last column - confirm they agree.
            print(key, "	", label)
        else:
            continue
        target_unit.setdefault(key, unit)

    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            content = f.read()
        doc = pq(content)  # parse the HTML text
        title = doc("h1").eq(2).text()  # text of the third <h1>
        res = urlparse(title)
        if title in target_unit:
            key = title
        elif not res.scheme or res.scheme in ('http', 'https'):
            # Plain or http(s) URL: use the host, or the path when no host
            # was parsed.
            key = res.netloc or res.path
        else:
            # Anything else was split at its first colon by urlparse; the
            # part before the colon is the key.
            key = res.scheme
        if key in target_unit:
            moveFile(path, dest_root + target_unit[key] + "\\")
    print('操作完成')









def chooseInfo(dir):
    """Return one lookup key per report file found directly in *dir*.

    Every regular file in *dir* is read as UTF-8 HTML and the text of its
    third ``<h1>`` is taken as the site address. Each address is reduced to a
    key with ``urlparse``: the host for plain or http(s) URLs (or the path
    when no host was parsed), otherwise the part that ``urlparse`` reports as
    the scheme.

    Args:
        dir: Directory holding the report files; sub-directories are skipped.

    Returns:
        List of key strings, one per file, in directory-listing order.
    """
    titles = []
    for name in os.listdir(dir):
        path = os.path.join(dir, name)
        if os.path.isfile(path):
            with open(path, encoding="utf-8") as f:
                content = f.read()
            doc = pq(content)  # parse the HTML text
            titles.append(doc("h1").eq(2).text())  # text of the third <h1>

    listUrl = []
    for url in titles:
        res = urlparse(url)
        if not res.scheme or res.scheme in ('http', 'https'):
            # The host used to be appended to the directory listing instead
            # of the result, so every "http://host/..." title was dropped.
            listUrl.append(res.netloc or res.path)
        else:
            listUrl.append(res.scheme)
    return listUrl



# Collect the site keys of the downloaded reports, then file each report
# under its owning unit. Backslashes are doubled because "\U" starts a
# Unicode escape and a backslash before the closing quote would escape it.
report_dir = "C:\\Users\\yxb\\Downloads\\汇总高危\\总\\"
a = chooseInfo(report_dir)
searchdata(a, report_dir)
# Optional check against data.xlsx, left disabled:
#eq(a)

  

# 原文地址 (original post): https://www.cnblogs.com/kk328/p/13532163.html