批量下载 GitHub 代码,并附带解压 zip 和遍历文件的函数

代码:

 1 # # -*- coding:utf-8 -*-
 2 # @Time : 2021/7/22 22:04 
 3 # @Author : 周博
 4 # @File : test_1.py 
 5 # @博客园: https://www.cnblogs.com/smartisn/
 6 
 7 import requests
 8 from lxml import etree
 9 import re
10 from urllib import request
11 import zipfile
12 import os
def Get_whole_file(file):
    '''
    Collect the path of every file found under ``file``, recursively.

    :param file: directory handed to ``os.walk``
    :return: list of paths, each the walked directory joined with a file
             name; sub-directories themselves are not listed
    '''
    # os.walk yields (current directory, sub-directory names, file names);
    # only the file names are turned into paths.
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(file)
        for name in names
    ]
def un_zip(zip_filename,des_dir):
    '''
    Extract a zip archive into the directory ``des_dir``.

    A corrupt or non-zip file is reported on stdout instead of raising.
    Other errors (for example a missing file) still propagate to the caller.

    :param zip_filename: path of the archive, e.g. a.zip
    :param des_dir: directory to extract into, e.g. ./文件存储/
    :return: None
    '''
    try:
        # BadZipFile is raised while the archive is being opened, so the
        # open itself has to sit inside the try, not only the extraction.
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall(des_dir)
    except zipfile.BadZipFile:
        print("Error: 压缩文件不完整:",zip_filename)
    else:
        print(zip_filename,"解压成功")
40 
def DownLoadGithub(start,end):
    '''
    Download zip archives of repositories listed on GitHub search pages.

    For each page number in ``range(start, end)`` the C# repository search
    (sorted by stars, descending) is fetched. Every repository link found on
    the page is opened, and a link scraped from that repository page is
    saved as ``./data/<repository link with "/" replaced by "_">.zip``.

    A failure on one repository is printed and skipped. A failure while
    fetching a search page itself is not caught and propagates.

    :param start: first search page number to fetch, e.g. 51
    :param end: page number to stop before (it is not fetched), e.g. 60
    :return: None
    '''
    data_dir = "./data/"
    # urlretrieve writes straight to the given path; without the directory
    # every single download would fail.
    os.makedirs(data_dir, exist_ok=True)
    for page in range(start,end):
        url = 'https://github.com/search?l=C%23&o=desc&p='+str(page)+'&q=C%23&s=stars&type=Repositories'
        print("*******************")
        print(url)
        search_page = requests.get(url, timeout=7)
        search_tree = etree.HTML(search_page.text)
        repo_links = search_tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
        for repo_link in repo_links:
            try:
                file_name = repo_link.replace("/","_")
                repo_url = "https://github.com"+repo_link
                repo_page = requests.get(repo_url, timeout=7)  # plain GET of the repository page
                repo_tree = etree.HTML(repo_page.text)
                # NOTE(review): presumably the "Download ZIP" entry of the
                # repository's code menu; the path depends on GitHub's page
                # markup, and [0] raises IndexError when nothing matches.
                href_down = repo_tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[2]/a//@href')[0]
                href_down = "https://github.com"+href_down
                print(href_down)
                zip_path = data_dir + file_name + '.zip'
                print(zip_path)
                request.urlretrieve(href_down, zip_path)
                print("下载成功")
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # repository. KeyboardInterrupt/SystemExit are not caught.
                print("下载失败:", repo_link, e)
                continue
if __name__=="__main__":
    # Author's local working directory as it appears in the post (presumably
    # a Windows path whose separators were lost):
    # E:pycharmWorkPlace.net_analyzerDownLoad_GitHubdata

    # Single-archive example:
    # un_zip("./data/_5argon_protobuf-unity.zip","./extract_data")

    # Extract every file found under ./data/ into ./extract_data; an error on
    # one file is printed and the loop carries on with the next.
    for archive_path in Get_whole_file("./data/"):
        try:
            un_zip(archive_path, "./extract_data")
        except Exception as err:
            print(err)
原文地址:https://www.cnblogs.com/smartisn/p/15047620.html