python 爬取电影并保存

# 爬取电影片段ts
import os
import socket
import ssl
import urllib.error
from multiprocessing import Process
from multiprocessing import Value
from time import sleep
from urllib import request

# NOTE(security): this replaces the default HTTPS context factory with the
# unverified one, so TLS certificates are not checked for any HTTPS request
# made through urllib in this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Per-request timeout in seconds. It is passed explicitly to ``urlopen``
# because a ``socket.setdefaulttimeout`` call made under the ``__main__``
# guard is not re-executed in worker processes started with ``spawn``.
DOWNLOAD_TIMEOUT = 20


class CatchVideo(object):
    """Download numbered ``.ts`` video segments into ``./video/``."""

    def __init__(self):
        # Value sent as the User-Agent header of every request.
        self.headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"
        self.url = ""

    def set_url(self, i):
        """Store the URL of segment number ``i`` in ``self.url``.

        The segment number is zero-padded to at least three digits; numbers
        of 1000 and above are written out in full, which is what the
        original separate ``%04d`` branch produced as well.
        """
        self.url = "https://youku.letv-cdn.com/2019/09/15/dR8FIvDrfTIl9Xb0/out%03d.ts" % i

    # Fetch and download a ts file
    def download_ts(self, i, timeout=DOWNLOAD_TIMEOUT):
        """Fetch ``self.url`` and write the body to ``./video/<i>.ts``.

        Args:
            i: Segment number, used (zero-padded to three digits) as the
                local file name.
            timeout: Socket timeout in seconds for the request.

        Raises:
            urllib.error.URLError: If the request fails.
            socket.timeout: If reading the response times out.
        """
        path = './video/'
        # The target directory may not exist yet; several workers may try to
        # create it at once, which ``exist_ok`` tolerates.
        os.makedirs(path, exist_ok=True)
        rq = request.Request(self.url)
        rq.add_header('User-Agent', self.headers)
        # The context manager closes the response even when read() raises
        # (the original author closed it explicitly "to avoid being banned").
        with request.urlopen(rq, timeout=timeout) as response:
            read_ts = response.read()
        with open(path + "%03d.ts" % i, "wb") as f:
            f.write(read_ts)

    def start_work(self, i, my_flag):
        """Download segment ``i``, reporting the outcome on stdout.

        Args:
            i: Segment number to download.
            my_flag: Object with a writable ``value`` attribute (a
                ``multiprocessing.Value`` in the driver below). It is set to
                1 when the first attempt fails with ``URLError``, which the
                driver treats as the signal to stop.

        A read timeout is retried once; if the retry fails as well the
        failure is printed and the method returns without setting the flag.
        """
        self.set_url(i)
        try:
            self.download_ts(i)
        except urllib.error.URLError as e:
            print(e.reason)
            my_flag.value = 1
        except socket.timeout as e2:
            # socket.timeout has no ``reason`` attribute; print the
            # exception itself.
            print(e2)
            try:
                self.download_ts(i)
            except (urllib.error.URLError, socket.timeout) as retry_error:
                print(str(i) + ".ts failed: " + str(retry_error))
            else:
                print(str(i) + ".ts success")
        else:
            print(str(i) + ".ts success")
            sleep(1)


if __name__ == '__main__':
    catch_video = CatchVideo()
    # Shared between processes; a worker sets it to 1 to request a stop.
    flag = Value("d", 0)
    socket.setdefaulttimeout(DOWNLOAD_TIMEOUT)  # socket-level timeout of 20 seconds
    for j in range(0, 200, 5):
        # Run 5 processes concurrently, then wait for the whole batch.
        p_l = [Process(target=catch_video.start_work, args=(i, flag)) for i in range(j, j + 5)]
        for p in p_l:
            p.start()
        for p in p_l:
            p.join()
        if flag.value:
            print('===============download completed!=============')
            break
import os
# Merge the ts segments into a single file with the .mp4 extension
path = './video/'


def _segment_sort_key(name):
    """Order purely numeric segment names by value, all others by name after them."""
    stem = name[:-len('.ts')]
    if stem.isdigit():
        return (0, int(stem), name)
    return (1, 0, name)


def merge_ts_segments(path=path, output_name='new.mp4'):
    """Concatenate every ``.ts`` file in ``path`` into ``output_name``.

    Segments are joined in numeric order of their file names, because
    ``os.listdir`` returns entries in arbitrary order and a plain string sort
    would place ``1000.ts`` before ``999.ts``. Only regular files ending in
    ``.ts`` are merged, so the output file (which lives in the same
    directory) and any unrelated files are never fed back into the result.
    The output is rebuilt from scratch on every call rather than appended to.

    Args:
        path: Directory holding the segments; the output is written there too.
        output_name: File name of the merged output inside ``path``.

    Returns:
        The path of the merged file.
    """
    segments = sorted(
        (
            name for name in os.listdir(path)
            if name.endswith('.ts') and os.path.isfile(os.path.join(path, name))
        ),
        key=_segment_sort_key,
    )
    output_path = os.path.join(path, output_name)
    # Open the output once instead of reopening it for every segment.
    with open(output_path, 'wb') as w:
        for name in segments:
            with open(os.path.join(path, name), 'rb') as f:
                w.write(f.read())
    return output_path


# Guarded so that importing this module (which multiprocessing does in each
# worker under the ``spawn`` start method) does not trigger a merge.
if __name__ == '__main__':
    merge_ts_segments(path)

参考:https://blog.csdn.net/qq_37251897/article/details/106174317

原文地址:https://www.cnblogs.com/hello-python2020/p/13935188.html