# Crawl xiaohuar.com videos asynchronously with a thread pool

import logging
import re

import requests

# Exploratory run executed at import time: download the video index page,
# then visit every linked page and pull out its <video> source address.
response = requests.get("http://www.xiaohuar.com/v/")

# Each match is the first href that follows a '<div class="items">' marker.
url_s = re.compile('<div class="items">.*?href="(.*?)"', re.S).findall(response.text)
for url in url_s:
    res = requests.get(url)
    # The extracted sources are only stored in `result`; nothing reads them.
    result = re.compile('<video id="media".*?src="(.*?)"', re.S).findall(res.text)

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and let the caller skip it.
        logging.warning("request for %s failed: %s", url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None



def parse_data(text):
    """Yield the link found in each ``<div class="items">`` entry of a page.

    Args:
        text: HTML of an index page, or a falsy value (``None`` / ``""``)
            when the page could not be downloaded.

    Yields:
        One absolute URL per entry. Empty links are skipped, and links that
        start with ``/`` get the site host prepended so they can be fetched.
    """
    if not text:
        # get_page() returns None on failure; there is nothing to parse.
        return
    for url in re.findall('<div class="items">.*?href="(.*?)"', text, re.S):
        if not url:
            continue
        if url.startswith("/"):
            # Site-relative link: make it absolute before handing it on.
            url = "http://www.xiaohuar.com" + url
        yield url

def parse_detail(text):
    """Extract the ``.mp4`` address from a detail page.

    Args:
        text: HTML of a detail page, or a falsy value when the page could
            not be downloaded.

    Returns:
        The ``src`` of the first ``<video id="media">`` tag when it ends in
        ``.mp4``; otherwise ``None``.
    """
    if not text:
        return None
    try:
        movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
    except TypeError:
        # `text` was neither str nor bytes, so it cannot be searched.
        return None
    if movie_url_list:
        movie_url = movie_url_list[0]
        if movie_url.endswith(".mp4"):
            return movie_url
    return None
import uuid
def download_movie(movie_url, timeout=30):
    """Download one video and store it under a random ``.mp4`` file name.

    Args:
        movie_url: Address of the video file; a falsy value is ignored.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. Failed downloads and failed writes are logged and skipped so
        that one bad video does not stop the crawl.
    """
    if not movie_url:
        # parse_detail() returns None when a page has no usable video link.
        return
    try:
        response = requests.get(movie_url, timeout=timeout)
        # Do not save an HTTP error page under an .mp4 name.
        response.raise_for_status()
    except requests.RequestException as exc:
        logging.warning("download of %s failed: %s", movie_url, exc)
        return
    try:
        # NOTE(review): the path has no separator after the drive letter, so
        # it looks drive-relative -- confirm the intended target folder.
        with open(r"D:spider1movies\%s.mp4" % uuid.uuid4(), "wb") as f:
            f.write(response.content)
    except OSError as exc:
        logging.warning("could not save %s: %s", movie_url, exc)







if __name__ == '__main__':
    # Synchronous crawl: index page -> detail pages -> video files.
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in range(1):
        url = base_url.format(line)
        # 1. Request the index page.
        index_text = get_page(url)
        if not index_text:
            # The index page could not be fetched; move on to the next one.
            continue
        # 2. Parse the index page for detail-page links.
        urls = parse_data(index_text)

        for detail_url in urls:
            # Visit the detail page and get its text.
            detail_text = get_page(detail_url)
            if not detail_text:
                continue

            movie_url = parse_detail(detail_text)
            # Save the video, but only when the page exposed an .mp4 link.
            if movie_url:
                download_movie(movie_url)


from concurrent.futures import ThreadPoolExecutor
# Shared worker pool used by the callbacks and the entry point below.
pool = ThreadPoolExecutor(max_workers=50)

# Exploratory run executed at import time: download the video index page,
# then visit every linked page and pull out its <video> source address.
response = requests.get("http://www.xiaohuar.com/v/")

# Each match is the first href that follows a '<div class="items">' marker.
url_s = re.compile('<div class="items">.*?href="(.*?)"', re.S).findall(response.text)
for url in url_s:
    res = requests.get(url)
    # The extracted sources are only stored in `result`; nothing reads them.
    result = re.compile('<video id="media".*?src="(.*?)"', re.S).findall(res.text)

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    The URL is printed before the request is made.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would occupy a pool worker forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    print(url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and let the callback skip it.
        logging.warning("request for %s failed: %s", url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None



def parse(res):
    """Future callback: queue a detail-page fetch for every listed entry.

    Args:
        res: Completed future whose result is the index page text, or a
            falsy value when the fetch produced nothing.
    """
    page_text = res.result()
    if not page_text:
        return
    item_link = re.compile('<div class="items">.*?href="(.*?)"', re.S)
    for link in item_link.findall(page_text):
        if not link:
            continue
        # Site-relative links need the host prepended before fetching.
        target = "http://www.xiaohuar.com" + link if link.startswith("/") else link
        pool.submit(get_page, target).add_done_callback(parse_detail)

def parse_detail(res):
    """Future callback: queue a download for the page's ``.mp4`` video.

    Args:
        res: Completed future whose result is the detail page text, or a
            falsy value when the fetch produced nothing.

    Returns:
        None. A download is submitted to the pool only when the first
        ``<video id="media">`` tag has a ``src`` ending in ``.mp4``.
    """
    text = res.result()
    if not text:
        return
    try:
        movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
    except TypeError:
        # The result was neither str nor bytes, so it cannot be searched.
        return
    if movie_url_list:
        movie_url = movie_url_list[0]
        if movie_url.endswith(".mp4"):
            pool.submit(download_movie, movie_url)
import uuid
def download_movie(movie_url, timeout=30):
    """Download one video and store it under a random ``.mp4`` file name.

    Args:
        movie_url: Address of the video file; a falsy value is ignored.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. Failed downloads and failed writes are logged and skipped so
        that one bad video does not stop the crawl.
    """
    if not movie_url:
        return
    try:
        response = requests.get(movie_url, timeout=timeout)
        # Do not save an HTTP error page under an .mp4 name.
        response.raise_for_status()
    except requests.RequestException as exc:
        logging.warning("download of %s failed: %s", movie_url, exc)
        return
    try:
        # NOTE(review): the path has no separator after the drive letter, so
        # it looks drive-relative -- confirm the intended target folder.
        with open(r"D:spider1movies\%s.mp4" % uuid.uuid4(), "wb") as f:
            f.write(response.content)
    except OSError as exc:
        logging.warning("could not save %s: %s", movie_url, exc)







if __name__ == '__main__':
    # Thread-pool crawl: each index page is fetched on a worker, and the
    # `parse` callback fans out to the detail pages and downloads.
    # NOTE(review): the main thread ends right after these submits; work
    # submitted later from callbacks may be rejected once interpreter
    # shutdown begins -- confirm on the target Python version.
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in range(2):
        url = base_url.format(line)
        # 1. Send the request on the pool; parsing happens in the callback.
        page_future = pool.submit(get_page, url)
        page_future.add_done_callback(parse)
# Original article: https://www.cnblogs.com/tangda/p/10932916.html