用 Python 爬取校花网

import requests
import re
import hashlib,time

def get_index(url):
    """Download a listing page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the status code is 200,
        otherwise ``None``.
    """
    page = requests.get(url)
    return page.text if page.status_code == 200 else None

def parse_index(res):
    """Extract detail-page links from the HTML of a listing page.

    Args:
        res: HTML text of a listing page, or ``None`` when the page could
            not be fetched.

    Returns:
        A list with the href value that follows each ``class="items"``
        marker; empty when ``res`` is ``None``/empty or has no such marker.
    """
    # get_index yields None on a non-200 response; re.findall would raise
    # TypeError on it, so treat a missing page as a page without links.
    if not res:
        return []
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)

def get_detail(urls):
    """Visit each detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page links. Links that do not start with
            ``http`` are prefixed with the site root before fetching.
    """
    for link in urls:
        page_url = link if link.startswith('http') else 'http://www.xiaohuar.com%s' % link
        detail = requests.get(page_url)
        if detail.status_code != 200:
            continue
        # Only the first src found after an id="media" marker is downloaded.
        sources = re.findall(r'id="media".*?src="(.*?)"', detail.text, re.S)
        if not sources:
            continue
        save(sources[0])

def save(url):
    """Download a video and write it to disk under a generated name.

    The file name is the MD5 hex digest of the URL combined with the
    current timestamp, so the same URL fetched at different times gets
    different names.

    Args:
        url: Direct address of the video file.
    """
    import os  # local import: only this function needs it

    print('Download:%s' % url)
    r2 = requests.get(url)
    if r2.status_code != 200:
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = r'D:\爬虫视频\%s' % filename

    # open() does not create missing folders, so make sure the target
    # folder exists. dirname is empty where backslash is not a path
    # separator; in that case there is nothing to create.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        f.write(r2.content)

def main():
    """Crawl listing pages 0 through 4 one after another and save their videos."""
    for page_no in range(5):
        index_url = 'http://www.xiaohuar.com/list-3-%s.html' % page_no
        # fetch listing -> extract detail links -> download each video
        get_detail(parse_index(get_index(index_url)))


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

基于上面的代码,使用多线程爬取视频,以优化下载速度

# 异步,多线程优化下载速度

import requests
import re
import hashlib,time
from concurrent.futures import ThreadPoolExecutor

# Shared thread pool (at most 30 workers) that main and parse_index submit tasks to.
p=ThreadPoolExecutor(30)

def get_index(url):
    """Fetch a listing page and hand back its HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The body text for a 200 response, ``None`` for any other status.
    """
    reply = requests.get(url)
    if reply.status_code != 200:
        return None
    return reply.text

def parse_index(res):
    """Parse a fetched listing page and queue one task per detail link.

    Written to be used as a future done-callback.

    Args:
        res: A completed future whose result is the listing page HTML, or
            ``None`` when the fetch did not return status 200.
    """
    html = res.result()
    # A failed fetch yields None; re.findall would raise TypeError on it,
    # so a page that could not be fetched is simply skipped.
    if not html:
        return
    for url in re.findall(r'class="items".*?href="(.*?)"', html, re.S):
        # Each link is submitted on its own so pages download in parallel.
        p.submit(get_detail, url)

def get_detail(urls):
    """Visit detail page(s) and download the video each one embeds.

    Args:
        urls: Either a single detail-page link (a ``str``) or an iterable
            of links. Links that do not start with ``http`` are prefixed
            with the site root before fetching.
    """
    # parse_index submits one link string per task. Iterating a str walks
    # it character by character, which would request one bogus URL per
    # character, so wrap a lone link in a list first.
    if isinstance(urls, str):
        urls = [urls] if urls else []

    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        r1 = requests.get(url)
        if r1.status_code != 200:
            continue
        # Only the first src found after an id="media" marker is downloaded.
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            save(url_list[0])

def save(url):
    """Download a video and write it to disk under a generated name.

    The file name is the MD5 hex digest of the URL combined with the
    current timestamp, so the same URL fetched at different times gets
    different names.

    Args:
        url: Direct address of the video file.
    """
    import os  # local import: only this function needs it

    print('Download:%s' % url)
    r2 = requests.get(url)
    if r2.status_code != 200:
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = r'D:\爬虫视频\%s' % filename

    # open() does not create missing folders, so make sure the target
    # folder exists. exist_ok=True keeps this safe when several worker
    # threads reach this point at once. dirname is empty where backslash
    # is not a path separator; in that case there is nothing to create.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        f.write(r2.content)

def main():
    """Fetch listing pages 0 through 4 concurrently and download their videos.

    The listing pages are fetched on the shared pool ``p``. As each fetch
    completes, its page is parsed in the calling thread, which queues the
    detail-page tasks on the same pool. The function then blocks until
    every queued task has finished and shuts the pool down, so it is meant
    to be called once per process.
    """
    from concurrent.futures import as_completed

    index_futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]

    # Parse here instead of via add_done_callback: if main returned right
    # after submitting, interpreter shutdown would begin while fetches are
    # still running, and the executor rejects submissions made after that
    # point, so no detail page would ever be queued.
    for future in as_completed(index_futures):
        try:
            parse_index(future)
        except Exception as exc:
            # One broken listing page must not abort the remaining ones.
            print('Index page failed: %r' % exc)

    # All submissions are done; wait for the queued downloads to finish.
    p.shutdown(wait=True)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/shenbuer/p/7824422.html