# pressmu spider

#!/usr/bin/env python  
# encoding: utf-8
import requests
from random import choice
from lxml import html
from urllib.parse import urljoin,quote
import os
import time
# Cache of tag (category) name -> tag page URL, filled from the tag index page.
NAMEURLDIC = {}
# Cache of video title -> video detail page URL.
NAMEURLDIC_L2 = {}
# Pool of well-known crawler user-agent strings; one is picked at random per
# run so the spider does not always present the same identity.
ualist = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)",
]
ua = choice(ualist)
# BUG FIX: the HTTP header field is "User-Agent" (hyphen). The original
# "User_Agent" (underscore) is sent verbatim as an unknown header, so the
# server never saw the chosen user-agent string.
header = {"User-Agent": ua}
mailurl = "https://press.mu"
url = "https://press.mu/tag"
# Search-result page template: {category}, {page number}.
searc_url = "https://press.mu/search/{}?p={}"
def getpage(url):
    """Fetch *url* and return the Response with its encoding fixed, or None.

    Network/HTTP failures are deliberately swallowed (best-effort crawl);
    callers treat a None return as "page unavailable".

    Args:
        url: absolute URL to fetch.

    Returns:
        requests.Response on success, None on any network-level failure.
    """
    try:
        req = requests.get(url=url, headers=header, stream=True)
        # Let requests sniff the real encoding from the body so .text decodes
        # correctly even when the server omits/mislabels the charset.
        req.encoding = req.apparent_encoding
        return req
    except requests.RequestException:
        # Narrowed from a bare ``except``: only network errors stay
        # best-effort; programming errors now surface instead of hiding.
        return None
def parse(url):
    """Download *url* and return the parsed lxml HTML root element.

    Args:
        url: absolute URL of the page to parse.

    Raises:
        ValueError: if the page could not be fetched or the body is empty.
            (The original crashed with AttributeError when getpage returned
            None, or UnboundLocalError when the body was empty, because
            ``root`` was only assigned inside the ``if``.)
    """
    req = getpage(url)
    if req is None:  # getpage swallows network errors and returns None
        raise ValueError(f"failed to fetch {url}")
    source = req.text
    if not source:
        raise ValueError(f"empty response from {url}")
    return html.fromstring(source)
def buff(url):
    """Fetch *url* for binary download; thin alias for :func:`getpage`.

    Returns the (possibly None) Response whose ``.content`` the caller
    writes to disk. The original assigned a dead local ``buff = None``
    that shadowed the function's own name; it has been removed.
    """
    return getpage(url)
def save_file(title, url, type="m3u8"):
    """Download *url* and save it as ``./pressimg/<title>.<type>``.

    Args:
        title: basename for the saved file. Assumed to contain no path
            separators -- TODO confirm scraped titles are safe filenames.
        url: resource URL to download via :func:`buff`.
        type: file extension. Name kept for caller compatibility even
            though it shadows the ``type`` builtin.
    """
    # exist_ok replaces the original check-then-mkdir, which raced if the
    # directory appeared between the exists() test and mkdir().
    os.makedirs("pressimg", exist_ok=True)
    with open(f'./pressimg/{title}.{type}', "wb") as fs:
        fs.write(buff(url).content)

# --- crawl driver -------------------------------------------------------
# Step 1: scrape the tag index page for every category name -> URL.
root = parse(url)
taglist = root.xpath("//section[@id='tag']/ul/li/a")
for tag in taglist:
    title = tag.xpath("./text()")[0]
    href = urljoin(mailurl, tag.xpath("./@href")[0])
    NAMEURLDIC.setdefault(title, href)

# Step 2: walk every category, page by page, downloading each video + poster.
for k, v in NAMEURLDIC.items():
    # First page = the category landing page itself.
    root = parse(v)
    # Number of videos in this category.
    v_count = root.xpath("//p[@id='hit']/strong/text()")[0]
    # Last numbered entry in the pager = highest page number.
    v_max_page_num = root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()")[0]
    print(f'当前分类为{k}:,视频件数为:{v_count}')
    for item in range(1, int(v_max_page_num) + 1):
        print(f"获取第{item}页")
        if item != 1:
            # BUG FIX: search by the CURRENT category name ``k``. The
            # original used ``title``, which after the tag loop above was
            # stuck on the LAST tag, so every category past page 1
            # searched the same term.
            root = parse(searc_url.format(quote(k.strip()), item))
        level2list = root.xpath("//section[@class='items']//h2/a")
        for level2 in level2list:
            title_level2 = level2.xpath("./text()")[0]
            href_level2 = urljoin(mailurl, level2.xpath("./@href")[0])
            NAMEURLDIC_L2.setdefault(title_level2, href_level2)
            print(title_level2, href_level2)
            # Detail page holds the actual media URLs.
            root2 = parse(href_level2)
            videourl = root2.xpath("//div[@id='player']//video/source/@src")[0]
            # Poster attribute is protocol-relative; prefix the scheme.
            imgurl = "https:" + root2.xpath("//div[@id='player']//video/@poster")[0]
            print("videourl", videourl)
            print("imgurl", imgurl)
            save_file(title_level2, videourl)
            save_file(title_level2, imgurl, "jpg")
            print("开始下载", f"{title_level2}.jpg")

  

# Source (原文地址): https://www.cnblogs.com/c-x-a/p/9055139.html