漫画批量下载

#更新记录20180630----------

1. 添加已存在文件的跳过逻辑

2. 添加去除文件路径名中非法字符的逻辑

# -*- coding: utf-8 -*-
# @Time : 2018/11/16 10:02 AM
# @Author : cxa
# @File : cosmic.py
# @Software: PyCharm
# !/usr/bin/env python
# encoding: utf-8
from requests_html import HTMLSession
import aiohttp
import asyncio
import hashlib
import os
from traceback import format_exc
import base64
from cryptography.fernet import Fernet
# 文件下载也要是异步
import aiofiles
import multiprocessing
from tomorrow import threads
from retrying import retry

# Pool size derived from the CPU count (2 * cores + 1). In the visible code it
# is only referenced by the commented-out ThreadPool snippet in the main block.
workers = multiprocessing.cpu_count() * 2 + 1
# First index substituted into main_url (inclusive).
strat_num = 227002
# Last index substituted into main_url (inclusive).
end_num = 250606
# Base64 text; aes_cbc_decrypt() base64-decodes it and uses the result as the
# Fernet key.
key = "X0JxSkg4NFVBQVBPODlUM0VzT1liNnloeWtLcndkSldRT2xURzQ4MEM5RT0="
# XPath for the `file` attribute of the last <option> in the page selector;
# spider() takes the part before the first '.' as the page count.
page_num_xpath = "//p[@class='selectpage']/select[@id='page_number']/option[last()]/@file"
# XPath for the thumbnail image `src`; spider() uses its second-to-last path
# segment as the comic id.
page_id_xpath = "//img[@id='thumbnail']/@src"


def aes_cbc_decrypt(message):
    """Decrypt a Fernet token with the module-level ``key`` and return text.

    The module-level ``key`` is base64-decoded and the resulting UTF-8 string
    is handed to ``Fernet`` as its key. ``message`` is formatted to a string,
    encoded as UTF-8 and decrypted.

    :param message: the Fernet token to decrypt (normally a ``str``).
    :return: the decrypted payload decoded as UTF-8 text.
    """
    cipher = Fernet(base64.b64decode(key).decode("utf8"))
    token = "{}".format(message).encode("utf8")
    plaintext = cipher.decrypt(token)
    return plaintext.decode("utf8")


# XPath for the comic title (the page <title> text).
cosmic_name = "//head//title/text()"
# XPath for the `src` of the currently displayed picture.
cosmic_id = "//img[@id='curPic']/@src"
# Decrypted URL template for a comic's landing page; it takes one positional
# placeholder, the comic index.
main_url = aes_cbc_decrypt(
    "gAAAAABbNdhqCnxkaJwZ2VL7HUXne_IOic-NsHtE30W-J68oecVmgm0dzO_lLXgTlI7a5_NbUWlkGm7FqLwY81XIBddNWbac4rCgBA9NFAECsNISkhTvdRl4uDSaS6bHY8sbcJJwO13Z")
# One landing-page URL per index in [strat_num, end_num].
cosmic_urllist = [main_url.format(i) for i in range(strat_num, end_num + 1)]
# XPath for the total page count text.
pagenum_xpath = "//font[@id='TotalPage']/text()"
# Decrypted URL template for a single image; spider() fills it with
# (comic id, page index, file extension such as "jpg").
full_url = aes_cbc_decrypt(
    "gAAAAABbNdk5FLeX55hOiDAXxgCwwYmGrokYvU3Nd1AOYuOE7OdIEcBdAmSG_Q3kOltealBKMOgUBKDuPUJtzFFPwqoxL-FUip"
    "VNQU-JmBW_K5qxgzTQ3IOla_F61Rscy0fJOaN-mEXKPqrakctyDRN7OVm1LARTMhylQELLuBnJgIT4WXilchg=")
# Shared requests_html session used for the landing-page requests. It is
# created once; the original built a second, identical session right after the
# semaphore and discarded the first.
session = HTMLSession()
# Limits the number of image downloads that run concurrently to 5.
sema = asyncio.Semaphore(5)

async def getbuff(url, c_name):
    """Download one image and pass its bytes on to ``getimg``.

    The URL is fetched with a fresh ``aiohttp`` client session. When the
    response body is empty, ".jpg" in the URL is replaced by ".png" and the
    request is made once more; whatever the second request returns is used.

    :param url: image URL to download.
    :param c_name: comic title, forwarded to ``getimg``.
    """
    async with aiohttp.ClientSession() as client:
        async with client.get(url, timeout=60) as first_resp:
            payload = await first_resp.read()
            if not payload:
                # Empty body: retry the same page with a .png extension.
                url = url.replace(".jpg", ".png")
                async with client.get(url, timeout=60) as second_resp:
                    payload = await second_resp.read()
            await getimg(url, payload, c_name)


async def run(url, c_name):
    """Download one image while holding the module-level semaphore.

    ``async with`` replaces the original ``with (await sema)`` form, which was
    deprecated in Python 3.7 and removed in Python 3.9 (it raises TypeError
    there). ``async with`` is supported on every Python version that has
    ``async def``.

    :param url: image URL to download.
    :param c_name: comic title, forwarded to ``getbuff``.
    """
    async with sema:
        await getbuff(url, c_name)


#
@threads(30)
@retry(stop_max_attempt_number=5)
def asyc_get_req(url):
    """Fetch ``url`` with the shared session and return the response.

    The call is wrapped by ``retry(stop_max_attempt_number=5)`` and by
    ``threads(30)``.
    NOTE(review): presumably ``threads`` runs the call in a 30-thread pool and
    returns a future-like proxy -- confirm against the tomorrow library.

    :param url: page URL to request (15 second timeout).
    :return: the response object when its status code is 200.
    :raises ValueError: when the status code is anything other than 200.
    """
    response = session.get(url, timeout=15)
    if response.status_code != 200:
        raise ValueError("访问出错")
    return response


def spider(req):
    """Parse one comic landing page and download all of its images.

    The title, comic id and page count are read from the page via XPath, one
    image URL is built per page, and the downloads are run to completion on
    the current event loop. Any error is printed as a traceback and swallowed,
    so a single bad comic does not stop the batch.

    :param req: response object for the landing page; nothing is done unless
        ``req.status_code`` is 200.
    :return: None.
    """
    try:
        if req.status_code != 200:
            return
        root = req.html
        name = root.xpath(cosmic_name)[0]
        print(name)
        # Appends the raw HTML of every processed page to 1.html.
        with open("1.html", 'a', encoding='utf-8') as fs:
            fs.write(req.text)
        comic_id = root.xpath(page_id_xpath)[0].split('/')[-2]
        max_page = root.xpath(page_num_xpath)[0].split('.')[0]
        full_urllist = [full_url.format(comic_id, i, "jpg") for i in range(1, int(max_page) + 1)]
        if not full_urllist:
            # asyncio.wait() raises ValueError on an empty collection.
            return
        event_loop = asyncio.get_event_loop()
        # Wrap the coroutines in tasks explicitly: passing bare coroutines to
        # asyncio.wait() is rejected on Python 3.11+.
        tasks = [event_loop.create_task(run(url, name)) for url in full_urllist]
        event_loop.run_until_complete(asyncio.wait(tasks))
        # asyncio.wait() never re-raises task errors; report them so failed
        # downloads are not silently lost.
        for url, task in zip(full_urllist, tasks):
            if task.cancelled():
                continue
            error = task.exception()
            if error is not None:
                print("{} -> {!r}".format(url, error))
    except Exception:
        # Exception (not a bare except) so Ctrl-C still stops the script.
        print(format_exc())


async def getimg(url, buff, c_name):
    """Write one downloaded image into the comic's directory.

    Images go to ``<cwd>/comics_images/<title>/``. When that directory cannot
    be created (for example the title is too long or is not a valid path
    name), ``<cwd>/comics_images/<md5 of title>/`` is used instead and the
    real title is stored there in ``title.txt``. A file that already exists is
    left untouched.

    Fixes over the previous version:
    * files are opened with ``async with`` so the handles are closed;
    * in the md5 fallback directory each image keeps its own file name (the
      md5 string used to be the file name for every page, so only one image
      was ever saved) and later pages are written to the fallback directory
      as well;
    * an empty buffer is skipped instead of producing a zero-byte file that
      would block a later re-download;
    * ``title.txt`` is written as UTF-8 regardless of the platform default.

    :param url: image URL; its last path segment is used as the file name.
    :param buff: image content as bytes.
    :param c_name: comic title, used as the directory name.
    :return: None.
    """
    if not buff:
        return

    base_dir = os.path.join(os.getcwd(), "comics_images")
    # Directory named after the title.
    title_dir = os.path.join(base_dir, c_name)
    # Fallback directory named after the md5 of the title.
    md5name = hashlib.md5(c_name.encode("utf-8")).hexdigest()
    hashed_dir = os.path.join(base_dir, md5name)
    file_name = url.split('/')[-1]

    if os.path.isdir(title_dir):
        target_dir = title_dir
    elif os.path.isdir(hashed_dir):
        target_dir = hashed_dir
    else:
        try:
            os.makedirs(title_dir)
            target_dir = title_dir
        except OSError:
            os.makedirs(hashed_dir, exist_ok=True)
            target_dir = hashed_dir
            title_file = os.path.join(hashed_dir, "title.txt")
            async with aiofiles.open(title_file, 'w', encoding="utf-8") as fs:
                await fs.write(c_name)

    # Skip files that were already downloaded.
    image_path = os.path.join(target_dir, file_name)
    if not os.path.exists(image_path):
        async with aiofiles.open(image_path, 'wb') as f:
            await f.write(buff)


if __name__ == '__main__':
    # Submit every landing-page request first, then process the responses in
    # submission order.
    req_list = [asyc_get_req(page_url) for page_url in cosmic_urllist]

    for pending in req_list:
        spider(pending)

  

原文地址:https://www.cnblogs.com/c-x-a/p/9243511.html