彼岸网 (pic.netbian.com) wallpaper crawler
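A small crawler for the 彼岸网 (pic.netbian.com) wallpaper site: it builds the list of paginated index URLs, scrapes each index page for the per-image sub-pages, resolves the full-size image URL on each sub-page, and downloads the images to disk.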

import requests
from PIL import Image
from io import BytesIO
import re
from requests.exceptions import HTTPError, ConnectionError

root = "http://pic.netbian.com/index_%d.html"
# Common URL pattern for every index page except the first

uni = "http://pic.netbian.com"
# URL prefix for sub-pages and full-size images

AllPage = []
# Index-page URLs to crawl

AllImgHTML = []
# Partial URLs of the sub-pages behind each thumbnail, of the form /tupian/*.html

AllImgURL = []
# Partial URL of each full-size image, of the form /uploads/allimg/*.jpg


def GetPageURL(root, Start, counts):
    # Collect the URL of every index page into AllPage
    if Start == 1:
        AllPage.append("http://pic.netbian.com/index.html")
        # The home page does not follow the index_%d pattern, so add it explicitly
        for i in range(Start + 1, Start + counts):
            newURL = root.replace("%d", str(i))
            AllPage.append(newURL)
    else:
        for i in range(Start, Start + counts):
            newURL = root.replace("%d", str(i))
            AllPage.append(newURL)

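# For example (an illustrative call, not from the original post), GetPageURL(root, 1, 3)
# fills AllPage with:
#   http://pic.netbian.com/index.html
#   http://pic.netbian.com/index_2.html
#   http://pic.netbian.com/index_3.html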

def GetImgHTML(AllPage):
    # Collect the sub-page URL fragments from every index page into AllImgHTML
    for PageURL in AllPage:
        try:
            res = requests.get(PageURL)
            res.raise_for_status()
        except HTTPError:
            print("HTTP Error!")
            continue
        except ConnectionError:
            print("Failed to connect!")
            continue

        # requests decodes the response as ISO-8859-1 (no charset header), so
        # writing it back out as ISO-8859-1 restores the original bytes on disk...
        with open(r"F:\1\PageFile.txt", "w", encoding="ISO-8859-1") as PageFile:
            PageFile.write(res.text)

        # ...which can then be re-read as gbk, the site's actual encoding
        with open(r"F:\1\PageFile.txt", "r", encoding="gbk") as ReadFile:
            text = ReadFile.read()
            mid = re.split('"', text)
            # Split on double quotes so each attribute value becomes its own token
            for i in mid:
                ImgHTML = re.findall(r"^/tupian/.*\.html$", i)
                # Keep every token that matches the sub-page pattern
                if len(ImgHTML) != 0:
                    AllImgHTML.append(ImgHTML[0])

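# A note on the encoding dance above: a simpler sketch (my variant, not the
# original author's code) skips the temp file by telling requests the page is
# gbk before decoding, assuming pic.netbian.com really serves gbk:
#
#   res = requests.get(PageURL)
#   res.raise_for_status()
#   res.encoding = "gbk"   # override the ISO-8859-1 default
#   mid = re.split('"', res.text)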

def GetImgURL():
    # Resolve the full-size image URL on every sub-page and return the list
    UsefulImgHTML = [uni + part for part in AllImgHTML]
    # After joining, each usable sub-page URL has the form http://pic.netbian.com/tupian/*.html

    for html in UsefulImgHTML:
        # Request each sub-page
        try:
            htmlres = requests.get(html)
            htmlres.raise_for_status()
        except HTTPError:
            print("HTTP Error!")
            continue
        except ConnectionError:
            print("Failed to connect!")
            continue

        # Same encoding round trip as in GetImgHTML: dump as ISO-8859-1, re-read as gbk
        with open(r"F:\1\ImgHTML.txt", "w", encoding="ISO-8859-1") as ImgHTML:
            ImgHTML.write(htmlres.text)

        with open(r"F:\1\ImgHTML.txt", "r", encoding="gbk") as ReadHTML:
            text = ReadHTML.read()
            mid = re.split('"', text)

            for i in mid:
                ImgURL = re.search(r"^/uploads/allimg/.*\.jpg$", i)
                if ImgURL is not None:
                    # Stop at the first full-size image URL on the page; its
                    # partial URL (/uploads/allimg/*.jpg) goes into AllImgURL
                    AllImgURL.append(ImgURL[0])
                    break

    UsefulImgURL = [uni + part for part in AllImgURL]
    # Join to get the final downloadable URLs
    return UsefulImgURL

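# An alternative sketch for the quote-splitting approach (my own assumption,
# not the original code): search the whole page at once for the first image
# path, which avoids the token loop entirely:
#
#   match = re.search(r'/uploads/allimg/[^"]+\.jpg', text)
#   if match is not None:
#       AllImgURL.append(match.group(0))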

def DownloadWallpaper(url, path):
    try:
        res = requests.get(url)
        res.raise_for_status()
        MyImage = Image.open(BytesIO(res.content))
        MyImage.save(path)
        print("Done...")
    except HTTPError:
        print("HTTP Error!")
    except ConnectionError:
        print("Failed to connect!")


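# DownloadWallpaper re-encodes each file through PIL; a leaner sketch (my own
# variant, assuming the server returns the raw JPEG bytes) streams straight
# to disk without decoding the image at all:
#
#   with requests.get(url, stream=True) as res:
#       res.raise_for_status()
#       with open(path, "wb") as f:
#           for chunk in res.iter_content(chunk_size=8192):
#               f.write(chunk)
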
if __name__ == "__main__":
    GetPageURL(root, 1, 2)
    GetImgHTML(AllPage)
    UsefulImgURL = GetImgURL()

    # Save each image as F:\1\<index>.jpg
    UsefulSavePath = ["F:\\1\\" + str(i) + ".jpg" for i in range(len(UsefulImgURL))]
    for i in range(len(UsefulImgURL)):
        print(i, end=" ")
        DownloadWallpaper(UsefulImgURL[i], UsefulSavePath[i])
    print("Task completed!")
Original post: https://www.cnblogs.com/liyu8/p/14243771.html