动漫电脑壁纸_爬虫

github项目地址

无意中发现一个好网站,上面有很多美丽的动漫壁纸。

图片太多,手动保存太慢,遂写此程序。

源代码:

import os #引入文件模块
import re #正则表达式
import urllib.request

#连接网页并返回源码
def open_url(url):
      try:
            req = urllib.request.Request(url)
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
            response = urllib.request.urlopen(req)
            # status_code = response.code
            html = response.read()
            return html
      except:
            print(url + " 404 网页丢失,请稍后再试!")
            return 404

def main():
    dongman_url = 'https://www.dongmanxingkong.com/category/pic/wallpaper/page/1'
    dongman_url0 = 'https://www.dongmanxingkong.com/category/pic/wallpaper/page/'
    add_urls = [] # 网页列表
    paper_urls = [] # 壁纸地址列表
    img_num = 1 # 图片序列号
    os.chdir('PC_paper') # 转移到图片防止目录
    for i in range(1,4): # 搜集网页
        dongman_url = dongman_url0 + str(i)
        dongman_html = open_url(dongman_url)
        dongman_html = dongman_html.decode('utf-8')
        # 正则表达式匹配
        add_url = re.findall(r'class="post-title"><a href="([^"]+.html)" title="【电脑壁纸】',dongman_html)
        print(len(add_url))# 输出当前网页
        add_urls.extend(add_url) # 将子网页添加到列表中
    print(add_urls) # 输出列表
    print(len(add_urls)) # 列表长度
    for i in add_urls: # 从网页列表中搜集图片源地址
        print(i)

        paper_html = open_url(i)
        paper_html = paper_html.decode('utf-8')
        paper_url = re.findall(r'电脑壁纸 " src="([^"]+.jpg)"',paper_html)
        paper_urls.extend(paper_url) # 将所有地址存放到列表中
        print(paper_url)

    print(paper_urls)
    print('' + str(len(paper_urls)) + '张,现在开始下载图片,请勿关闭程序!')
    # 开始保存图片
    for i in paper_urls:
        file_name = str(img_num) + '.jpg'
        img_html = open_url(i)
        with open(file_name, 'wb') as f:
            f.write(img_html)
        img_num = img_num + 1


if __name__ == '__main__':
    main()

原文地址:https://www.cnblogs.com/jxxclj/p/9650381.html