Crawling all wallpapers from a wallpaper site with Python

 import requests as r
 from bs4 import BeautifulSoup
 import os

 base_url = "http://www.win4000.com"  # site root
 theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_"
 # Build the list of theme page URLs with a list comprehension
 theme_url_list = [theme_base_url + str(x) + ".html" for x in range(1, 6)]

 # Links to every picture series, filled in by get_series_url_lists()
 series_url_list = []

 # Fake a browser User-Agent so the site does not reject the requests
 headers = {
     "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0"
 }

 # Collect all series links from one theme page
 def get_series_url_lists(url, headers):
     resp = r.get(url, headers=headers)
     if resp is not None:
         result = resp.text
         bs = BeautifulSoup(result, "html.parser")
         ul = bs.find("div", attrs={"class": "tab_tj"})
         a_s = ul.find_all("a")
         for a in a_s:
             series_url_list.append(a.get("href"))

 # Root folder for downloads
 save_root_dir = os.path.join(os.getcwd(), "tmp/")

 # Fetch every picture in one series, walking its pages until a 404
 def fetch_all_series_pic(url, headers):
     cur_page = 1
     while True:
         current_url = url
         if cur_page > 1:
             # Pages 2+ append "_<page>" before ".html" (see the short demo after the script)
             current_url = url.replace(".html", "_" + str(cur_page) + ".html")
         resp = r.get(current_url, headers=headers)
         # Stop once the page no longer exists (HTTP 404)
         if resp.status_code == 404:
             break
         if resp is not None:
             bs = BeautifulSoup(resp.text, "lxml")
             # Use the page title as the folder name
             title_name = bs.find("div", attrs={"class": "ptitle"}).h1.text
             save_dir = os.path.join(save_root_dir, title_name)
             if not os.path.exists(save_dir):
                 os.makedirs(save_dir)
             # Select the image nodes with a CSS selector
             imgs = bs.select("img.pic-large")
             for img in imgs:
                 download_pic(img.attrs.get("src"), save_dir)
             cur_page += 1

 # Download a single picture
 def download_pic(url, path):
     print("Downloading: " + url)
     try:
         # Split the URL on "/" and take the last element as the file name
         pic_name = url.split("/")[-1]
         # .content returns raw bytes, .text returns Unicode (str);
         # pictures are binary files, so use .content
         img_resp = r.get(url).content
         with open(path + "/" + pic_name, "wb+") as f:
             f.write(img_resp)
     except Exception as reason:
         print(str(reason))

 if __name__ == "__main__":
     for url in theme_url_list:
         get_series_url_lists(url, headers)
     for url in series_url_list:
         fetch_all_series_pic(url, headers)
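For clarity, the pagination in fetch_all_series_pic simply rewrites the first-page URL of a series: page n (for n > 1) is reached by replacing ".html" with "_n.html". A minimal sketch of that rewrite, using a made-up series URL purely for illustration:

 # Hypothetical series URL, only to show the pagination rewrite
 first_page = "http://www.win4000.com/wallpaper_detail_12345.html"
 for page in range(1, 4):
     page_url = first_page if page == 1 else first_page.replace(".html", "_" + str(page) + ".html")
     print(page_url)
 # http://www.win4000.com/wallpaper_detail_12345.html
 # http://www.win4000.com/wallpaper_detail_12345_2.html
 # http://www.win4000.com/wallpaper_detail_12345_3.html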
Original post: https://www.cnblogs.com/zoutingrong/p/13739719.html