Crawling Douban's paginated photo lists and downloading the photos

from bs4 import BeautifulSoup   # import the BeautifulSoup library
import requests                 # import requests
import os                       # os for file and directory handling
import pandas as pd
import csv
import codecs
import re
import xlwt                     # Excel operations
import time
# Example 1: scrape the 3D lottery results table and save it as a CSV file
# lst = []
# url = 'http://kaijiang.zhcw.com/zhcw/html/3d/list_1.html'
# r = requests.get(url)
# r.encoding = 'utf-8'
# soup = BeautifulSoup(r.text, "html.parser")
# tbody = soup.find('tbody', id="tdata")
# tr = tbody.find_all('tr')
# for row in tr:                      # iterate over every result row on this page
#     td = row.find_all('td')
#     lst.append([td[0].text, td[1].text, td[2].text, td[3].text,
#                 td[4].text, td[5].text, td[6].text, td[7].text])
# # open the file once, write the header row, then all collected rows
# with open("Lottery_data.csv", 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(['Draw number', 'Number 1', 'Number 2', 'Number 3',
#                      'Number 4', 'Number 5', 'Number 6', 'Number 7'])
#     writer.writerows(lst)
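
# Optional sanity check: pandas is already imported above, so the saved file can be inspected
# with it (this assumes Lottery_data.csv exists after running the example):
# df = pd.read_csv("Lottery_data.csv")
# print(df.head())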

# Example 2: fetch the photo page and collect the list of image links
# url = "https://movie.douban.com/celebrity/1011562/photos/"
# fake_headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
# }
# response = requests.get(url, headers=fake_headers)  # send a browser-like User-Agent with the request
# soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
# datas = soup.find_all('div', class_="cover")
# picture_list = []
# for d in datas:
#     plist = d.find('img')['src']
#     picture_list.append(plist)
# print(picture_list)
# Sample output:
# ['https://img1.doubanio.com/view/photo/m/public/p2564834267.jpg', 'https://img1.doubanio.com/view/photo/m/public/p2382591759.jpg']
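
# A minimal offline sketch of the same extraction; the HTML fragment below is made up to mimic
# Douban's photo-grid markup, so only the parsing logic is illustrative:
# sample_html = '''
# <div class="cover"><a href="#"><img src="https://img1.doubanio.com/view/photo/m/public/p2564834267.jpg"></a></div>
# <div class="cover"><a href="#"><img src="https://img1.doubanio.com/view/photo/m/public/p2382591759.jpg"></a></div>
# '''
# sample_soup = BeautifulSoup(sample_html, 'html.parser')
# print([d.find('img')['src'] for d in sample_soup.find_all('div', class_="cover")])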

# Example 3: handling pagination of the photo list
# Pagination: click page 2 and watch how the browser URL changes:
# https://movie.douban.com/celebrity/1011562/photos/?type=C&start=30&sortby=like&size=a&subtype=a
# Only the start parameter changes, so it is the variable; the other parameters can stay fixed.
# start behaves like a page number: start = 30 is page 2, start = 60 is page 3, and so on, up to start = 420 for the last page.
# With that, the pagination code almost writes itself.
# First, wrap the HTML-processing code above into a function.
def get_poster_url(url):
    print("url is:", url)
    fake_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    response = requests.get(url, headers=fake_headers)  # send a browser-like User-Agent with the request
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    datas = soup.find_all('div', class_="cover")
    picture_list = []
    for d in datas:
        plist = d.find('img')['src']
        picture_list.append(plist)
    return picture_list
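
# A quick manual check of the function (commented out so it does not fire an extra request on
# every run); start=0 is the first page, matching the loop below:
# first_page = "https://movie.douban.com/celebrity/1011562/photos/?type=C&start=0&sortby=like&size=a&subtype=a"
# print(get_poster_url(first_page))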

# Then handle pagination in another function that calls the one above
# def fire():
#     page = 0
#     for i in range(0, 450, 30):
#         print("Crawling page %s" % page)
#         url = "https://movie.douban.com/celebrity/1011562/photos/?type=C&start={}&sortby=like&size=a&subtype=a".format(i)
#         data = get_poster_url(url)   # get_poster_url fetches the page itself, so pass it the URL directly
#         page += 1
# At this point all the poster links are collected in the data variable, so we now need a downloader to save the posters.
# A quick refresher on range(), which drives the pagination loop (in Python 3 wrap the call in list() to see the values):
# >>> list(range(10))        # 0 up to, but not including, 10
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# >>> list(range(1, 11))     # 1 up to, but not including, 11
# [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# >>> list(range(0, 30, 5))  # step of 5
# [0, 5, 10, 15, 20, 25]
# >>> list(range(0, 10, 3))  # step of 3
# [0, 3, 6, 9]
# >>> list(range(0, -10, -1)) # negative step
# [0, -1, -2, -3, -4, -5, -6, -7, -8, -9]
# >>> list(range(0))
# []
# >>> list(range(1, 0))
# []
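
# As a concrete illustration, the start values and page URLs generated by the pagination loop
# can be previewed like this (commented out so it does not print during the real crawl):
# for start in range(0, 450, 30):
#     print("https://movie.douban.com/celebrity/1011562/photos/?type=C&start={}&sortby=like&size=a&subtype=a".format(start))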

def download_picture(pic_l):
    if not os.path.exists(r'picture'):
        os.mkdir(r'picture')
    for i in pic_l:
        pic = requests.get(i)
        p_name = i.split('/')[7]  # e.g. p2385103173.jpg
        # split() slices the string on the given separator and returns a list, e.g. for
        # i = https://img2.doubanio.com/view/photo/m/public/p2385103173.jpg
        # i.split('/') gives ['https:', '', 'img2.doubanio.com', 'view', 'photo', 'm', 'public', 'p2385103173.jpg']
        with open(os.path.join('picture', p_name), 'wb') as f:
            f.write(pic.content)

# Finally, modify the fire function so it calls the downloader. To avoid hitting Douban with
# overly frequent requests, sleep for 1 second between pages.
def fire():
    page = 0
    for i in range(0, 450, 30):
        print("Crawling page %s" % page)
        url = "https://movie.douban.com/celebrity/1011562/photos/?type=C&start={}&sortby=like&size=a&subtype=a".format(i)
        data = get_poster_url(url)
        download_picture(data)
        page += 1
        time.sleep(1)

if __name__ == '__main__':
    fire()
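
# A slightly more defensive variant of the downloader, if needed; download_picture_safe, the
# basename-based filename, and the status-code check are additions of this write-up, not part
# of the original script:
# def download_picture_safe(pic_l):
#     if not os.path.exists('picture'):
#         os.mkdir('picture')
#     for link in pic_l:
#         p_name = os.path.basename(link)            # filename part of the URL, e.g. p2385103173.jpg
#         target = os.path.join('picture', p_name)
#         if os.path.exists(target):                 # skip files that were already downloaded
#             continue
#         pic = requests.get(link)
#         if pic.status_code == 200:                 # only save successful responses
#             with open(target, 'wb') as f:
#                 f.write(pic.content)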
Original post: https://www.cnblogs.com/yszr/p/15588609.html