'''
爬虫百度图片
'''
import re
import os
import time
import requests
from selenium import webdriver
#########################
###此段代码不需要关心啥意思###
#########################
# Create the download directory for saved images.
# exist_ok=True avoids the check-then-create (TOCTOU) race of the original
# `if not os.path.exists(...): os.mkdir(...)` and is a no-op if it exists.
os.makedirs('百度图片', exist_ok=True)
#####################
###限制30张图片的代码###
####################
# # 获取所有图片
# response = requests.get(
# 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=美女')
# data = response.text
# img_desc_dics = re.findall("app.setData(('imgData.*?));", data, re.S)[0]
# img_desc_dics = eval(str(img_desc_dics))
#
# # 获取所有图片的数据
# img_datas = img_desc_dics[1]['data']
# count = 0
# for img_data in img_datas:
# # 获取搜索图片的参数
# os_ = img_data.get('os')
# cs_ = img_data.get('cs')
#
# if os_ and cs_:
# # 获取搜索图片的信息
# img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
# img_search_response = requests.get(img_search_url)
# img_search_data = img_search_response.text
#
# # 获取图片信息
# img_url = re.findall('''('firstSc');" src="(.*?)"''', img_search_data)[0]
# img_name = img_url.split('/')[-1]
# img_name = os.path.join('百度图片', img_name) # 拼接出图片的地址,如 百度图片/3822951_144045377000_2.jpg
#
# # 保存图片
# img_response = requests.get(img_url)
# img_data = img_response.content
# fw = open(img_name, 'wb')
# fw.write(img_data)
# fw.flush()
#
# # 提示
# count += 1
# print(f'{img_name}保存成功,成功保存{count}张')
#
# # 防止百度禁ip,慢一点
# time.sleep(0.01)
#########################################################################
###自行百度selenium的用法,使用这一套代码可以无限爬取所有图片,否则将被限制30张###
# http://npm.taobao.org/mirrors/chromedriver/
########################################################################
# pip install selenium
page_count_end = 100  # scroll iterations; each loads ~30 more thumbnails, so ~page_count_end * 30 images max
word = '美女'  # search keyword

# Requires Chrome plus a matching chromedriver.exe in the project root / PATH.
chrome = webdriver.Chrome()
try:
    chrome.implicitly_wait(100)
    chrome.get(
        f'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word={word}')
    # Scroll to the bottom so Baidu lazy-loads the next batch of results.
    js_code = '''
    window.scrollTo(0, document.body.scrollHeight);
    var lenOfPage = document.body.scrollHeight;
    return lenOfPage
    '''
    count = 0       # images saved so far
    page_count = 0  # scroll iterations performed
    while page_count < page_count_end:
        try:
            page_count += 1
            chrome.execute_script(js_code)
            time.sleep(0.3)  # give the page time to render the new batch
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt still works.
            continue

    # NOTE: '?' is a regex metacharacter — unescaped it made the preceding 'l'
    # optional instead of matching a literal '?'.  re.S lets '.' span newlines.
    img_desc_search_urls = re.findall(r'href="(/search/detail\?.*?)"',
                                      chrome.page_source, re.S)
    # Visit each detail page and download the full-size image.
    for img_desc_url in img_desc_search_urls:
        try:
            # findall may come back empty; guard instead of indexing blindly
            # (the original relied on IndexError being swallowed by a bare except).
            os_matches = re.findall(r'os=(.*?)&', img_desc_url)
            cs_matches = re.findall(r'cs=(.*?)&', img_desc_url)
            if not (os_matches and cs_matches):
                continue
            os_, cs_ = os_matches[0], cs_matches[0]

            # Fetch the detail page for this image.
            img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
            img_search_data = requests.get(img_search_url).text

            # BUG FIX: the pattern has TWO groups, so findall returns tuples;
            # the original took the whole tuple as `img_url` and then crashed
            # on .split(), silently skipping every image.  Take group 2 (src).
            src_matches = re.findall('''('firstSc');" src="(.*?)"''', img_search_data)
            if not src_matches:
                continue
            img_url = src_matches[0][1]
            # e.g. 百度图片/3822951_144045377000_2.jpg
            img_name = os.path.join('百度图片', img_url.split('/')[-1])

            # Download and save; `with` guarantees the handle is closed
            # (the original opened the file and never closed it).
            img_bytes = requests.get(img_url).content
            with open(img_name, 'wb') as fw:
                fw.write(img_bytes)

            count += 1
            print(f'{img_name}保存成功,成功保存{count}张')
            # Throttle slightly so Baidu does not ban the IP.
            time.sleep(0.01)
        except Exception:
            # Best-effort per image: skip anything that fails and move on.
            continue
except Exception as exc:
    # Top-level guard: report instead of silently discarding (was `pass`).
    print(f'爬取过程中出现异常: {exc}')
finally:
    # quit() also shuts down the chromedriver process; close() only closed
    # the window and leaked the driver.
    chrome.quit()
## 以上代码分两部分:前半部分被注释掉的代码最多只能获取 30 张百度图片;后半部分未注释的代码可以不受限制地爬取。需要本机安装谷歌浏览器(Chrome),并在项目根目录放置与浏览器版本匹配的 chromedriver.exe。