爬取必应首页大图

不废话,直接上代码

# -*- coding: utf-8 -*-
# @Author: Wang Hongbin
# @Email:   wanghongbin@ngoos.org
# @Date:   2018-03-16 14:19:27
# @Last Modified by:   Wang Hongbin
# @Last Modified time: 2018-03-28 16:26:07
import requests 
import re 
import os
import time #时间模块

# Date prefix ("YYYY-MM-DD_") computed once when the module loads; getImgName
# prepends it to the saved file name.
local = time.strftime("%Y-%m-%d_")
# Bing (China) home page: the page that is scanned and the prefix joined to the
# site-relative image path.
baseUrl = "https://cn.bing.com"
# Browser-like User-Agent header dictionary meant to accompany HTTP requests.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}

def getImgUrl(url):
    """Fetch *url* and return the first wallpaper path found in its body.

    The response text is searched for the first substring that starts with
    ``/az/hprichbg/rb/`` and ends with ``.jpg``.

    Args:
        url: Address of the page to scan.

    Returns:
        The matched site-relative image path (it begins with
        ``/az/hprichbg/rb/``), without the host part.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        IndexError: If the page contains no matching image path.
    """
    # Escape the dot so only a literal ".jpg" extension ends the match.
    reg1 = r"(/az/hprichbg/rb/.*?\.jpg)"
    # Send the module-level browser headers and bound the wait so a stalled
    # connection cannot hang the script forever.
    con = requests.get(url, headers=headers, timeout=30)
    con.raise_for_status()
    matches = re.findall(reg1, con.text, re.S)
    if not matches:
        # Same exception type as indexing an empty list, but with a message
        # that explains what went wrong.
        raise IndexError("no '/az/hprichbg/rb/*.jpg' image path found in " + url)
    return matches[0]

def getFilePath(filePath='C:/Users/Administrator/Pictures/MyDesktop/'):
    """Make sure the download directory exists and return its path.

    Args:
        filePath: Directory that will receive the downloaded pictures.
            Defaults to the Windows desktop-picture folder used by the
            script; pass another location (for example a dated folder under
            a web root on Linux) to store the images elsewhere.

    Returns:
        ``filePath`` unchanged, after the directory has been created if it
        was missing.

    Raises:
        OSError: If the directory cannot be created.
    """
    # makedirs also creates missing parent directories, and exist_ok avoids
    # the check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(filePath, exist_ok=True)
    return filePath

def getImgName(url):
    """Build the local file name for the image path *url*.

    The text between ``/az/hprichbg/rb/`` and the first underscore that
    follows it is taken as the picture's base name. The result is the
    module-level date prefix ``local``, that base name, and ``.jpg``.

    Raises:
        IndexError: If *url* does not contain the expected pattern.
    """
    namePattern = r"/az/hprichbg/rb/(.*?)_"
    captured = re.findall(namePattern, url, re.S)
    stem = captured[0]
    return "".join((local, stem, '.jpg'))


def downloadByPic(url):
    """Download the wallpaper referenced on page *url* and save it locally.

    The image path is extracted with getImgUrl, the file name is derived
    with getImgName, and the bytes fetched from ``baseUrl`` + image path are
    written into the directory returned by getFilePath.

    Args:
        url: Address of the page that references the wallpaper.

    Raises:
        requests.RequestException: If the image request fails, times out,
            or returns a non-success HTTP status.
        IndexError: If no image path or image name can be extracted.
        OSError: If the target file cannot be written.
    """
    imgUrl = getImgUrl(url)
    imgName = getImgName(imgUrl)
    filePath = getFilePath()
    # os.path.join copes with a directory given with or without a trailing
    # separator.
    fileName = os.path.join(filePath, imgName)

    picUrl = baseUrl + imgUrl
    read = requests.get(picUrl, headers=headers, timeout=30)
    # Fail loudly instead of saving an HTTP error page under a .jpg name.
    read.raise_for_status()

    # The context manager closes the file even if the write raises.
    with open(fileName, 'wb') as f:
        f.write(read.content)

# Unused pattern kept for reference: it would capture the contents of the
# "hplatt", "hplats", "hplaSnippet" and "hplaPvd" divs inside "hplaCata".
# reg3 = r'<div class="hplaCata"><div class="hplatt">(.*)</div><div class="hplats">(.*)</div><div id="hplaSnippet">(.*)</div><div class="hplaPvd">(.*)</div>'

# Script entry: download the picture referenced by the Bing home page, then
# print a confirmation once the download call has returned.
downloadByPic(baseUrl)
print('is ok!')

爬取结果

下图是七月份至今爬取到的图片。因为脚本是在 Windows 上执行的,电脑不开机的时候就不会执行;代码放在 Linux 上执行也没问题,使用 crontab 配置一个定时任务就行了。

https://cdn.jsdelivr.net/gh/WHBLeer/Gallery/img/20201124112720.png

原文地址:https://www.cnblogs.com/sanlilin/p/14145163.html