爬虫学习(十)——原始正则抓取数据案例

糗事百科图片抓取案例

打算发大水
import os

import re
import time
import urllib.request
import urllib.parse

# 输入目标页码和图片存储名
def header():
    """Prompt for a page range and a folder name, then crawl every page.

    Reads the first page, the last page (inclusive) and the output folder
    name from standard input. For each page it builds a request with
    ``headle_request`` and passes it to ``download``.

    Raises:
        ValueError: If a page number entered by the user is not an integer.
    """
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))
    qiutu = input("请输入文件名字")
    for page in range(start_page, end_page + 1):
        print("正在爬取第%s页" % page)
        request = headle_request(page)
        # download() accepts (request, folder). The page number is already
        # part of the request URL, so passing it as an extra positional
        # argument (as the original did) raises TypeError.
        download(request, qiutu)
        # Pause between pages so the site does not flag the crawl as abusive.
        time.sleep(2)


# 构建请求对象,拼接url
def headle_request(page):
    """Build the HTTP request for one picture-listing page.

    Args:
        page: Page number interpolated into the listing URL.

    Returns:
        A ``urllib.request.Request`` for that page carrying a browser-like
        ``User-Agent`` header.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    url = "https://www.qiushibaike.com/pic/page/%s/?s=5167052" % page
    return urllib.request.Request(url, headers=headers)


# 根据请求对象下载指定的目标数据
def download(request, qiutu, delay=1.5):
    """Fetch one listing page and save every thumbnail image found on it.

    Args:
        request: ``urllib.request.Request`` (or URL string) of the page.
        qiutu: Directory the images are written to; created if missing.
        delay: Seconds to sleep after each image, to throttle the crawl.
            Defaults to the previously hard-coded 1.5 seconds.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    from urllib.error import URLError

    # Use the response as a context manager so the connection is always
    # released, even when reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        page_url = response.geturl()
        content = response.read().decode("utf8")

    os.makedirs(qiutu, exist_ok=True)

    # Key step: capture the image URL (group 1) and its alt text (group 2)
    # from each thumbnail container.
    img = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)
    for src, alt in img.findall(content):
        if src.startswith("//"):
            # Protocol-relative URL: keep the original behaviour of using http.
            img_url = "http:" + src
        else:
            # Absolute or page-relative URL: resolve it against the page URL
            # instead of blindly prefixing "http:".
            img_url = urllib.parse.urljoin(page_url, src)

        # The alt text is free-form, so replace characters that are illegal
        # in file names or that would be read as path separators.
        stem = re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", alt).strip()
        if not stem:
            # Empty alt text: fall back to the image's own file name.
            url_path = urllib.parse.urlsplit(img_url).path
            stem = os.path.splitext(os.path.basename(url_path))[0] or "image"
        image_path = os.path.join(qiutu, stem + ".jpg")

        try:
            urllib.request.urlretrieve(img_url, image_path)
        except URLError as err:
            # One broken image must not abort the rest of the page.
            print("下载失败,已跳过: %s (%s)" % (img_url, err))
        time.sleep(delay)

if __name__ == '__main__':
    # Start the picture crawler only when run as a script, not on import.
    header()



励志网语录抓取案例

import os
import re
import time
import urllib.request
import urllib.parse

def main():
    """Prompt for an inclusive page range and crawl each listing page.

    For every page number in the range, ``request`` collects the article
    links and ``content`` downloads and stores the linked articles.

    Raises:
        ValueError: If a page number entered by the user is not an integer.
    """
    start_page = int(input("请输入抓取的起始页:"))
    end_page = int(input("请输入抓取的结束页:"))
    for page in range(start_page, end_page + 1):
        print("正在爬取第%d" % page)
        ret = request(page)
        content(ret)


def request(page):
    """Fetch one listing page and extract the article links on it.

    Args:
        page: Page number interpolated into the listing URL.

    Returns:
        A list of ``(href, title)`` tuples, one per article entry matched by
        the pattern; empty when nothing matches.

    Raises:
        urllib.error.URLError: If the listing page cannot be fetched.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    # The header name must be exactly "User-Agent". The original key
    # "User - Agent" (with spaces) was sent as an unrelated header, so urllib
    # still added its default Python User-Agent.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
    url = "http://www.yikexun.cn/lizhi/qianming/list_50_%s.html" % page
    req = urllib.request.Request(url, headers=headers)
    # Close the response deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf8")
    pattern = re.compile(r'<div class="art-t">.*?<a href="(.*?)"><b>(.*?)</b></a>.*?</div>', re.S)
    return pattern.findall(html)



def content(ret):
    """Download each linked article and save it as an HTML file.

    Files are written to the ``励志语录1`` directory (created if missing),
    one file per article, named after the article title.

    Args:
        ret: Iterable of ``(href, title)`` pairs, as returned by ``request``.
            ``href`` is appended to the site root to form the article URL.

    Raises:
        urllib.error.URLError: If an article page cannot be fetched.
        UnicodeDecodeError: If an article page is not valid UTF-8.
    """
    out_dir = "励志语录1"
    os.makedirs(out_dir, exist_ok=True)
    # Compile once; the pattern does not change between articles.
    regular = re.compile(r'(<div class="neirong">.*?<p>(.*?)</p>.*?</div>)', re.S)
    for entry in ret:
        href, title = entry[0], entry[1]
        article_url = "http://www.yikexun.cn" + href
        # Titles are free-form text, so replace characters that are illegal
        # in file names or that would be read as path separators.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", title).strip() or "untitled"
        article_path = os.path.join(out_dir, safe_title + ".html")
        with urllib.request.urlopen(article_url) as response:
            string = response.read().decode("utf8")
        # Group 0 of each match is the whole content <div>.
        blocks = [info[0] for info in regular.findall(string)]
        if blocks:
            # Write every matched block in one pass. Reopening the file in
            # "w" mode per match (as before) kept only the last block.
            cont = '<h1 style="color:blue">%s</h1> %s' % (title, "\n".join(blocks))
            with open(article_path, "w", encoding="utf8") as tf:
                tf.write(cont)
        # Throttle requests between articles.
        time.sleep(1)
if __name__ == '__main__':
    # Start the quotes crawler only when run as a script, not on import.
    main()


原文地址:https://www.cnblogs.com/kuangkuangduangduang/p/10374888.html