# A small web crawler, written for Python 3.x

import urllib.request  # 导入urlib.request模块
import re # 导入re模块


# 获得每一页的网址并返回
# Build the address of one comment page and return it.
def get_url(pageNumber):
    """Return the full URL of comment page *pageNumber* on jandan.net/ooxx."""
    pieces = ["http://jandan.net/ooxx/", 'page-', str(pageNumber), '#comments']
    return ''.join(pieces)


# 打开网址并返回
# Open a URL with a browser-like User-Agent and return the raw body.
def url_open(url):
    """Fetch *url* and return the response body as bytes.

    A desktop-browser User-Agent header is attached so the request
    looks like it comes from a browser rather than a script.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/56.0.2924.87 Safari/537.36')
    # Bug fix: the original called urlopen(url) with the bare URL,
    # discarding the Request object — the User-Agent header was never
    # actually sent.  Open the Request instead.
    response = urllib.request.urlopen(req)
    return response.read()


# 获取页码并返回
# Find the newest page number on the site and return it.
def get_page(url):
    """Return the current page number (as a string of digits) parsed
    from the HTML of *url*.

    The page embeds the number as e.g. ``current-comment-page">[2553]``:
    locate the marker, then read the text between the brackets.
    """
    htmlCode = url_open(url).decode('utf-8')
    marker = htmlCode.find('current-comment-page')
    # Robustness fix: locate the literal '[' after the marker instead of
    # relying on the original hard-coded +23 character offset, which
    # silently breaks if the surrounding markup changes.
    pageA = htmlCode.find('[', marker) + 1
    pageB = htmlCode.find(']', pageA)
    return htmlCode[pageA:pageB]


# jandan.net "ooxx" front page — queried once to discover the newest page.
main_url = "http://jandan.net/ooxx"

# Walk from the newest page down to page 1, downloading every image.
pageNumber = int(get_page(main_url))
while pageNumber > 0:
    print(pageNumber)
    new_url = get_url(pageNumber)
    print(new_url)
    new_html = url_open(new_url).decode('utf-8')
    # Capture each protocol-relative image URL up to a .jpg/.gif extension.
    # Fixes the original pattern 'src=".*.(?:jpg|gif)': raw string, the dot
    # is escaped, and the match is non-greedy so it cannot swallow
    # unrelated markup between two src attributes on the same line.
    imageList = re.findall(r'src="(//\S+?\.(?:jpg|gif))"', new_html)

    # Save every image on the page as <page>-<index>.<ext>.
    # Target folder (D://pythonprogram/ooxx) must already exist; adjust as needed.
    for n, relUrl in enumerate(imageList, start=1):
        imgUrl = 'http:' + relUrl
        # The regex only admits .jpg / .gif, so the extension check is safe.
        ext = 'jpg' if imgUrl.endswith('.jpg') else 'gif'
        try:
            urllib.request.urlretrieve(
                imgUrl,
                'D://pythonprogram/ooxx/%d-%d.%s' % (pageNumber, n, ext))
        except OSError as e:
            # Best effort: a single failed download (404, timeout, ...)
            # should not abort the whole crawl.
            print('failed to fetch %s: %s' % (imgUrl, e))

    pageNumber -= 1




# Original article: https://www.cnblogs.com/wenqinchao/p/6543307.html