Crawler: downloading the images from every page of a Baidu Tieba forum

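In outline, the script has three parts: tiebaSpider() builds one index URL per forum page, loadPage() extracts the thread links from each index page, and loadImage() downloads every image found inside a thread.
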
import os
import ssl
import urllib.request
import urllib.parse
from lxml import etree   # third-party package: pip install lxml

# Skip HTTPS certificate verification globally; convenient for a demo
# crawler, but not something to do in production code
ssl._create_default_https_context = ssl._create_unverified_context

def loadPage(url):
    # An old IE User-Agent is sent on purpose: Tieba serves a simpler,
    # fully rendered HTML page to legacy browsers, which keeps the
    # XPath queries below working
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()

    # Parse the HTML document into an element tree
    content = etree.HTML(html)
    # Collect the relative URL of every thread on this index page; the
    # leading/trailing spaces in the class values match Tieba's markup exactly
    link_list = content.xpath('//li[@class=" j_thread_list clearfix"]//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
    print(link_list, len(link_list))

    for link in link_list:
        fulllink = "http://tieba.baidu.com" + link  # absolute URL of one thread
        loadImage(fulllink)

# Fetch one thread and download every image posted in it
def loadImage(link):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(link, headers=headers)
    html = urllib.request.urlopen(request).read()
    content = etree.HTML(html)

    # All image URLs in the thread; the BDE_Image class marks user-posted images
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    for link in link_list:
        filename = link[-15:]  # name the file after the last 15 characters of its URL
        urllib.request.urlretrieve(link, './tieba/' + filename)
        print("Downloaded ---- " + filename)


def tiebaSpider(url, beginPage, endPage):
    # Tieba paginates with the pn query parameter in steps of 50 threads:
    # page 1 -> pn=0, page 2 -> pn=50, page 3 -> pn=100, ...
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)


if __name__ == "__main__":
    kw = input("Tieba forum name to crawl: ")
    startPage = int(input("Start page: "))
    endPage = int(input("End page: "))

    url = "https://tieba.baidu.com/f?"

    # urlencode({"kw": kw}) percent-encodes the forum name into a query
    # string, e.g. kw=美女 becomes kw=%E7%BE%8E%E5%A5%B3
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    # For ASCII-only names, plain concatenation would also work:
    # fullurl = url + 'kw=' + kw

    # Make sure the download directory exists before urlretrieve is called
    os.makedirs('tieba', exist_ok=True)

    tiebaSpider(fullurl, startPage, endPage)
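
For reference, a run over the first two pages of a hypothetical forum named python would prompt for the three inputs and then request index URLs like these (the forum name is only an illustration):

    https://tieba.baidu.com/f?kw=python&pn=0
    https://tieba.baidu.com/f?kw=python&pn=50

Each index page then yields its thread links, and every BDE_Image in those threads is saved under ./tieba/.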
Original article: https://www.cnblogs.com/dongpei/p/9404640.html