python爬取网络上图片【小例子】

爬取百度百科上图片

# -*- coding:utf-8 -*-
import urllib3 as ul
import urllib3
import certifi
import urllib3.contrib.pyopenssl as pyopenssl
from bs4 import BeautifulSoup as bs
import time
# find_all() returns a list (possibly containing a single element); find() returns the match directly.
link = 'https://baike.baidu.com/item/%E5%8F%A4%E5%A4%A9%E4%B9%90/107686?fr=aladdin'
# Strategy: collect the image links into a list first, then download them one by one.

def crawler(link):
    """Entry point: fetch the page at *link*, extract image URLs, download each one.

    Images are saved as <page title>0.jpg, <page title>1.jpg, ...
    """
    html = get_html(link)  # raw HTML of the target page
    (piclinks, filename) = get_pic_link(html)  # image URLs + base filename
    # print(...) with a single argument behaves identically on Python 2 and 3;
    # the original bare `print piclinks` statement is a SyntaxError on Python 3.
    print(piclinks)
    # enumerate replaces the manual counter; `pic_url` avoids shadowing the
    # `link` parameter (the original reused the name inside the loop).
    for i, pic_url in enumerate(piclinks):
        download(pic_url, filename + str(i) + '.jpg')
        # time.sleep(10)  # optional throttle between requests


def get_html(link):
    """Fetch *link* over HTTPS and return the raw response body (bytes)."""
    # Enable TLS/SNI support through pyOpenSSL before making requests.
    pyopenssl.inject_into_urllib3()
    # Why a pool manager? Each independent request would otherwise pay a full
    # TCP three-way handshake; reusing sockets (HTTP/1.1 keep-alive) lowers
    # server-side resource usage and speeds up responses.
    pool = ul.PoolManager(cert_reqs='CERT_REQUIRED',
                          ca_certs=certifi.where())
    response = pool.request('GET', link)
    return response.data


def get_pic_link(html):
    """Parse *html* with BeautifulSoup and return (image URLs, base filename).

    The URLs are the ``src`` attributes of all ``<img>`` tags; the base
    filename is the text of the page ``<title>``.
    """
    soup = bs(html, 'html.parser')  # build the BeautifulSoup document tree
    # An <img> tag with no src attribute makes .get('src') return None, which
    # would later crash download(); filter those entries out.
    purls = [img.get('src') for img in soup.find_all('img') if img.get('src')]
    # e.g. <img src="https://.../logo_netease_715533d.png" />

    # Use the page title as the picture filename base; fall back to a fixed
    # name if the page has no <title> (soup.find returns None in that case).
    title_tag = soup.find('title')
    filename = title_tag.get_text() if title_tag is not None else 'image'
    return (purls, filename)


def download(url, filename):
    """Fetch the image at *url* and write its bytes to *filename*."""
    # TLS/SNI support via pyOpenSSL, same as in get_html().
    pyopenssl.inject_into_urllib3()
    pool = ul.PoolManager(cert_reqs='CERT_REQUIRED',
                          ca_certs=certifi.where())
    reply = pool.request('GET', url)
    with open(filename, 'wb') as out:
        out.write(reply.data)


if __name__ == '__main__':
    # Run the crawler against the module-level `link` when executed as a script.
    crawler(link)
原文地址:https://www.cnblogs.com/facexiaoxi/p/8567669.html