# 抓取网页图片-以本地IIS网页为实践对象

#抓取网页图片
#适用于html页面结构为:li>img
#抓取单个网页图片小程序
#version:V1.0
#author:yxmichael
#更新时间:20210511

import requests
from bs4 import BeautifulSoup
import os,shutil
import time

def getHtmlText(url,code='utf-8'):
    """Fetch *url* and return the response body decoded with *code*.

    Request headers are taken from the module-level ``my_headers`` when that
    global exists (``main`` defines it); otherwise the library defaults are
    used.

    Args:
        url: Address of the page to download.
        code: Encoding applied to the response before reading its text.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    # Looked up lazily so the function also works before main() has created
    # the global instead of failing with a NameError.
    headers = globals().get('my_headers')
    try:
        r = requests.get(url, timeout=30, headers=headers)
        # Must be *called*: the bare attribute access never raised, so error
        # pages (404, 500, ...) used to be returned as if they were content.
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = code
    return r.text

def parseHtml(nlist,html):
    """Collect image records from *html* into *nlist*.

    Every ``<li>`` inside ``<div id="main">`` is inspected. For each item
    whose link target is not ``'#'`` a record ``[img_name, a_href, img_src]``
    is appended to *nlist*, where ``img_name`` is the last ``/``-separated
    segment of the link target.

    Items without a link, an ``href``, an image, a ``src`` or a usable file
    name are skipped individually, so one malformed entry no longer discards
    the remaining ones.

    Args:
        nlist: List that receives the records (modified in place).
        html: Page source to parse; may be empty.
    """
    if not html:
        print("页面内容为空,无法解析图片列表。")
        return

    soup = BeautifulSoup(html,'html.parser')
    div_main = soup.find('div',attrs={'id':'main'})
    if div_main is None:
        print("未找到 id 为 main 的 div,无法解析图片列表。")
        return

    for li in div_main.find_all('li'):
        link = li.find('a')
        image = li.find('img')
        if link is None or image is None:
            continue
        a_href = link.get('href')
        img_src = image.get('src')
        # '#' marks a placeholder link that has no full-size image behind it.
        if not a_href or not img_src or a_href == '#':
            continue
        img_name = a_href.split('/')[-1]
        if not img_name:
            # A link ending in '/' yields no file name to save under.
            continue
        nlist.append([img_name,a_href,img_src])

def delOldDir(dir_path):
    """Delete the directory tree at *dir_path*; do nothing if the path is absent."""
    nothing_to_remove = not os.path.exists(dir_path)
    if nothing_to_remove:
        return
    shutil.rmtree(dir_path)

def _downloadImages(nlist,nums,site_url,dir_path,url_index,prefix,banner):
    """Download up to *nums* records of *nlist* into *dir_path*.

    Shared implementation behind ``downImg`` and ``downImgMicro``.

    Args:
        nlist: Records shaped ``[img_name, a_href, img_src]``.
        nums: Maximum number of records to process, counted from the start.
        site_url: Base URL that the record's relative address is resolved
            against.
        dir_path: Target directory; created (with parents) when missing.
        url_index: Position inside a record holding the address to fetch.
        prefix: Text put in front of ``img_name`` to build the file name.
        banner: Message printed once before downloading starts.

    Returns:
        The number of files written.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    os.makedirs(dir_path, exist_ok=True)
    print(banner)

    # Slicing clamps nums to the records actually available, which avoids the
    # IndexError that range(nums) caused when nums exceeded len(nlist).
    entries = nlist[:max(nums, 0)]
    total = len(entries)
    saved = 0
    for i, img in enumerate(entries):
        img_url = urljoin(site_url, img[url_index])
        # Files are addressed through dir_path directly; the working directory
        # is deliberately left alone so relative paths keep their meaning.
        file_name = os.path.join(dir_path, prefix + img[0])
        try:
            r = requests.get(img_url, timeout=30)
            # Without this an HTML error page would be stored as an "image".
            r.raise_for_status()
        except requests.RequestException as err:
            print("\n下载失败:{}({})".format(img_url, err))
        else:
            with open(file_name, 'wb') as f:
                f.write(r.content)
            saved += 1
        progressBar(i, total)
    return saved


def downImg(nlist,nums,site_url,dir_path):
    """Download the full-size images (record field ``a_href``) into *dir_path*.

    Args:
        nlist: Records shaped ``[img_name, a_href, img_src]``.
        nums: Maximum number of records to download.
        site_url: Base URL the link targets are resolved against.
        dir_path: Directory the files are written to, named ``img_name``.

    Returns:
        The number of files written.
    """
    return _downloadImages(nlist, nums, site_url, dir_path,
                           1, '', "\n正在获取原图……")


def downImgMicro(nlist,nums,site_url,dir_path):
    """Download the thumbnails (record field ``img_src``) into *dir_path*.

    Args:
        nlist: Records shaped ``[img_name, a_href, img_src]``.
        nums: Maximum number of records to download.
        site_url: Base URL the image sources are resolved against.
        dir_path: Directory the files are written to; each file name is
            ``img_name`` prefixed with ``缩微图_``.

    Returns:
        The number of files written.
    """
    return _downloadImages(nlist, nums, site_url, dir_path,
                           2, '缩微图_', "\n正在获取缩微图……")
    

def progressBar(i,total):
    """Redraw a one-line text progress bar on standard output.

    Prints one block character per finished item followed by the percentage
    done. The text starts with a carriage return and has no trailing newline,
    so successive calls overwrite the same terminal line.

    Args:
        i: Zero-based index of the item that has just been processed.
        total: Total number of items; nothing is printed when it is not
            positive, which avoids a division by zero.
    """
    if total <= 0:
        return
    done = i + 1
    print('\r当前进度:{0}{1:.0f}%'.format('▉'*done, done/total*100), end='')
    
def printHead():
    """Print the start-up banner: a framed program description and a status line."""
    num = 80
    border = "*"*num
    print(border)
    # Printed verbatim, including its leading indentation.
    str_intro = '''
                    抓取单个网页图片小程序
                    version:V1.0
                    author:yxmichael
                    更新时间:20210511
    '''
    print(str_intro)
    print(border)
    print("\n正在抓取……\n")
        

def main():
    """Scrape the image list from the site and download every image.

    Steps: print the banner, fetch and parse the page, replace the two output
    directories under the current working directory (full-size images and
    thumbnails), download both sets, then print a summary and wait for the
    user to press Enter.

    When no image could be found the existing output directories are left
    untouched.
    """
    # getHtmlText reads these headers through the module-level name.
    global my_headers
    my_headers ={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    site_url = 'http://127.0.0.1/pg/'
    imgList = []
    start_time = time.time()
    tmp = '老照片'
    dir_path = os.path.join(os.getcwd(), tmp)
    dir_path_micro = dir_path + '_缩微图'

    printHead()
    html = getHtmlText(site_url)
    parseHtml(imgList,html)
    nums = len(imgList)
    if nums == 0:
        # Nothing to download: keep earlier results instead of wiping them.
        print("未获取到任何图片,已保留原有目录。")
        input("请按任意键退出……")
        return

    delOldDir(dir_path)
    delOldDir(dir_path_micro)
    downImg(imgList,nums,site_url,dir_path)
    downImgMicro(imgList,nums,site_url,dir_path_micro)
    seconds = time.time() - start_time
    print("\n成功下载{}张图片,耗时:{:.1f}秒。\n保存路径{}".format(nums,seconds,dir_path))
    input("请按任意键退出……")
    
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# 原文地址:https://www.cnblogs.com/yuexiao/p/14756234.html