Python 爬虫学习 网页图片下载

  1. 使用正则表达式匹配
    # coding:utf-8
    
    import  re
    import urllib
    
    def get_content(url):
        """Fetch *url* and return the raw response body.

        Python 2: urllib.urlopen has no context-manager support, so a
        try/finally guarantees the connection is closed even when
        read() raises (the original leaked the handle on error).
        """
        html = urllib.urlopen(url)
        try:
            content = html.read()
        finally:
            html.close()
        return content
    
    def get_images(info):
        """" Download Baidu pictures.
        <img class="BDE_Image" src="http:*****">
        """
        regex = r' class="BDE_Image" src="(.+?.jpg)" ' 
        pat = re.compile(regex)
        images_code = re.findall(pat, info)
    
        i = 0
        for image_url in images_code:
            print image_url
            urllib.urlretrieve(image_url, '%s.jpg' % i)
            i = i +1
        print len(images_code)
    
    # Fetch one Tieba thread page and download all of its images.
    # NOTE(review): get_images returns None, so this also prints "None"
    # after the downloads finish — the outer print is redundant.
    info = get_content("http://tieba.baidu.com/p/2299704181")
    print get_images(info)
    

      

  2. 使用第三方库BeautifulSoup匹配
    # 安装 sudo pip install beautifulsoup4
    

      

    # coding:utf-8
    
    import urllib
    from bs4 import BeautifulSoup
    
    def get_content(url):
        """Fetch *url* and return the raw response body.

        Python 2: urllib.urlopen has no context-manager support, so a
        try/finally guarantees the connection is closed even when
        read() raises (the original leaked the handle on error).
        """
        html = urllib.urlopen(url)
        try:
            content = html.read()
        finally:
            html.close()
        return content
    
    
    def get_images(info):
        """
         使用BeautifulSoup在网页源码中匹配图片地址
        """
        soup = BeautifulSoup(info)
        all_img = soup.find_all('img', class_="BDE_Image" )
    
        i = 1
    
        for img in all_img:
            print img['src']
            urllib.urlretrieve(img['src'], '%s.jpg' % i)
            i = i +1
        print "一共下载了 ", len(all_img), "张图片"
    
    
    # Fetch one Tieba thread page and download all of its images.
    # NOTE(review): get_images returns None, so this also prints "None"
    # after the downloads finish — the outer print is redundant.
    info = get_content("http://tieba.baidu.com/p/3368845086")
    
    print get_images(info)
    

      

原文地址:https://www.cnblogs.com/evilxr/p/4041561.html