豆瓣用户相册批量下载

很简单的 Python 代码,用 BeautifulSoup 会更简洁一点,不过就当练习一下正则表达式吧。

# -*- coding:utf-8 -*- 
#!/usr/bin/env python

__revision__ = '0.1'
import re
import os
import urllib
import string,time
def get_html_data(url):
    """Fetch *url* and return the raw response body.

    This is a best-effort helper: any failure while opening or reading
    the URL yields an empty string instead of raising.

    Args:
        url: Address of the page to download.

    Returns:
        The body returned by ``read()``, or ``''`` when the request fails.
    """
    # ``except Exception`` (instead of a bare ``except``) keeps the
    # best-effort contract but lets KeyboardInterrupt and SystemExit
    # propagate, so Ctrl-C can still stop a long crawl.
    try:
        response = urllib.urlopen(url)
    except Exception:
        return ''
    try:
        return response.read()
    except Exception:
        return ''
    finally:
        # Release the connection even when read() fails.
        response.close()
def download_all(urls,dir,delay=1):
    """Download every photo in *urls* into the directory *dir*.

    Args:
        urls: Iterable of photo id strings. Each id is inserted into the
            fixed image URL template used below.
        dir: Target directory; created (including missing parents) when
            it does not exist yet.
        delay: Seconds to sleep after each download attempt.
    """
    # makedirs (not mkdir) so a nested target such as "a/b" also works.
    if not os.path.isdir(dir):
        os.makedirs(dir)
    for photo_id in urls:
        lo = "http://img3.douban.com/view/photo/photo/public/p" + photo_id + ".jpg"
        print(lo)
        # Join with the path separator so the file lands inside `dir`
        # even when the caller gives the directory without a trailing slash.
        base_name = os.path.join(dir, photo_id + ".jpg")
        print(base_name)
        try:
            urllib.urlretrieve(lo, base_name)
        except IOError as err:
            # One broken photo should not abort the rest of the batch.
            print("download failed: %s (%s)" % (lo, err))
        time.sleep(delay)
def get_every_pages(url):
    """Return the photo ids linked from the album page at *url*.

    The page is fetched with get_html_data() and scanned for links of the
    form ``http://www.douban.com/photos/photo/<digits>/"``.

    Args:
        url: Address of one album page.

    Returns:
        A list of photo id strings in page order (possibly empty).
    """
    # Dots are escaped so they only match a literal '.', and '+' requires
    # at least one digit so an empty id is never returned.
    photo_link = re.compile(
        r"(?<=http://www\.douban\.com/photos/photo/)[0-9]+(?=/\")",
        re.IGNORECASE)
    return photo_link.findall(get_html_data(url))
def find_last_page(data):
    """Return the largest ``start=<n>"`` offset found in *data*.

    Args:
        data: HTML text of an album page.

    Returns:
        The highest offset as an int, or 0 when no offset is present.
    """
    # '+' (not '*') so a bare ``start="`` with no digits is ignored
    # instead of producing an empty string that cannot be parsed.
    offsets = re.findall(r"(?<=start=)[0-9]+(?=\")", data)
    # The extra 0 keeps the result at 0 when there are no matches.
    return max([int(offset) for offset in offsets] + [0])
def check_next_page(data,i):
    """Tell whether *data* contains a ``start=<i>`` pagination offset.

    Args:
        data: HTML text of an album page.
        i: Offset to look for.

    Returns:
        True when ``start=<i>`` occurs in *data* (case-insensitive),
        False otherwise.
    """
    # The negative lookahead stops e.g. i=18 from matching "start=180".
    offset = re.compile(
        r"start=" + re.escape(str(i)) + r"(?![0-9])", re.IGNORECASE)
    return offset.search(data) is not None
def get_photo(url,dir):
    """Collect the photo ids of a whole album and download them.

    The first page is read to find the highest ``start=`` offset, then
    every following page is visited in steps of 18 and its photo ids are
    gathered before the download starts.

    Args:
        url: Address of the album's first page.
        dir: Directory the photos are saved to (passed to download_all).
    """
    page_size = 18  # step between consecutive ``start=`` offsets
    # NOTE: the first page is requested twice, once here for pagination
    # and once inside get_every_pages() for its photo ids.
    data = get_html_data(url)
    pages = [get_every_pages(url)]
    last = find_last_page(data)
    for start in range(page_size, last + 1, page_size):
        page_url = url + "?start=" + str(start)
        print(page_url)
        print(last)
        pages.append(get_every_pages(page_url))
    # Flatten every page unconditionally: a page holding zero or one id
    # must contribute its ids too, never the list object itself.
    photos = []
    seen = set()
    for page in pages:
        for photo_id in page:
            # A repeated id would only overwrite the same file again.
            if photo_id not in seen:
                seen.add(photo_id)
                photos.append(photo_id)
    download_all(photos, dir)
if __name__=="__main__":
    # Ask for the album address first, then for the output directory,
    # and hand both to the downloader.
    print("input the url:")
    album_url = raw_input()
    print("input the dir name:")
    target_dir = raw_input()
    get_photo(album_url, target_dir)
原文地址:https://www.cnblogs.com/long0x0/p/2850905.html