Python爬虫(三)爬淘宝MM图片

直接上代码:

# python2
# -*- coding: utf-8 -*-

import urllib2
import re
import string
import os
import shutil

def crawl_taobaoMM(baseUrl, start, end):
    """Download profile records and photos from a paginated listing.

    Written for Python 2 (relies on ``urllib2``).

    For each page number from ``start`` to ``end`` inclusive, the page at
    ``baseUrl + '?page=' + str(page)`` is fetched, decoded as GBK and
    searched with a regular expression. Every match is appended as one
    UTF-8 encoded line to ``mm.txt`` and its image is saved under
    ``mm_img/`` as a zero-padded, sequentially numbered ``.jpg`` file.

    Args:
        baseUrl: URL of the listing endpoint, without the ``page`` query.
        start: First page number to fetch.
        end: Last page number to fetch (inclusive).
    """
    imgDir = 'mm_img'
    # Begin each run with an empty image directory. The directory must be
    # recreated after it is removed, otherwise writing the first image fails
    # whenever a previous run left the directory behind.
    if os.path.exists(imgDir):
        shutil.rmtree(imgDir)
    os.makedirs(imgDir)

    # Request headers and the compiled pattern are the same for every page,
    # so they are built once before the loop.
    userAgent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)'
                 ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    headers = {'user-agent': userAgent}
    # Capture groups, in order (per the original author's note):
    # image URL (scheme stripped), name, age, city, occupation.
    searchPattern = (r'<div class="personal-info">.*?<img src="//(.*?)".*?<a class="lady-name".*?>(.*?)'
                     r'</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>.*?<em>(.*?)</em>')
    # re.S lets '.' span newlines, since one record covers several HTML lines.
    searchObj = re.compile(searchPattern, re.S)

    fileName = 'mm.txt'
    picNumber = 0
    # Append mode: records from earlier runs are kept in the text file.
    with open(fileName, 'a') as f:
        for i in range(start, end + 1):
            url = baseUrl + '?page=' + str(i)
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req).read().decode('gbk')
            results = searchObj.findall(response)

            print('' + str(i) + '页...')
            for result in results:
                message = '%s %s %s %s %s\n' % (result[0], result[1], result[2], result[3], result[4])
                print(picNumber)
                print(message)
                f.write(message.encode('utf-8'))
                # The pattern captures the image URL without its scheme.
                pic = urllib2.urlopen('https://' + result[0]).read()
                picName = os.path.join(imgDir, '%05d.jpg' % picNumber)
                with open(picName, 'wb') as pf:
                    pf.write(pic)
                picNumber += 1

# Run the crawler over pages 1 through 10 (inclusive) of the listing endpoint.
crawl_taobaoMM('https://mm.taobao.com/json/request_top_list.htm', 1, 10)

爬下来的图片:

参考资料:

Python爬虫实战四之抓取淘宝MM照片

原文地址:https://www.cnblogs.com/gattaca/p/6930592.html