Crawling Taobao model profiles with Scrapy and auto-saving their photos

Environment: Ubuntu 16.04

Tools: Python 3.5+, Scrapy 1.1, PyCharm
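
The whole spider fits in one file, so it can be run with Scrapy's runspider command rather than a full project; the file name taobaomm.py below is only a placeholder for wherever you save the code:

scrapy runspider taobaomm.py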

import os
import re
import urllib.request

import scrapy
from scrapy.http import Request
from bs4 import BeautifulSoup  # needs the lxml package installed for the "lxml" parser


class TaobaoMMSpider(scrapy.Spider):
    name = 'TaobaoMM'
    start_urls = ['https://mm.taobao.com/json/request_top_list.htm?page=1']
    # Set the directory where the images should be saved
    mainposition = '/media/liuyu/0009F608000B7B40/TaobaoMM/'
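    # resulting layout: mainposition/<model name>/<album name>/<photo>.jpg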

    # Handle the first page and get the total number of list pages
    def parse(self, response):
        content = BeautifulSoup(response.text, "lxml")
        totalpage = content.find('input', id="J_Totalpage").get('value')
        url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        for i in range(int(totalpage)):
            yield Request(url + str(i+1), callback=self.everypage)

    # Handle each list page and get every model's profile page
    def everypage(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find_all('div', class_="personal-info")
        for i in modelinfo:
            name = i.find('a', class_="lady-name").string
            seconddir = self.mainposition + name
            os.makedirs(seconddir, exist_ok=True)
            age = i.find('strong').string
            modelurl = 'https:' + i.find('a', class_="lady-name").get('href')
            yield Request(modelurl, callback=self.infocard, meta={'age': age, 'seconddir': seconddir})

    # Handle the model-card page: extract the model id and build the JSON URL for the model's profile info
    def infocard(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_MmuserId").get('value')
        infourl = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + modelid
        albumurl = 'https:' + content.find('ul', class_="mm-p-menu").find('a').get('href')
        yield Request(infourl, callback=self.infoprocess,
                      meta={'seconddir': response.meta['seconddir'], 'albumurl': albumurl, 'age': response.meta['age']})

    # Parse the model's JSON profile page, save the name and other details, then move on to the album page
    def infoprocess(self, response):
        seconddir = response.meta['seconddir']
        albumurl = response.meta['albumurl']
        age = response.meta['age']
        content = BeautifulSoup(response.text, "lxml")
        modelinfo = content.find('ul', class_="mm-p-info-cell clearfix")
        info = modelinfo.find_all('li')
        name = info[0].find('span').string
        with open(seconddir + '/' + name + '.txt', 'w') as file:
            file.write('age: ' + age + '\n')
            for i in range(6):
                file.write(info[i].find('span').string.replace("\xa0", "") + '\n')
            for i in range(2):
                file.write(info[i+7].find('p').string + '\n')
            file.write('BWH:  ' + info[9].find('p').string + '\n')
            file.write('cup_size:  ' + info[10].find('p').string + '\n')
            file.write('shoe_size:  ' + info[11].find('p').string + '\n')
        yield Request(albumurl, callback=self.album, meta={'seconddir': response.meta['seconddir']})

    # Handle the album frame page: get the model's ID and build the JSON request URL for the album list
    def album(self, response):
        content = BeautifulSoup(response.text, "lxml")
        modelid = content.find('input', id="J_userID").get('value')
        url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + modelid
        yield Request(url, callback=self.allimage, meta={'url': url, 'seconddir': response.meta['seconddir']})

    # Handle the album-list JSON page and get the total number of album pages
    def allimage(self, response):
        url = response.meta['url']
        content = BeautifulSoup(response.text, "lxml")
        page = content.find('input').get('value')
        for i in range(int(page)):
            yield Request(url + '&page=' + str(i+1), callback=self.image, meta={'seconddir': response.meta['seconddir']})

    # Handle each album-list page: get each album's name and visit the album
    def image(self, response):
        seconddir = response.meta['seconddir']
        content = BeautifulSoup(response.text, "lxml")
        albuminfo = content.find_all('div', class_="mm-photo-cell-middle")
        for i in albuminfo:
            albumname = i.find('h4').a.string.replace(" ", "").replace("\n", "")
            thirddir = seconddir + '/' + albumname
            os.makedirs(thirddir, exist_ok=True)
            url = i.find('h4').a.get('href')
            # pull user_id and album_id out of the album link
            pattern = re.compile('.*?user_id=(.*?)&album_id=(.*?)&album_flag')
            modelid, albumid = re.findall(pattern, url)[0]
            imageurl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=' + modelid + '&album_id=' + albumid + '&page='
            yield Request(imageurl, callback=self.imageprocess, meta={'url': imageurl, 'thirddir': thirddir})

    # Handle the photo-list JSON page and get the total number of photo pages
    def imageprocess(self, response):
        url = response.meta['url']
        content = response.text
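        # the photo-list response is JSON; read totalPage straight out of the raw text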
        pattern = re.compile('.*?"totalPage":"(.*?)"')
        item = re.findall(pattern, content)
        pagenum = item[0]
        for i in range(int(pagenum)):
            imageurl = url + str(i+1)
            yield Request(imageurl, callback=self.saveimage, meta={'thirddir': response.meta['thirddir']})

    # Parse each photo page, extract every photo URL, and download the image
    def saveimage(self, response):
        thirddir = response.meta['thirddir']
        content = response.text
        pattern = re.compile('.*?"picUrl":"(.*?)"')
        pattern_2 = re.compile('.*?imgextra/.*?/(.*?)/')
        imageurls = re.findall(pattern, content)
        for imageurl in imageurls:
            # use the unique segment of the CDN path as the file name
            imagename = re.findall(pattern_2, imageurl)[0]
            url = 'https:' + imageurl
            print(url)
            data = urllib.request.urlopen(url).read()
            with open(thirddir + '/' + imagename + '.jpg', 'wb') as file:
                file.write(data)
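
A closing note on saveimage: urllib.request.urlopen is a blocking call, so each download stalls Scrapy's otherwise asynchronous engine. A minimal non-blocking sketch is below; it yields each photo URL as an ordinary Scrapy request and writes the bytes in a callback. The writeimage method and the 'path' meta key are assumptions of this sketch, not part of the original spider:

    # Sketch: non-blocking variant of saveimage ('writeimage' and 'path' are hypothetical names)
    def saveimage(self, response):
        thirddir = response.meta['thirddir']
        pattern = re.compile('.*?"picUrl":"(.*?)"')
        pattern_2 = re.compile('.*?imgextra/.*?/(.*?)/')
        for imageurl in re.findall(pattern, response.text):
            imagename = re.findall(pattern_2, imageurl)[0]
            yield Request('https:' + imageurl, callback=self.writeimage,
                          meta={'path': thirddir + '/' + imagename + '.jpg'})

    def writeimage(self, response):
        # response.body holds the raw image bytes returned by the server
        with open(response.meta['path'], 'wb') as file:
            file.write(response.body)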

Run results: (screenshot omitted)

Original post: https://www.cnblogs.com/INnoVationv2/p/5708482.html