Scraping 淘女郎 (Taobao Model) Profile Data and Photos

I had some free time over the National Day holiday, so I wrote a crawler for someone that scrapes the dynamically loaded photos from 淘女郎 (Taobao's model showcase), along with the models' profile data. It took three days, mostly because the images are loaded asynchronously, which makes the scraping noticeably harder. In the end I worked out how the request URL changes and constructed new URLs to keep crawling page after page; the snippet just below shows the core of that idea. The crawl is honestly slow, though (CPU utilization shows plenty of idle headroom), so I plan to add multithreading. Frankly it isn't easy to bolt on and it's giving me a headache; a rough sketch of one approach appears after the listing.
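
To make the "constructing new URLs" part concrete before the full listing: the model list never appears in the page HTML; it comes back as JSON from a POST endpoint, so paging is just a matter of changing the currentPage field. Here is a minimal standalone sketch of that idea (in practice the request also needs the headers and cookie from the class below):

# Minimal sketch: fetch one page of the model list straight from the JSON endpoint.
import requests

url1 = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
data = {'q': '', 'viewFlag': 'A', 'sortType': 'default', 'searchStyle': '',
        'searchRegion': 'city:', 'searchFansNum': '',
        'currentPage': '1', 'pageSize': '100'}

resp = requests.post(url1, data=data, timeout=5)
models = resp.json()['data']['searchDOList']   # one dict per model
print(models[0]['realName'], models[0]['userId'])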

# -*- coding: utf-8 -*-
import requests, time, re
import threadpool  # not used yet; kept for the multithreading I plan to add
import json, os
import xlwt
import urllib.request, urllib.error
from lxml import etree

url = 'https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.38a457c5mEWpPl&style=&place=city%3A%E5%B9%BF%E5%B7%9E'
url1 = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'


class tn(object):

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'cookie': 'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; linezing_session=dTr7U8KOxddDd4CJ4VFihDPH_1506516599348AOqB_1; v=0; _tb_token_=5b9375847e363; _m_h5_tk=4efc0ba8d72376fa1968a3f0a92f0eef_1506518851245; _m_h5_tk_enc=e10a67c79b8a47f97dd3134779acfdfe; uc3=sg2=URsQfTD%2BFY9mkKOl%2FNBXqNFPPUNKq8HjGx%2Bair7O99U%3D&nk2=UoCKEw%2B1myb2u1mo&id2=UoCJiFOLhjN6OQ%3D%3D&vt3=F8dBzWk7FANQZ7%2B830Y%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; existShop=MTUwNjk0NjQwMQ%3D%3D; uss=VFcwj3YmzKmO6xkbkJFH%2FN%2FOd2CPNJzRBWBygIM3IKKXIgbm1DSeGb87; lgc=1132771621aa; tracknick=1132771621aa; cookie2=11aae79de97ae344158e4aa965c7003c; sg=a2d; cookie1=Aihx9FxoyUYIE7uEPgeqstl%2B5uvfGslyiCQ%2FpePYriI%3D; unb=1100473042; skt=11ea4b0360e50e08; t=b63e6968872da200706b694d67c62883; _cc_=UtASsssmfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=1132771621aa; cookie17=UoCJiFOLhjN6OQ%3D%3D; cna=1/K/EZz4HDECAXhVTdivCBle; mt=ci=45_1; isg=Anx8i770Cd1_Zz2Ede24lLr7TRqCdCGCcQXP_1b95GdKIR2rfoTZL77ZdX-i; JSESSIONID=F34B74BB5A7A0A1BF96E8B3F2C02DE87; uc1=cookie14=UoTcCfmfxB%2Fd7g%3D%3D&lng=zh_CN&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&existShop=false&cookie21=U%2BGCWk%2F7owY3i1vB1W2BgQ%3D%3D&tag=8&cookie15=UtASsssmOIJ0bQ%3D%3D&pas=0',
    }

    def getUrlinfo(self, page):
        # POST the search form and pull the model list out of the JSON reply.
        datas = []
        pageurl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'

        data = {
            'q': '',
            'viewFlag': 'A',
            'sortType': 'default',
            'searchStyle': '',
            'searchRegion': 'city:',
            'searchFansNum': '',
            'currentPage': '%s' % (page),
            'pageSize': '100'
        }
        try:
            while True:
                time.sleep(1)  # throttle so the server isn't hammered
                reqs = requests.post(pageurl, data=data, headers=self.headers, timeout=5)
                if reqs.status_code == 200:
                    break
                else:
                    print('failed')
        except Exception as e:
            print('error:', e)
            return datas  # bail out rather than parse a response we never got
        dictx = json.loads(str(reqs.text))
        t = dictx['data']['searchDOList']
        for i in t:
            r = i['realName'], i['height'], i['weight'], i['city'], i['userId']
            #userid = i['userId']
            datas.append(r)
        return datas  # the scraped model profile rows

    def getImages(self, rs):
        # For each model, find her albums, then walk each album's JSON
        # photo list and download every picture.
        a = 0
        imgroot = r'D:\SpiderProject\ZhiHu\taonvlang\img'  # raw string so backslashes survive
        for id in rs:
            #print(id)
            os.makedirs(os.path.join(imgroot, str(id[0])), exist_ok=True)
            imagurl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id=' + str(id[4])

            try:
                html1 = requests.get(imagurl, headers=self.headers, timeout=5)
                reqsones = str(html1.text)
                #print(reqsones)
            except Exception as e:
                print('error:', e)
                continue  # skip this model if her album page never loaded
            urls = etree.HTML(reqsones)
            imagesurl = urls.xpath('//a[@class="mm-first"]/@href')  # album URLs for this model
            #print(imagesurl)
            ad = r'album_id=\d+'  # pull the album_id parameter out of each URL
            album_id = re.compile(ad)
            result = album_id.findall(str(imagesurl))

            for im in result:
                pturl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&%s&top_pic_id=0&page=0' % (id[4], im)
                time.sleep(1)
                html5 = requests.get(pturl, headers=self.headers, timeout=5)  # the album's photo list comes back as JSON
                print('fetching JSON photo list')
                jsons = json.loads(str(html5.text))
                try:
                    pic = jsons['picList']
                except KeyError as e:
                    print('Error:', e)
                    continue  # no photo list in this reply; move on

                for ius in pic:
                    a += 1
                    iu = ius['picUrl']
                    imurl = 'http:' + str(iu)
                    filename = os.path.join(imgroot, str(id[0]), str(a) + '.jpg')
                    print('downloading image')
                    try:
                        file = urllib.request.urlretrieve(str(imurl), filename)
                    except urllib.error.HTTPError as e:
                        print('Error:', e)

    def getInfophone(self):
        # Scrape the contact phone number off one model's profile page.
        userurl = 'https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.1.6WJFuT&userId=277949921'
        html = requests.get(userurl, headers=self.headers, timeout=5)
        html.encoding = 'GBK'  # the profile page is GBK-encoded
        print(html.encoding)
        selector = etree.HTML(str(html.text))
        phone = selector.xpath('//strong[@style="font-family: simhei;color: #000000;font-size: 24.0px;line-height: 1.5;"]|//span[@style="font-size: 24.0px;"]/text()')
        return phone

    def saveInfo(self, p):
        # Dump the profile rows to an .xls workbook, one model per row.
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('My Worksheet')
        for col, title in enumerate(('姓名', '身高', '体重', '城市')):  # name, height, weight, city
            worksheet.write(0, col, label=title)
        for row, names in enumerate(p, start=1):
            n, h, w, c = names[0], names[1], names[2], names[3]  # realName, height, weight, city
            worksheet.write(row, 0, label=str(n))
            worksheet.write(row, 1, label=str(h))
            worksheet.write(row, 2, label=str(w))
            worksheet.write(row, 3, label=str(c))
        workbook.save('Excel_Workbook.xls')

if __name__ == "__main__":
    t = tn()
    for ii in range(1, 2):  # page numbers look 1-based; widen the range to crawl more pages
        rs = t.getUrlinfo(ii)
        #print(rs)
        t.getImages(rs)
        #t.saveInfo(rs)
    #t.getInfophone()
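
As for the multithreading I mentioned: the per-model downloads are independent of each other, so they are the natural unit to fan out. The following is only a rough sketch of one way to wire it up. It uses the standard library's concurrent.futures instead of the threadpool package imported above, and it assumes getImages has been refactored into a hypothetical getImagesForModel(record) method that handles a single model (i.e. the body of the current outer for-loop):

# Rough sketch only. Assumes a hypothetical tn.getImagesForModel(record)
# that downloads one model's albums (the body of getImages' outer loop).
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_page_threaded(t, page, workers=4):
    rs = t.getUrlinfo(page)  # profile rows for this page
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(t.getImagesForModel, record) for record in rs]
        for f in as_completed(futures):
            try:
                f.result()  # re-raise any exception hit inside the worker
            except Exception as e:
                print('worker error:', e)

The work is I/O-bound, so a handful of threads already helps; the time.sleep(1) throttling should stay inside each worker so the crawl stays polite to the server.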

Below is a screenshot of the code running:

[screenshot of the crawler's console output]
Original post: https://www.cnblogs.com/Huangsh2017Come-on/p/7624033.html