爬取易车网所有车系车型数据

下面就是源代码,我是直接把数据保存在字典并且打印到控制台的。


  1 # -*- coding: utf-8 -*-
  2 from lxml import etree
  3 import requests,re,random
  4 import time,redis
  5 from myweb.yichewang import user_agent
  6 import urllib.request
  7 from concurrent.futures import ThreadPoolExecutor
  8 
  9 import pymysql
 10 from datetime import datetime
 11 from wxpy import *
 12 #bot = Bot(cache_path=True)#微信端监控运行情况
 13 
 14 def run_time(func):
 15     def wrap(*arg,**kwargs):
 16         start_time = time.time()
 17         func(*arg,**kwargs)
 18         print('获取车型链接类方法运行时间为:',time.time() - start_time)
 19         return func
 20         #bot.file_helper.send('获取车型链接类方法运行时间为:',time.time() - start_time)
 21     return wrap
 22 
 23 
 24 def try_expect(funcs):
 25     def getwrap(*arg,**kwargs):
 26         try:
 27             funcs(*arg,**kwargs)
 28             print('时间:',datetime.now(),'-----------未发生异常------------')
 29             #bot.file_helper.send('时间:',datetime.now(),'未发生异常!!!!!!!!!!!!!!')
 30         except Exception:
 31             return '-----------出错啦!!!!!!!!!!'
 32         #return funcs
 33     return getwrap
 34 
 35 class Yi_car_data(object):
 36 
 37     headers = {'Referer':'http://i.yiche.com/authenservice/login.html?returnurl=http%3A%2F%2Fguangzhou.bitauto.com%2F%3Freferrer%3Dhttp%3A%2F%2Fi.yiche.com%2Fauthenservice%2FAboutPassWord%2FResetPasswordResult.aspx%3Freturnurl%3Dhttp%253a%252f%252fi.yiche.com%252fu27686084%252f'}
 38     login_url = 'http://i.yiche.com/ajax/Authenservice/login.ashx'
 39     data ={
 40         'txt_LoginName':'15766264244',
 41         'txt_Password':'123456789aa',
 42         'txt_Code':'',
 43         'cbx_keepState':'true',
 44         'returnurl':'http://guangzhou.bitauto.com/?referrer=http://i.yiche.com/authenservice/AboutPassWord/ResetPasswordResult.aspx?returnurl=http://i.yiche.com/u27686084/',
 45         'guid':'',
 46         'Gamut':'true'
 47 
 48     }
 49     cartype_url = 'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=0'
 50 
 51     def __init__(self):
 52         #client = redis.Redis(host='127.0.0.1',port=int(6379))
 53         self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
 54         self.r = redis.Redis(connection_pool=self.pool)
 55         #self.bot = Bot(cache_path=True)
 56         self.s_requests = requests.Session()
 57         html = self.s_requests.post(self.login_url,data=self.data,headers = self.headers,timeout=2)
 58         print(html.status_code)
 59 
 60 
 61     def Car_types_all_datas(self):
 62 
 63         global reponse
 64         self.headers['User-Agent'] = random.choice(user_agent.user_agent_list)
 65         try:
 66             while True:
 67                 time.sleep(1)
 68                 reponse = self.s_requests.get(self.cartype_url,headers=self.headers,timeout=3)
 69                 #print(reponse.text[132:])
 70                 if reponse.status_code ==200 or '奥迪' in reponse.content:
 71                     break
 72                 else:
 73                     print('downing field')
 74         except Exception as e:
 75             print('------------抛出异常----------------:',e)
 76         cartypeurl_re_patten = 'url:"(.*?)"'
 77         cartypename_re_patten = 'name:"(.*?)"'
 78         patten_cartypeurl = re.compile(cartypeurl_re_patten)
 79         patten_cartypename = re.compile(cartypename_re_patten)
 80         cartype_url = patten_cartypeurl.findall(str(reponse.text))
 81         cartype_name = patten_cartypename.findall(str(reponse.text))
 82         if len(cartype_url)==0:
 83             if len(cartype_name)==0:
 84                 print('------------------------解析数据为空---------------------')
 85 
 86         else:
 87             cartype_datas = list(zip(cartype_url,cartype_name))
 88             #print(type(str(self.r.get('car_url_name'))))
 89             try:
 90                 if str(cartype_datas) in str(self.r.get('car_url_name')):
 91                     print('---------------数据有重复-------------------------')
 92                 else:
 93                     self.r.set('car_url_name',str(cartype_datas))
 94             except Exception as e:
 95                 print(e)
 96                 print('----------------------------插入redis失败--------------------')
 97             #print(list(zip(cartype_url,cartype_name)))
 98             return cartype_datas
 99 
100     @try_expect #捕获运行异常
101     @run_time #计算方法运行时间
102     def car_type_all_info_time_expect(self):
103         self.Car_types_all_datas()
104 
105 
106     def car_type_all_info(self):
107         car_type_url = []
108         for url_name in self.Car_types_all_datas():
109             #print(url_name)
110             car_type_url.append('http://car.bitauto.com/'+str(url_name[0]))
111         #print(car_type_url)
112         return car_type_url#返回全部车品牌链接数据
113 
114 
115     def get_car_info(self,url):
116         global req
117         self.headers['User-Agent'] = random.choice(user_agent.user_agent_list)
118         try:
119             req = self.s_requests.get(url,headers=self.headers,timeout=5)
120             #print(req.text)
121         except:
122             print('--------------------出错啦---------------------------!!!!!!!!!!')
123         select  = etree.HTML(str(req.text))
124         car_url =['http://car.bitauto.com/'+str(i) for i in select.xpath('//li[@class="name"]/a/@href')]
125         car_name = select.xpath('//li[@class="name"]/a/@title')
126 
127         if len(car_url)==0 or len(car_name)==0:
128             print('----------------提取数据失败-----------------------')
129         elif len(car_url)!=0 or len(car_name)!=0:
130             #print(list(zip(car_url,car_name)))
131             return list(zip(car_url, car_name))#返回每款车品牌的车系链接和名字
132         else:
133             print('#######################################')
134 
135     def get_all_cartypes_info(self,infourl):
136 
137         data ={}
138         global reqs
139         self.headers['User-Agent'] = random.choice(user_agent.user_agent_list)
140         try:
141             time.sleep(1)
142             reqs = self.s_requests.get(infourl, headers=self.headers, timeout=5)
143             print(req.status_code)
144         except Exception as e:
145             print('------------出错啦-----------',e)
146         carxpath = etree.HTML(str(reqs.text))
147 
148         car_type = carxpath.xpath('//a[@class="txt"]/text()')#车型
149         car_price = carxpath.xpath('//span[@class="price"]/text()')#车价
150         #a =carxpath.xpath('//a[@class="lnk-bzl"]/text()|//a[@class="data"]/text()')
151         car_pailiang = '<span class="data" title="(.*?)">(.*?)</span>'#排量
152         car_zidong = '<span class="data">(.*?)</span>'#变速箱
153         #car_baozhilv = '<a class="lnk-bzl" href="/zhongxingche/baozhilv/" target="_blank" data-channelid="2.21.2032" data_cyslogclickflag="2.21.2032" onclick="BglogPostLog('2.21.2032',this);">(.*?) </a>'#保值率
154         #car_youhao = '<a class="data" data-channelid="2.21.855" target="_blank" href="http://car.bitauto.com/quanxinaodia4l/youhao/(.*?)">(.*?) </a>'#油耗
155         pailiang = re.compile(car_pailiang).findall(str(reqs.text),re.X)[0][1]
156         biansuxiang = re.compile(car_zidong).findall(str(reqs.text))[0]
157         car_baozhilv = carxpath.xpath('//a[@class="lnk-bzl"]/text()')[0][:6]
158         car_youhao = carxpath.xpath('//a[@class="data"]/text()')[0][:8]
159         data['车型'] = car_type
160         data['车价'] = car_price
161         data['排量'] = pailiang
162         data['变速箱'] = biansuxiang
163         data['五年保值率'] = car_baozhilv
164         data['油耗'] = car_youhao
165         print(data)
166         #yield data#存储到mongodb
167 
168 
def all_run_main():
    """Yield the series list of every brand, one brand at a time.

    Creates a ``Yi_car_data`` scraper, runs the timed/guarded brand-tree
    fetch once (for its log output), then walks every brand URL and yields
    the ``[(series_url, series_name), ...]`` list returned by
    ``get_car_info``.

    Brands whose page raised or produced no data are skipped, so a failure
    never re-yields the previous brand's result.
    """
    tn_car = Yi_car_data()
    # Only used for its timing / exception log output.
    tn_car.car_type_all_info_time_expect()
    for brand_url in tn_car.car_type_all_info():
        try:
            series = tn_car.get_car_info(brand_url)
        except Exception as e:
            print(e)
            continue
        if series:
            yield series
187 
188 
def threading_run_main():
    """Scrape every series page of the first brand using a thread pool.

    Takes the first series list produced by ``all_run_main`` and fetches
    each series URL with ``Yi_car_data.get_all_cartypes_info`` on up to 100
    worker threads. Worker exceptions are printed, not lost, and the pool
    is shut down before the function returns.

    NOTE(review): only the first brand is processed, as in the original
    ``list(all_run_main())[0]``; confirm whether all brands were intended.
    """
    # Imported locally so this block does not depend on the file-level imports.
    from concurrent.futures import as_completed

    tn_car = Yi_car_data()
    # next() stops after the first brand and does not crawl every brand
    # page just to discard all but one result.
    first_brand = next(all_run_main(), None) or []
    car_urls = [series[0] for series in first_brand]
    print('链接数量:', len(car_urls))
    if not car_urls:
        return

    with ThreadPoolExecutor(100) as pool:  # ----worker thread count-----
        pending = {
            pool.submit(tn_car.get_all_cartypes_info, series_url): series_url
            for series_url in car_urls
        }
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as e:
                # Best effort: report the failing URL and keep going.
                print('------------出错啦-----------', pending[future], e)
198 
199 
200 
if __name__ == "__main__":
    # Script entry point: run the threaded crawl of the series pages.
    threading_run_main()

    # MongoDB handles for storing results.
    import pymongo
    conn = pymongo.MongoClient('localhost', 27017)   # connect to the server
    db = conn.fangtianxia
    items = db.items
    # NOTE(review): the original called items.insert('') here. An empty
    # string is not a document, so the call cannot succeed, and
    # Collection.insert is a legacy API. It is removed until the crawl
    # hands back documents to store.
    # TODO: persist the dicts returned by get_all_cartypes_info, e.g. with
    # items.insert_one(document).


 
原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/7904570.html