爬取携程7天内的全国热门城市航班

  1 #!/usr/bin/env python
  2 # coding: utf-8
  6 
  7 import requests
  8 import pandas as pd
  9 import json,random,time,datetime
 10 
 11 # userAgent
 12 userAgent = [
 13     "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
 14     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
 15     "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
 16     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
 17     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
 18     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
 19     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
 20     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17"
 21     "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
 22     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
 23     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
 24     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
 25     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
 26 ]
 27 
 28 # get city
 29 def getCityMsg():
 30     headers = {
 31         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
 32         "Referer": "https://flights.ctrip.com/itinerary",
 33         "Content-Type": "application/json"
 34     }
 35     url = 'https://flights.ctrip.com/itinerary/api/poi/get'
 36     r = requests.get(url=url,headers=headers).text
 37 #     print(len(r))
 38     # get city msg
 39     city = {}
 40     city_load = json.loads(r).get('data')
 41     for data in city_load.keys():
 42         ## 所有航班
 43         # if data != '热门':   
 44         #     tmpdata = city_load.get(data)
 45     #         for i in tmpdata:
 46     # #             print(i)  # A 
 47     #             for k in tmpdata.get(i):   
 48     #                 name = k.get('data').split('|')
 49     #                 cityNumId = name[2]
 50     #                 cityId = name[3]
 51     #                 cityName = name[1].split('(')[0]
 52     #                 city[cityName] = [cityId, cityNumId]
 53         if data == '热门':     # 仅限热门城市
 54             tmpdata = city_load.get(data)
 55             for i in tmpdata:   # tmpdata is list , i is dict
 56                 name = i.get('data').split('|')
 57                 cityNumId = name[2]
 58                 cityId = name[3]
 59                 cityName = name[1].split('(')[0]
 60                 city[cityName] = [cityId, cityNumId]
 61 
 62             
 63     return city
 64 
 65 # 生成自今日至往后7天日期
 66 def get_date():
 67     dateList = []  # 存放时间list
 68     formatDate = datetime.datetime.now()  # 生成今日的格式化时间
 69     strDate = formatDate.strftime('%Y-%m-%d')  # 生成字符串日期
 70     stpDate = datetime.datetime.strptime(strDate,'%Y-%m-%d')  # 将字符串转为日期格式的日期
 71     for i in range(7):
 72         stpDate += datetime.timedelta(days=+1)   # 日期叠加1
 73         dateList.append(datetime.datetime.strftime(stpDate,'%Y-%m-%d'))  # 放入字典
 74     return dateList
 75 
 76 # get page text:routeList
 77 def  get_routeList(headers, load_json, cnt):
 78     try:
 79         response = requests.post(url = "https://flights.ctrip.com/itinerary/api/12808/products",data=json.dumps(load_json), headers = headers).text
 80         result = json.loads(response)["data"].get('routeList')
 81         return json.loads(response)["data"].get('routeList')
 82     except Exception as e:
 83         print('Get 【{} --> {}】 Page is failed !'.format(load_json.get('airportParams')[0].get('dcityname'), load_json.get('airportParams')[0].get('acityname')))
 84         print('休息10m后再来……')
 85         time.sleep(600)
 86         cnt += 1
 87         if cnt <= 10:
 88             get_routeList(headers, load_json, cnt)
 89         else:
 90             return None
 91 # get Data
 92 def get_data(index, df, routeList):
 93     if routeList is not None:
 94         for i, route in enumerate(routeList):
 95             if route.get('routeType') == 'Flight':  # 只要航班
 96                 index += 1
 97                 # route is dict
 98                 # we need route inside legs, legs is list, but its lengths is 1
 99                 # so we should legs[0], legs[0] is dict
100 
101                 # flight
102                 flight = route.get('legs')[0].get('flight')  # dict
108 
109                 #### about flight
110                 if flight is not None:
111                     # common attr
112                     df.loc[index,'airlineCode'] = flight.get('airlineCode')
113                     df.loc[index,'AirlineName'] = flight.get('airlineName')
114                     df.loc[index,'durationDays'] = flight.get('durationDays')
115                     df.loc[index,'flightNumber'] = flight.get('flightNumber')
116                     df.loc[index,'mealFlag'] = flight.get('mealFlag')
117                     df.loc[index,'mealType'] = flight.get('mealType')
118                     df.loc[index,'comfort'] = flight.get('comfort')
119                     df.loc[index,'craftKind'] = flight.get('craftKind')
120                     df.loc[index,'craftTypeCode'] = flight.get('craftTypeCode')
121                     df.loc[index,'craftTypeKindDisplayName'] = flight.get('craftTypeKindDisplayName')
122                     df.loc[index,'craftTypeName'] = flight.get('craftTypeName')
123                     df.loc[index,'delayedTime'] = flight.get('delayedTime')
124                     df.loc[index,'oilFee'] = flight.get('oilFee')
125                     df.loc[index,'punctualityRate'] = flight.get('punctualityRate')
126                     df.loc[index,'sharedFlightName']  = flight.get('sharedFlightName')
127                     df.loc[index,'sharedFlightNumber'] = flight.get('sharedFlightNumber')
128                     df.loc[index,'specialCraft'] = flight.get('specialCraft')
129                     df.loc[index,'stopInfo'] = flight.get('stopInfo')
130                     df.loc[index,'stopTimes'] = flight.get('stopTimes')
131                     df.loc[index,'tax'] = flight.get('tax')
132                     # arrival
133                     df.loc[index,'arrivalairportName'] = flight.get('arrivalAirportInfo').get('airportName')
134                     df.loc[index,'arrivalairportTlc'] = flight.get('arrivalAirportInfo').get('airportTlc')
135                     df.loc[index,'arrivalcityName'] = flight.get('arrivalAirportInfo').get('cityName')
136                     df.loc[index,'arrivalcityTlc'] = flight.get('arrivalAirportInfo').get('cityTlc')
137                     df.loc[index,'arrivalTerminalName'] = flight.get('arrivalAirportInfo').get('terminal').get('name')
138                     df.loc[index,'arrivalDate'] = flight.get('arrivalDate')
139                     # departure 
140                     df.loc[index,'departureairportName'] = flight.get('departureAirportInfo').get('airportName')
141                     df.loc[index,'departureairportTlc'] = flight.get('departureAirportInfo').get('airportTlc')
142                     df.loc[index,'departureCityName'] = flight.get('departureAirportInfo').get('cityName')
143                     df.loc[index,'departureCityTlc'] = flight.get('departureAirportInfo').get('cityTlc')
144                     df.loc[index,'departureTerminalName'] = flight.get('departureAirportInfo').get('terminal').get('name')
145                     df.loc[index,'departureDate'] = flight.get('departureDate')
146 
147                 #### characteristic : charactor
148                 # characteristic:charactor
149                 charactor = route.get('legs')[0].get('characteristic')  # dict
150                 if charactor is not None:
151                     df.loc[index, 'businessAircraft'] = charactor.get('businessAircraft')
152                     df.loc[index, 'discountAmount'] = charactor.get('discountAmount')
153                     df.loc[index, 'discountShowType'] = charactor.get('discountShowType')
154                     df.loc[index, 'flyMan'] = charactor.get('flyMan')
155                     df.loc[index, 'groupTicketPrice'] = charactor.get('groupTicketPrice')
156                     df.loc[index, 'hotFlight'] = charactor.get('hotFlight')
157                     df.loc[index, 'hx'] = charactor.get('hx')
158                     df.loc[index, 'infantSoldOut'] = charactor.get('infantSoldOut')
159                     df.loc[index, 'lowPriceDiscount'] = charactor.get('lowPriceDiscount')
160                     df.loc[index, 'lowestBabyCfPrice'] = charactor.get('lowestBabyCfPrice')
161                     df.loc[index, 'lowestBabyPrice'] = charactor.get('lowestBabyPrice')
162                     df.loc[index, 'lowestCfPrice'] = charactor.get('lowestCfPrice')
163                     df.loc[index, 'lowestChildAdultCfPrice'] = charactor.get('lowestChildAdultCfPrice')
164                     df.loc[index, 'lowestChildAdultPrice'] = charactor.get('lowestChildAdultPrice')
165                     df.loc[index, 'lowestChildCfPrice'] = charactor.get('lowestChildCfPrice')
166                     df.loc[index, 'lowestChildPrice'] = charactor.get('lowestChildPrice')
167                     df.loc[index, 'lowestPrice'] = charactor.get('lowestPrice')
168                     df.loc[index, 'promotion'] = charactor.get('promotion')
169                     df.loc[index, 'providerHx'] = charactor.get('providerHx')
170                     df.loc[index, 'roundTripDiscounts'] = charactor.get('roundTripDiscounts')
171                     tmp_charactor = charactor.get('standardPrices')
172                     if tmp_charactor is not None:
173                         for i, stdPrice in enumerate(tmp_charactor):
174                             diffCabinCla = stdPrice.get('cabinClass')
175                             df.loc[index, 'price' + diffCabinCla] = stdPrice.get('price')
176                     df.loc[index, 'superFlyMan'] = charactor.get('superFlyMan')
177                     df.loc[index, 'weight'] = charactor.get('weight')
178 
209     return (index, df)
210         
211 
# main function
def main(city):
    """Scrape flights between every ordered pair of cities for each date.

    For each date from get_date(), every (departure, arrival) pair of distinct
    cities in *city* is queried with get_routeList(), the rows are collected
    with get_data(), and one CSV named '【<date>】起飞航班.csv' is written to the
    working directory. Random pauses are inserted between requests, between
    departure cities and between dates.

    Args:
        city: dict mapping city name -> [cityId, cityNumId], as returned by
            getCityMsg().

    Returns:
        tuple: ``(index, df)`` of the last date processed.
    """
    # Defined up front so the final return is valid even with no dates.
    index, df = 0, pd.DataFrame()
    for flightDate in get_date():   # departure date
        df = pd.DataFrame()
        index = 0
        print(flightDate, end='\t')

        # To resume an interrupted run, iterate only over the departure cities
        # that are still missing. (`city` is a dict, so it cannot be sliced by
        # position directly; convert its items to a list first.)
        for (fromCityName, fromCityId) in city.items():  # departure city
            print(fromCityName, end='')
            for (toCityName, toCityId) in city.items():  # arrival city
                if fromCityName != toCityName:
                    print(toCityName, end='\t')
                    headers = {
                        "User-Agent": random.choice(userAgent),
                        "origin": "https://flights.ctrip.com",
                        "content-type": "application/json"
                    }
                    # NOTE(review): the token is hard-coded; presumably it was
                    # captured from a browser session and may expire - confirm.
                    load_json = {
                        "airportParams":[
                            {"dcity":fromCityId[0],"dcityname":fromCityName,"acity":toCityId[0],"acityname":toCityName,"date":flightDate,"dcityid":fromCityId[1],"acityid":toCityId[1]}
                        ],
                        "classType": "ALL",
                        "date": flightDate,
                        "flightWay": "Oneway",
                        "hasBaby": False,
                        "hasChild": False,
                        "searchIndex": 1,
                        "token": "a4d91efc14f95ad7e1abaf914da140f3"
                    }
                    # The attempt counter starts at 1 for every city pair.
                    routeList = get_routeList(headers, load_json, 1)
                    if routeList is not None:  # no flights: skip this pair
                        (index, df) = get_data(index, df, routeList)
                time.sleep(random.choice(range(2)))
            print('\n' + '--'*50)
            time.sleep(random.choice(range(3)))
            print('【{}】起飞,抓完!'.format(fromCityName))
        time.sleep(random.choice(range(60,90)))
        print(df.shape)
        csv_path = '【{}】起飞航班.csv'.format(flightDate)
        # Report the date that was finished (previously the last departure
        # city name was printed here by mistake).
        print('起飞日期:{},抓完,写入文件!'.format(flightDate))
        print(csv_path)
        df.to_csv(csv_path,index=False, encoding='utf-8')
    return (index, df)
269 
270 
271 
if __name__ == "__main__":
    # Script entry point: fetch the hot-city lookup table from Ctrip, then
    # scrape every city pair for each date and write the per-date CSV files.
    city = getCityMsg()
    (index, df) = main(city)

降低爬取速度(加大请求间隔)后可用。

原文地址:https://www.cnblogs.com/Alexisbusyblog/p/12580891.html