A Python crawler example (scraping flight information)

I. Create a new project and add a Flight_Info.py file

1. Write a main-program entry point:

# Main program
if __name__ == '__main__':
    try:
        py_info()  # looping crawl routine

    # crawl failed
    except Exception as e:
        print('Crawl error: ' + str(e))  # str() needed: an Exception cannot be concatenated to a str

2. Inspect the target page's HTML and locate the elements to scrape. As a starting point, see the sketch below.
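
The original post shows no code for this step. A minimal sketch: fetch one detail page and dump the elements the later steps rely on, to confirm the CSS selectors before writing the real crawler. The flight number, airport codes, and date in the URL are placeholder values, not from the post.

import requests
from bs4 import BeautifulSoup

# Fetch one detail page and print the candidate elements,
# so the selectors can be verified by eye.
url = ('http://happiness.variflight.com/info/detail'
       '?fnum=CA1519&dep=PEK&arr=SHA&date=2017-12-25&type=1')  # placeholder query values
html = requests.get(url, timeout=30).text
soup = BeautifulSoup(html, 'html.parser')

for selector in ('div[class="fl three-lef"]',   # departure time
                 'div[class="fr three-rig"]',   # arrival time
                 'p[class="one"]',              # distance (km)
                 'p[class="two"]'):             # duration (minutes)
    for el in soup.select(selector):
        print(selector, '->', el.get_text(strip=True))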

3. Scrape flight information based on the URL parameters:

ok_ip = []   # usable proxy IPs
all_ip = []  # full proxy IP list
ok = []      # return value

# Return flight information for the given flight parameters
def get_content(fnum, dep, arr, date, type):
    global ok_ip
    global all_ip
    global ok
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)
    # The first request goes out over the local IP
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Daily query limit reached? Switch to a proxy IP
    if content.find("此类查询已达当日上限") > 0:  # rate-limit marker; the full listing below checks several such markers
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a known-good proxy exists
            iptext = ok_ip[0]
            # Query limit reached, switch IP
            proxies = {'http': 'http://' + iptext, 'https': 'http://' + iptext}
            try:
                # proxies: proxy IP; timeout: request timeout in seconds
                content = requests.get(url, proxies=proxies, timeout=30).text
                soup = BeautifulSoup(content, 'html.parser')
                # Has this proxy hit the limit too?
                if content.find("此类查询已达当日上限") > 0:
                    if iptext in ok_ip:
                        ok_ip.remove(iptext)  # drop the exhausted proxy
            except:
                pass

        else:  # no known-good proxy: walk the full list
            for iptext in all_ip:
                # Query limit reached, switch IP
                proxies = {'http': 'http://' + iptext, 'https': 'http://' + iptext}
                try:
                    content = requests.get(url, proxies=proxies, timeout=30).text
                    soup = BeautifulSoup(content, 'html.parser')
                    # Has this proxy hit the limit too?
                    if content.find("此类查询已达当日上限") > 0:
                        if iptext in ok_ip:
                            ok_ip.remove(iptext)  # drop the exhausted proxy
                        continue
                    # Usable proxy found: stop looping
                    else:
                        ok_ip.append(iptext)  # remember it as usable
                        print('Current usable IP: ' + iptext)
                        break
                except:
                    continue

    # No flight information found on the page
    if content.find("没有找到您输入的航班信息") > 0:
        ok = []
    # Query succeeded
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except:
            print('Failed to scrape flight ' + fnum)
            return ok
    # Return the flight information
    return ok
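For a quick sanity check, get_content can be called directly. The flight number and airport codes below are placeholder values, not from the post:

# Placeholder query: flight CA1519 from PEK to SHA on today's date, type 1
info = get_content('CA1519', 'PEK', 'SHA', time.strftime("%Y-%m-%d"), '1')
print(info)  # [] when no flight matched; otherwise the scraped tuple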

4. Loop over flights automatically

# Looping crawl
def py_info():
    # Scrape flight information in bulk
    newhb = ''   # most recently stored flight
    szm_cf = ''  # departure three-letter code
    szm_md = ''  # destination three-letter code
    hbb = ''     # flight number
    try:
        # Loop over the flights listed in the flight-list text file
        with open('航班列表.txt') as hb_txt:
            all_text = hb_txt.read().splitlines()
        # Fetch the most recently stored flight from the database
        newhb = ReadPGSQL()
        # Resume at the last stored flight, or start from the top when the DB is empty
        hisindex = all_text.index(newhb) if newhb != '' else 0
        for hb in all_text[hisindex:]:
            szm_list = hb.split("\t", 1)[0]
            szm_cf = szm_list[0:3]  # departure three-letter code
            szm_md = szm_list[3:6]  # destination three-letter code
            hbb = hb.split("\t", 1)[1]  # flight number
            hblx = '1'  # flight type
            hbrq = time.strftime("%Y-%m-%d")  # date
            save(hbb, szm_cf, szm_md, hbrq, hblx)  # persist the flight info
            print('Finished scraping flight ' + hbb)
        print('Crawl finished!')

    # On error, stop and record the failing flight
    except:
        print('Error while saving a flight')
        Error(szm_cf, szm_md, hbb)  # log the failing flight
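ReadPGSQL, save, and Error are not shown in the post. A minimal sketch of what ReadPGSQL might look like with psycopg2, returning a value in the same "PEKSHA\tCA1519" format as the lines of 航班列表.txt; the connection settings and the flight_info table are assumptions, only the column names are reused from the FLPGSQL call in step 5:

import psycopg2

def ReadPGSQL():
    # Sketch: fetch the most recently stored flight, '' when the table is empty.
    # Host/credentials and the flight_info table name are assumed, not the author's.
    conn = psycopg2.connect(host='localhost', dbname='flights',
                            user='postgres', password='postgres')
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT szm_str || szm_end || E'\\t' || hbh "
                        "FROM flight_info ORDER BY updatetime DESC LIMIT 1")
            row = cur.fetchone()
            return row[0] if row else ''
    finally:
        conn.close()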

5. Parse the HTML

# Parse the flight information out of the HTML
def get_info(fnum, soup, dep, arr):
    try:
        hbh = fnum

        phdate = time.strftime("%Y-%m-%d")  # scrape date

        szm_str = dep

        szm_end = arr

        str_time = ''
        # Find the divs whose class is "fl three-lef"
        for li in soup.select('div[class="fl three-lef"]'):  # departure time
            str_time = li.get_text()  # text content

        end_time = ''
        for li in soup.select('div[class="fr three-rig"]'):  # arrival time
            end_time = li.get_text()

        jt = '无经停'  # default: no stopover
        for li in soup.select('div[class="fl three-mid"]'):  # stopover
            jt = li.get_text()
            if jt != '无经停':
                jt = jt[4:]  # drop the leading 4-character label

        km = ''
        for li in soup.select('p[class="one"]'):  # distance (km)
            km = li.get_text()
            km = km[4:]

        km_time = ''
        for li in soup.select('p[class="two"]'):  # duration (minutes)
            km_time = li.get_text()
            km_time = km_time[4:]

        jx = ' '
        for li in soup.select('span[style="max-75px!important"]'):  # aircraft type
            jx = li.get_text()

        jxdx = ''  # aircraft size class
        if soup.select('span[title="大型客机"]'):
            jxdx = '大型客机'  # large airliner
        elif soup.select('span[title="中型客机"]'):
            jxdx = '中型客机'  # medium airliner
        elif soup.select('span[title="小型客机"]'):
            jxdx = '小型客机'  # small airliner

        can = ''
        if soup.select('span[class="totalCont"]'):
            can = '提供'  # meal provided

        pf = ''
        for li in soup.select('span[class="score cur"]'):  # comfort score
            pf = li.get_text()

        updatetime = time.strftime("%Y-%m-%d")  # update time

        try:
            FLPGSQL(hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                    km_time, jx, jxdx, can, pf, updatetime)  # write to the database
        except:
            print('Database insert failed')
            Error(szm_str, szm_end, hbh)  # log the failing flight

    finally:
        # Note: returning from finally also swallows any exception raised above
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)
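FLPGSQL is likewise not shown in the post. A minimal sketch of the insert, assuming the same hypothetical flight_info table and connection settings as the ReadPGSQL sketch above:

def FLPGSQL(hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
            km_time, jx, jxdx, can, pf, updatetime):
    # Sketch: insert one scraped flight row. Schema and credentials are assumed.
    conn = psycopg2.connect(host='localhost', dbname='flights',
                            user='postgres', password='postgres')
    try:
        with conn.cursor() as cur:
            cur.execute(
                'INSERT INTO flight_info (hbh, phdate, szm_str, szm_end, '
                'str_time, end_time, jt, km, km_time, jx, jxdx, can, pf, '
                'updatetime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, '
                '%s, %s, %s, %s)',
                (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                 km_time, jx, jxdx, can, pf, updatetime))
        conn.commit()
    finally:
        conn.close()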

Full code (scraping a single flight):

import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
import requests
from lxml import etree
import datetime
import time
import html
import csv
import psycopg2
import socket
import sys
import os


# Parse the flight information out of the HTML
def get_info(fnum, soup, dep, arr):
    try:
        hbh = fnum

        phdate = time.strftime("%Y-%m-%d")  # scrape date

        szm_str = dep

        szm_end = arr

        str_time = ' '
        for li in soup.select('div[class="fl three-lef"]'):  # departure time
            str_time = li.get_text()

        end_time = ' '
        for li in soup.select('div[class="fr three-rig"]'):  # arrival time
            end_time = li.get_text()

        jt = ' '
        for li in soup.select('div[class="fl three-mid"]'):  # stopover
            jt = li.get_text()
            if jt != ' ':
                jt = jt[4:]  # drop the leading 4-character label

        km = ''
        for li in soup.select('p[class="one"]'):  # distance (km)
            km = li.get_text()
            km = km[4:]

        km_time = ' '
        for li in soup.select('p[class="two"]'):  # duration (minutes)
            km_time = li.get_text()
            km_time = km_time[4:]

        jx = ' '
        for li in soup.select('span[style="max-75px!important"]'):  # aircraft type
            jx = li.get_text()

        jxdx = ' '  # aircraft size class
        if soup.select('span[title="大型客机"]'):
            jxdx = '大型客机'  # large airliner
        elif soup.select('span[title="中型客机"]'):
            jxdx = '中型客机'  # medium airliner
        elif soup.select('span[title="小型客机"]'):
            jxdx = '小型客机'  # small airliner

        can = ' '
        if soup.select('span[class="totalCont"]'):
            can = '提供'  # meal provided

        pf = ' '
        for li in soup.select('span[class="score cur"]'):  # comfort score
            pf = li.get_text()

        updatetime = time.strftime("%Y-%m-%d")  # update time

    finally:
        return (hbh, phdate, szm_str, szm_end, str_time, end_time, jt, km,
                km_time, jx, jxdx, can, pf, updatetime)


ok_ip = []   # usable proxy IPs
all_ip = []  # full proxy IP list
ok = []      # return value

# Request the detail page for the given flight parameters
def get_content(fnum, dep, arr, date, type):
    global ok_ip
    global all_ip
    global ok
    url = ('http://happiness.variflight.com/info/detail?fnum=' + fnum +
           '&dep=' + dep + '&arr=' + arr + '&date=' + date + '&type=' + type)
    # The first request goes out over the local IP
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html.parser')

    # Daily limit or gateway error? Switch to a proxy IP
    if (content.find("Notifica: timeout del gateway") > 0
            or content.find("The requested URL could not be retrieved") > 0
            or content.find("main notFound") > 0
            or content.find("此类查询已达当日上限") > 0):
        with open('代理IP(2017-12-25).txt') as ipinfo:
            all_ip = ipinfo.read().splitlines()

        if len(ok_ip) > 0:  # a known-good proxy exists
            iptext = ok_ip[0]
            # Query limit reached, switch IP
            proxies = {'http': 'http://' + iptext, 'https': 'http://' + iptext}
            try:
                content = requests.get(url, proxies=proxies).text
                # , timeout=120
                # socket.setdefaulttimeout(150)  # let the loop continue after a timeout
                soup = BeautifulSoup(content, 'html.parser')
                # Has this proxy hit the limit too?
                if (content.find("Notifica: timeout del gateway") > 0
                        or content.find("The requested URL could not be retrieved") > 0
                        or content.find("main notFound") > 0
                        or content.find("此类查询已达当日上限") > 0):
                    ok_ip.remove(iptext)  # drop the exhausted proxy
            except:
                pass

        else:  # no known-good proxy: walk the full list
            for iptext in all_ip:
                # Query limit reached, switch IP
                proxies = {'http': 'http://' + iptext, 'https': 'http://' + iptext}
                try:
                    content = requests.get(url, proxies=proxies).text
                    # , timeout=120
                    # socket.setdefaulttimeout(150)  # let the loop continue after a timeout
                    soup = BeautifulSoup(content, 'html.parser')

                    # Has this proxy hit the limit too?
                    if (content.find("502 Bad Gateway") > 0
                            or content.find("Notifica: timeout del gateway") > 0
                            or content.find("The requested URL could not be retrieved") > 0
                            or content.find("main notFound") > 0
                            or content.find("此类查询已达当日上限") > 0):
                        if iptext in ok_ip:
                            ok_ip.remove(iptext)  # drop the exhausted proxy
                        continue
                    # Usable proxy found: stop looping
                    else:
                        ok_ip.append(iptext)  # remember it as usable
                        print('Current usable IP: ' + iptext)
                        break
                except:
                    continue

    # No flight information found on the page
    if content.find("没有找到您输入的航班信息") > 0:
        ok = []
    # Query succeeded
    else:
        try:
            ok = get_info(fnum, soup, dep, arr)
        except:
            return ok
    # Return the flight information
    return ok


# Write one row to a CSV file
def save(fnum, dep, arr, date, type):
    # Fetch the flight information
    try:
        content = get_content(fnum, dep, arr, date, type)
        # Open the CSV for appending and write the scraped row
        with open('Flight_Info.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerows([content])
    except:
        pass


hbb = ''
szm_cf = ''
szm_md = ''
# Interactive single-flight crawl
def py_info():
    global hbb
    global szm_cf
    global szm_md
    try:
        print('Enter the flight number:')
        hbb = input()  # flight number
        print('Enter the departure three-letter code:')
        szm_cf = input()  # departure three-letter code
        print('Enter the destination three-letter code:')
        szm_md = input()  # destination three-letter code
        hblx = '1'  # flight type, default 1
        hbrq = time.strftime("%Y-%m-%d")  # date, default today
        save(hbb, szm_cf, szm_md, hbrq, hblx)  # save to the CSV file
        print('Finished scraping flight ' + hbb)

    # Skip errors and keep going
    except:
        print('Error scraping flight ' + hbb + ' ' + szm_cf + szm_md)  # report the failing flight
        pass


# Main program
if __name__ == '__main__':
    py_info()
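The proxy rotation above only discovers good IPs lazily, while a request is already failing. An optional pre-filter, run once at startup, can weed out dead proxies before the crawl begins; this is a sketch, and the test URL and 5-second timeout are arbitrary choices, not from the original post:

def filter_proxies(path='代理IP(2017-12-25).txt', test_url='http://httpbin.org/ip'):
    # Sketch: keep only the proxies that can complete a simple GET.
    good = []
    with open(path) as f:
        for ip in f.read().splitlines():
            proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
            try:
                requests.get(test_url, proxies=proxies, timeout=5)
                good.append(ip)
            except:
                continue
    return good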
Original article: https://www.cnblogs.com/lx07/p/8144340.html