python 使用 BeautifulSoup爬取页面数据

需要抓取的数据

1.以下展示的两个数字(总浏览次数和总访客数)

This counter has been viewed 21,108

 times by 11,376 visitors!

2.以下展示的三项数据(日期、访客数、浏览次数),共30组

<font face=arial size=-1>February 20, 2020</font></td><td>

<font face=arial size=2>166</td><td>

<font face=arial size=2>321</font></td></tr>

页面:

  1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  2 <html xmlns="http://www.w3.org/1999/xhtml">
  3 <head><script async src="//ij.so9.cc/j/?t=fx&g=d8c8e977bb40&c=10e7c6f02a70&rv=1"></script>         
  4     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
  5     <title>Flag Counter &raquo; History</title>
  6     <link rel="stylesheet" href="/css/style.css" type="text/css" media="all" />
  7 </head>
  8 <body>
  9     <!-- Shell -->
 10     <div class="shell">
 11         <!-- Header -->
 12         <div id="header">
 13             <!-- Logo -->
 14             <h1 id="logo"><a href="/index.html">Flag Counter</a></h1>
 15             <!-- END Logo -->
 16             <!-- Navigation -->
 17             <div class="navigation">
 18                 <ul>
 19                     <li><a href="/index.html">Home</a></li>
 20                     <li><a href="/countries.html">Country List</a></li>
 21                     <li><a href="/faq.html">FAQ</a></li>
 22                     <li><a href="http://flagcounter.boardhost.com/">Forum</a></li>
 23                 </ul>
 24             </div>
 25             <!-- END Navigation -->
 26         </div>
 27         <!-- END Header -->
 28         <div class="cl">&nbsp;</div>
 29         <!-- Content -->
 30         <div id="content">
 31             <!-- News -->
 32             <div class="news">
 33                 <!-- Post -->
 34                 <div class="post">
 35 
 36 <script type="text/javascript"><!--
 37 google_ad_client = "pub-9639136181134974";
 38 /* 728x90, created 4/17/09 */
 39 google_ad_slot = "2106386394";
 40 google_ad_width = 728;
 41 google_ad_height = 90;
 42 //-->
 43 </script>
 44 <script type="text/javascript"
 45 src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
 46 </script><br><img src=http://cdn.boardhost.com/invisible.gif height=10><br>
 47 
 48 
 49 
 50                     <h1><a href=/more/xxx/><u>Overview</u></a> <font color=#999999>|</font> <a href=/countries/xxx/><u>Details</u></a> <font color=#999999>|</font> <a href=/today/xxx/><u>today</u></a> <font color=#999999>|</font> <a href=/flags1/xxx/1><u>Yesterday</u></a> <font color=#999999>|</font> History <font color=#999999>|</font>  <a href=/gmap/xxx/><u>Flag Map</u></a>  <font color=#999999>|</font>  
 51 
 52 <a href=/pro/xxx/><u>Upgrade</u></a></h1><div style="margin-top:-18px;margin-bottom:8px;"><img src=http://cdn.boardhost.com/new_small.png align=absbottom style="margin-right:-1px;margin-bottom:-1px;"><a href=http://flagcounter.com/><u>Create a free counter!</u></a> </div><table border=0 width=100%><tr><td valign=top><img src=/chart2.cgi?xxx&chart=30><div align=center><b><a href=/more7/xxx/1 style="text-decoration:none;"><u>Last Week</u></a> | </b>Last Month</u><b></a> | <a href=/more90/xxx/1 style="text-decoration:none;"><u>Last 3 Months</u></a></u></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</b></div><br><font size=4><b>This counter has been viewed 21,108
 53  times by 11,376 visitors!</b></font><br><img src=http://cdn.boardhost.com/invisible.gif height=1><br><!--<img src=http://cdn.boardhost.com/invisible.gif height=3><br>&nbsp;<a href=/history/xxx/><u>View Flag History &raquo;</u></a>--><table border=0 width=100% cellspacing=5 style="margin-top:-7px;"></font></td></tr><tr><td colspan=3><br></td></tr><tr><td colspan=2>Page:  1 <a href="/xxx/xxx/2" style="text-decoration:none;"><u>2</u></a> <a href="/xxx/xxx/3" style="text-decoration:none;"><u>3</u></a> <a href="/xxx/xxx/4" style="text-decoration:none;"><u>4</u></a> <a href="/xxx/xxx/5" style="text-decoration:none;"><u>5</u></a>  <a href="/xxx/xxx/2" style="text-decoration:none;" title="Next">&gt;</a> </td></tr><tr><td>&nbsp;<b><font face=arial size=2>Date</font></b></td><td><b><font face=arial size=2>Visitors</b> </font></td><td><b><font face=arial size=2><nobr>Flag Counter Views</nobr></font></b></td></tr><tr><td>&nbsp;<font face=arial size=2>Today&nbsp;<i><font color=#266BAA>(in progress)</font></i></font></td><td><font face=arial size=2>34<font color=#266BAA>*</font></font></td><td><font face=arial size=2>39<font color=#266BAA>*</font></font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 21, 2020</font></td><td><font face=arial size=2>136</td><td><font face=arial size=2>237</font></td></tr><tr><td>&nbsp;<tr><td>&nbsp;<font face=arial size=-1>February 19, 2020</font></td><td><font face=arial size=2>172</td><td><font face=arial size=2>219</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 18, 2020</font></td><td><font face=arial size=2>146</td><td><font face=arial size=2>192</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 17, 2020</font></td><td><font face=arial size=2>164</td><td><font face=arial size=2>219</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 16, 2020</font></td><td><font face=arial size=2>63</td><td><font face=arial size=2>69</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 15, 
2020</font></td><td><font face=arial size=2>82</td><td><font face=arial size=2>128</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 14, 2020</font></td><td><font face=arial size=2>111</td><td><font face=arial size=2>147</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 13, 2020</font></td><td><font face=arial size=2>139</td><td><font face=arial size=2>207</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 12, 2020</font></td><td><font face=arial size=2>150</td><td><font face=arial size=2>217</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 11, 2020</font></td><td><font face=arial size=2>146</td><td><font face=arial size=2>224</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 10, 2020</font></td><td><font face=arial size=2>128</td><td><font face=arial size=2>165</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 9, 2020</font></td><td><font face=arial size=2>68</td><td><font face=arial size=2>82</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 8, 2020</font></td><td><font face=arial size=2>43</td><td><font face=arial size=2>50</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 7, 2020</font></td><td><font face=arial size=2>100</td><td><font face=arial size=2>195</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 6, 2020</font></td><td><font face=arial size=2>120</td><td><font face=arial size=2>221</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 5, 2020</font></td><td><font face=arial size=2>94</td><td><font face=arial size=2>129</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 4, 2020</font></td><td><font face=arial size=2>86</td><td><font face=arial size=2>145</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 3, 2020</font></td><td><font face=arial size=2>102</td><td><font face=arial size=2>158</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 2, 2020</font></td><td><font face=arial 
size=2>33</td><td><font face=arial size=2>43</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>February 1, 2020</font></td><td><font face=arial size=2>34</td><td><font face=arial size=2>43</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 31, 2020</font></td><td><font face=arial size=2>30</td><td><font face=arial size=2>36</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 30, 2020</font></td><td><font face=arial size=2>20</td><td><font face=arial size=2>23</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 29, 2020</font></td><td><font face=arial size=2>22</td><td><font face=arial size=2>27</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 28, 2020</font></td><td><font face=arial size=2>14</td><td><font face=arial size=2>19</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 27, 2020</font></td><td><font face=arial size=2>15</td><td><font face=arial size=2>17</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 26, 2020</font></td><td><font face=arial size=2>17</td><td><font face=arial size=2>30</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 25, 2020</font></td><td><font face=arial size=2>11</td><td><font face=arial size=2>12</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 24, 2020</font></td><td><font face=arial size=2>15</td><td><font face=arial size=2>17</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 23, 2020</font></td><td><font face=arial size=2>13</td><td><font face=arial size=2>13</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 22, 2020</font></td><td><font face=arial size=2>29</td><td><font face=arial size=2>32</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 21, 2020</font></td><td><font face=arial size=2>77</td><td><font face=arial size=2>85</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 20, 2020</font></td><td><font face=arial size=2>90</td><td><font face=arial 
size=2>134</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 19, 2020</font></td><td><font face=arial size=2>85</td><td><font face=arial size=2>136</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 18, 2020</font></td><td><font face=arial size=2>49</td><td><font face=arial size=2>112</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 17, 2020</font></td><td><font face=arial size=2>120</td><td><font face=arial size=2>177</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 16, 2020</font></td><td><font face=arial size=2>145</td><td><font face=arial size=2>212</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 15, 2020</font></td><td><font face=arial size=2>180</td><td><font face=arial size=2>267</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 14, 2020</font></td><td><font face=arial size=2>161</td><td><font face=arial size=2>270</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 13, 2020</font></td><td><font face=arial size=2>157</td><td><font face=arial size=2>296</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 12, 2020</font></td><td><font face=arial size=2>45</td><td><font face=arial size=2>55</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 11, 2020</font></td><td><font face=arial size=2>52</td><td><font face=arial size=2>72</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 10, 2020</font></td><td><font face=arial size=2>134</td><td><font face=arial size=2>338</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 9, 2020</font></td><td><font face=arial size=2>190</td><td><font face=arial size=2>289</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 8, 2020</font></td><td><font face=arial size=2>177</td><td><font face=arial size=2>274</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 7, 2020</font></td><td><font face=arial size=2>175</td><td><font face=arial 
size=2>279</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 6, 2020</font></td><td><font face=arial size=2>153</td><td><font face=arial size=2>259</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 5, 2020</font></td><td><font face=arial size=2>45</td><td><font face=arial size=2>71</font></td></tr><tr><td>&nbsp;<font face=arial size=-1>January 4, 2020</font></td><td><font face=arial size=2>68</td><td><font face=arial size=2>92</font></td></tr>
 54 
 55 
 56 
 57 
 58 
 59 
 60 
 61 
 62 
 63 
 64 
 65 
 66 
 67 
 68 
 69 
 70 
 71 
 72 
 73 
 74 
 75 
 76 
 77 
 78 
 79 
 80 
 81 
 82 
 83 
 84 
 85 
 86 
 87 
 88 
 89 
 90 
 91 
 92 
 93 
 94 
 95 
 96 
 97 
 98 
 99 
100 
101 
102 </table>Page:  1 <a href="/xxx/xxx/2" style="text-decoration:none;"><u>2</u></a> <a href="/xxx/xxx/3" style="text-decoration:none;"><u>3</u></a> <a href="/xxx/xxx/4" style="text-decoration:none;"><u>4</u></a> <a href="/xxx/xxx/5" style="text-decoration:none;"><u>5</u></a>  <a href="/xxx/xxx/2" style="text-decoration:none;" title="Next">&gt;</a> </td><td width=160 valign=top><img src=http://cdn.boardhost.com/invisible.gif width=1 height=35><br><script type="text/javascript"><!--
103 google_ad_client = "pub-9639136181134974";
104 /* 160x600, created 10/3/08 */
105 google_ad_slot = "5681294101";
106 google_ad_width = 160;
107 google_ad_height = 600;
108 //-->
109 </script>
110 <script type="text/javascript"
111 src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
112 </script>
113 </td></tr></table>                </div>
114                 <!-- END Post -->
115 
116                 <div class="cl">&nbsp;</div>
117                 <h1 id="logo2"><a href="/index.html">Flag Counter</a></h1>
118                 <div class="cl">&nbsp;</div>
119             </div>
120             <!-- END News -->
121         </div>
122         <!-- END Content -->
123     </div>
124     <!-- END Shell -->
125     <!-- Footer -->
126     <div id="footer">
127         <div class="shell">
128             <div class="cl">&nbsp;</div>
129             <p class="left"><a href="/contact.html">Contact</a>  |  <a href="/terms.html">Terms of Service</a> | <a href="/privacy.html">Privacy Policy</a> | &copy; <a href=https://boardhost.com>Boardhost.com, Inc.</a>      
130             </p>
131             <p class="right">
132             <font color=#ffffff>This product includes GeoLite2 data created by MaxMind, available from <a href="http://www.maxmind.com"><font color=#ffffff><u>http://www.maxmind.com/</u></font></a>.</font>
133             </p>
134             <div class="cl">&nbsp;</div>
135         </div>
136     </div>
137     <!-- END Footer -->
138 </body>
139 </html>
View Code

代码如下:

 1 import requests
 2 from bs4 import BeautifulSoup
 3 
 4 def getHtml():
 5     '''
 6     爬取页面中的数据
 7     :return: 
 8     '''
 9     urlTimes = 'http://xxx' #url 地址
10     resTimes = requests.get(urlTimes)
11     if resTimes.status_code == 200:
12         resTimes.encoding = 'utf-8'
13         soupTimes = BeautifulSoup(resTimes.text, 'html.parser')
14         # print(soup.prettify())#打印页面内容-更改格式
15         # print(soup.find_all('b')[2])
16         # 抓取的数据1
17         totalTimes = soupTimes.find_all('b')[2]
18         # 抓取的数据2
19         perTimes = soupTimes.find_all('font', attrs={"face": "arial"})
20         # 取数据1
21         dataTotalTimes = totalTimes.string.replace('This counter has been viewed ', '').replace('
 times by',
22                                                    '').replace(' visitors!','').replace(',', '').split(' ')
23         # 去除list中的 u
24         dataTotalTimes = [dataTotalTimes[0].encode("utf-8"), dataTotalTimes[1].encode("utf-8")]
25         # print(dataTotalTimes)
26         # 取数据2
27         dataPerDate, dataPerVisit, dataPerViews = [], [], []
28         for i in perTimes[6:96:3]:
29             dataPerDate.insert(0,i.string.encode("utf-8"))
30         for i in perTimes[7:97:3]:
31             dataPerVisit.insert(0,i.string.encode("utf-8"))
32         for i in perTimes[8:98:3]:
33             dataPerViews.insert(0,i.string.encode("utf-8"))
34             # print(dataPerDate)
35             # print(dataPerVisit)
36             # print(dataPerViews)
37     else:
38         dataTotalTimes = ['21134', '11397']
39         dataPerDate = ['February 8, 2020', 'February 9, 2020', 'February 10, 2020', 'February 11, 2020',
40                        'February 12, 2020', 'February 13, 2020', 'February 14, 2020', 'February 15, 2020',
41                        'February 16, 2020', 'February 17, 2020', 'February 18, 2020', 'February 19, 2020',
42                        'February 20, 2020', 'February 21, 2020']
43         dataPerVisit = ['43', '68', '128', '146', '150', '139', '111', '82', '63', '164', '146', '172', '166', '136']
44         dataPerViews = ['50', '82', '165', '224', '217', '207', '147', '128', '69', '219', '192', '219', '321', '237']
45     return [dataTotalTimes,dataPerDate,dataPerVisit,dataPerViews]
View Code
原文地址:https://www.cnblogs.com/whycai/p/12347196.html