Python Beginner Study Notes 11: A Native Web Crawler

"""
A native crawler (built with the standard library only)

Before crawling:
    Be clear about the goal
    Find the web page that holds the data
    Analyse the page structure and locate the tags that contain the data

    Simulate an HTTP request, send it to the server, and get back the HTML it returns
    Use regular expressions to extract the data we want (streamer name and popularity)

Reference:
https://blog.csdn.net/qq_38151401/article/details/93018656

Approach:

(1) Fetch the page content
(2) Work out the format of the data to extract
(3) Extract that data
(4) Convert the data into the format we need
(5) Display the result

(A tiny, self-contained regex sketch of steps (2)-(4) follows this block.)
"""
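Before the full script, here is a minimal sketch of steps (2)-(4): pulling fields out of an HTML fragment with a non-greedy regex. The fragment and the streamer name in it are made up for illustration; only the class names mirror the real page.

import re

# A made-up fragment shaped like the real markup (for illustration only)
sample = '<i class="nick" title="SomeStreamer">SomeStreamer</i><i class="js-num">12.3万</i>'

# ([\s\S]*?) is a non-greedy group that matches any character, including newlines
name = re.findall(r'<i class="nick" title="([\s\S]*?)">', sample)
number = re.findall(r'<i class="js-num">([\s\S]*?)</i>', sample)
print(name, number)   # ['SomeStreamer'] ['12.3万']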
# Example: a native crawler that scrapes Huya's 王者荣耀 (Honor of Kings) section and ranks streamers by popularity
# Crawler frameworks to explore later: BeautifulSoup, Scrapy (a BeautifulSoup sketch follows the sample HTML at the end)
# Crawler vs. anti-crawler vs. anti-anti-crawler: a crawler's IP is easily banned, hence proxy IP pools (see the proxy sketch at the end)
import re
from urllib import request
import ssl

# Debugging tip: set breakpoints to inspect the intermediate values
class Spider():
    # Target URL and the extraction patterns
    url = 'https://www.huya.com/g/wzry'                      # page the crawler fetches
    root_pattern = r'<span class="txt">([\s\S]*?)</li>'      # block holding one streamer's info
    # root_pattern2 = r'<li class="game-live-item"[\s\S]*?</li>'
    name_pattern = r'<i class="nick" title="([\s\S]*?)">'    # streamer name (regex)
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'     # popularity count (regex)

    # Fetch the page's HTML
    def __fetch_content(self):
        ssl._create_default_https_context = ssl._create_unverified_context  # skip SSL certificate verification
        r = request.urlopen(Spider.url)        # open the URL
        htmls = r.read()                       # read the raw bytes
        htmls = str(htmls, encoding='utf-8')   # decode into a readable string
        return htmls

    # Parse the HTML: grab each streamer block with the root regex, then pull the name and popularity out of it
    def __analysis(self, htmls):
        root_html = re.findall(Spider.root_pattern, htmls)
        # root_html2 = re.findall(Spider.root_pattern2, htmls)
        anchors = []
        for html in root_html:
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchor = {'name': name, 'number': number}  # e.g. {'name': ['Dae-心态'], 'number': ['473.4万']}
            anchors.append(anchor)
        return anchors

    # Clean up each record: unwrap the single-element lists and strip stray whitespace
    def __refine(self, anchors):
        refine = lambda anchor: {
            'name': anchor['name'][0].strip(),
            'number': anchor['number'][0]
        }
        return map(refine, anchors)

    # Sort by popularity, highest first
    def __sort(self, anchors):
        anchors = sorted(anchors, key=self.__sort_seed, reverse=True)
        return anchors

    # Sort key: turn a string such as '473.4万' or '1,404.5万' into a plain number
    def __sort_seed(self, anchor):
        number = float(anchor['number'].replace(',', '').replace('万', ''))
        if '万' in anchor['number']:  # '万' means ten thousand
            number *= 10000
        return number

    # Print the ranking
    def __show(self, anchors):
        for rank in range(len(anchors)):
            print('rank  ' + str(rank + 1)
                  + ' : ' + anchors[rank]['name']
                  + '    ' + anchors[rank]['number'])

    # Public entry point that drives the private methods
    def go(self):
        htmls = self.__fetch_content()          # fetch the page's HTML
        anchors = self.__analysis(htmls)        # parse it into a list of records
        anchors = list(self.__refine(anchors))  # clean each record up
        anchors = self.__sort(anchors)          # sort by popularity
        self.__show(anchors)                    # print the ranking


spider = Spider()
spider.go()

"""
Sample of the page HTML the regexes run against; root_pattern matches from <span class="txt"> to the closing </li>:

      <li class="game-live-item" gid="2336" data-lp="1259515661837">
        <a href="https://www.huya.com/688" class="video-info " target="_blank">
        <img class="pic" data-original="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" src="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" data-default-img="338x190" alt="张大仙的直播" title="张大仙的直播">
                                <em class="tag tag-recommend">超级明星</em>
        <div class="item-mask"></div>
        <i class="btn-link__hover_i"></i>
        <p class="tag-right">
            <!-- 手机开播 -->
            <!-- VR直播 -->
                <!-- 无损音质 || 蓝光 -->
                                    <em class="tag-blue">蓝光8M</em>
                    </p>
    </a>
    <a href="https://www.huya.com/688" class="title" title="大仙来啦" target="_blank">大仙来啦</a>
    <span class="txt">
        <span class="avatar fl">
            <img data-original="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" data-default-img="84x84" alt="张大仙" title="张大仙">
            <i class="nick" title="张大仙">张大仙</i>
        </span>
                <span class="num">
                <i class="num-icon"></i>
                <i class="js-num">1,404.5万</i></span>
    </span>
</li>
"""
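As the comment at the top of the script notes, BeautifulSoup and Scrapy are the usual step up from hand-written regexes. A minimal sketch of the same extraction with BeautifulSoup, assuming beautifulsoup4 is installed and that the server response still contains the txt / nick / js-num class names (pages like this may also be rendered by JavaScript, in which case the raw response would not include them):

from bs4 import BeautifulSoup
from urllib import request

html = request.urlopen('https://www.huya.com/g/wzry').read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')

anchors = []
for item in soup.select('span.txt'):          # the same node that root_pattern targets
    nick = item.select_one('i.nick')
    num = item.select_one('i.js-num')
    if nick and num:
        anchors.append({'name': nick.get('title', '').strip(), 'number': num.text})
print(anchors[:3])

The CSS selectors replace both the outer root_pattern and the two inner regexes, and they keep working even if attribute order or whitespace in the markup changes.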
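The other comment up top mentions that a crawler's IP is easily banned and points at proxy IP pools. With the standard library alone, requests can be routed through a proxy via urllib.request.ProxyHandler; the address below is a placeholder for illustration, not a real proxy:

from urllib import request

# Placeholder address; substitute an entry from your own proxy pool
proxy_handler = request.ProxyHandler({'https': 'http://127.0.0.1:8080'})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)   # after this, request.urlopen() (as used in Spider) goes through the proxy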
Original post: https://www.cnblogs.com/liuxiaoming123/p/13375309.html