使用Python自带的库和正则表达式爬取熊猫直播主播观看人气

  • 主要是体现代码的规范性
    from urllib import request
    import re
    
    
    class Spider():
        """Scrape a panda.tv category page and rank streamers by viewer count.

        The page is downloaded with ``urllib``, each stream's info block is
        extracted with regular expressions, the viewer counts are converted to
        numbers, and the streamers are printed from most to least watched.
        Call :meth:`go` to run the whole pipeline.
        """

        # Category page whose streams are ranked.
        url = 'https://www.panda.tv/cate/lol'
        # ``[\s\S]*?`` lazily matches any character, newlines included, so a
        # capture can span several lines of markup ('.' would stop at '\n').
        root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
        name_pattern = r'</i>([\s\S]*?)</span>'
        number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

        def __fetch_content(self):
            """Download ``url`` and return the page decoded as UTF-8 text."""
            # The context manager closes the HTTP response even if read() fails.
            with request.urlopen(self.url) as response:
                raw = response.read()
            return str(raw, encoding='utf-8')

        def __analysis(self, htmls):
            """Extract the raw name/number matches from every info block.

            Returns a list of dicts ``{"name": [...], "number": [...]}`` whose
            values are the (possibly empty) lists returned by ``re.findall``.
            """
            return [
                {
                    "name": re.findall(self.name_pattern, block),
                    "number": re.findall(self.number_pattern, block),
                }
                for block in re.findall(self.root_pattern, htmls)
            ]

        def __refine(self, anchors):
            """Reduce each raw match to ``{"name": str, "number": str}``.

            The first match of each field is kept and stripped of surrounding
            whitespace. Blocks in which either field was not found are skipped
            rather than raising ``IndexError``.
            """
            refined = []
            for anchor in anchors:
                names = anchor['name']
                numbers = anchor['number']
                if not names or not numbers:
                    continue
                refined.append({
                    'name': names[0].strip(),
                    'number': numbers[0].strip(),
                })
            return refined

        def __sort(self, anchors):
            """Return ``anchors`` ordered from most to least viewers."""
            return sorted(anchors, key=self.__sort_seed, reverse=True)

        def __sort_seed(self, anchor):
            """Return the numeric viewer count encoded in ``anchor['number']``.

            The first decimal number in the text is parsed and multiplied by
            10000 when the text contains the unit '万'. Text without any digits
            yields ``0.0`` so that such entries sort last instead of crashing.
            """
            text = anchor['number']
            # Keep the fractional part: "1.5万" must become 15000, not 10000.
            found = re.search(r'\d+(?:\.\d+)?', text)
            if found is None:
                return 0.0
            number = float(found.group())
            if '万' in text:
                number *= 10000
            return number

        def __show(self, anchors):
            """Print one ranking line per streamer, starting at rank 1."""
            for rank, anchor in enumerate(anchors, start=1):
                print("排名:" + str(rank) + "  主播:" + anchor['name'] +
                      "--------" + "观看人数:" + anchor['number'])

        def go(self):
            """Fetch, parse, rank and print the streamers, then their count."""
            htmls = self.__fetch_content()
            anchors = self.__analysis(htmls)
            anchors = list(self.__refine(anchors))
            anchors = self.__sort(anchors)
            self.__show(anchors)
            print(len(anchors))
    
    
    # Run the scraper only when this file is executed as a script, so that
    # importing the module does not trigger a network request.
    if __name__ == "__main__":
        spider = Spider()
        spider.go()
    

      

原文地址:https://www.cnblogs.com/longbigbeard/p/10473411.html