Python入门笔记第五天实战：原生爬虫

from urllib import request
class Spider():
    """Minimal crawler (step 1): download the HTML of the category page."""

    url = 'https://www.panda.tv/cate/lol'  # category page to download

    def __fetch__content(self):
        """Download ``Spider.url`` and return the page as text.

        Returns:
            str: the response body decoded as UTF-8.
        """
        # The context manager closes the response even if read() raises.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # read() yields bytes, not text
        return str(raw, encoding='utf-8')

    def go(self):
        """Entry point of the crawler: fetch the page content."""
        self.__fetch__content()

# Create the crawler and run it.
spider=Spider()
spider.go()

二,

import re

from urllib import request

class Spider():
    """Crawler (step 2): download the page and cut out the video-info blocks."""

    url = 'https://www.panda.tv/cate/lol'  # category page to download

    # [\s\S] matches any character including newlines, so one block may span
    # several lines; the lazy *? stops at the first closing </div>.
    root_pattern = r'<div class="video-info">[\s\S]*?</div>'

    def __fetch__content(self):
        """Download ``Spider.url`` and return the page as text.

        Returns:
            str: the response body decoded as UTF-8.
        """
        # The context manager closes the response even if read() raises.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # read() yields bytes, not text
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Return every ``<div class="video-info">…</div>`` fragment in *htmls*.

        Args:
            htmls (str): the page source to search.

        Returns:
            list[str]: the matched fragments, in document order; empty when
            nothing matches.
        """
        return re.findall(Spider.root_pattern, htmls)

    def go(self):
        """Entry point of the crawler: fetch the page, then analyse it."""
        htmls = self.__fetch__content()
        self.__analysis(htmls)

# Create the crawler and run it.
spider=Spider()

spider.go()

三

'''
This is mob
'''
import re
from urllib import request
'''
This is class
'''
class Spider():
    """Crawler (final step): scrape a category page and print a ranking.

    The pipeline run by :meth:`go` is: fetch the page, extract a name and a
    number for every ``video-info`` block, clean the values, sort by the
    number in descending order and print the result.
    """

    url = 'https://www.panda.tv/cate/hearthstone?pdt=1.c_lol.psbar-ca1.2.2g65c8rk97k'

    # [\s\S] matches any character including newlines, so a capture may span
    # several lines; the lazy *? stops at the first closing tag.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch__content(self):
        """Download ``Spider.url`` and return the page as text.

        Returns:
            str: the response body decoded as UTF-8.
        """
        # The context manager closes the response even if read() raises.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # read() yields bytes, not text
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches of every video-info block.

        Args:
            htmls (str): the page source to search.

        Returns:
            list[dict]: one ``{'name': [...], 'number': [...]}`` dict per
            block; each value is the (possibly empty) list of regex matches.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, block),
                'number': re.findall(Spider.number_pattern, block),
            }
            for block in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Reduce each raw entry to a single cleaned name and number.

        Entries whose name or number list is empty are skipped instead of
        raising ``IndexError``.

        Args:
            anchors (iterable[dict]): entries as produced by ``__analysis``.

        Returns:
            list[dict]: ``{'name': str, 'number': str}`` entries; the name has
            surrounding whitespace and newlines stripped.
        """
        return [
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return *anchors* sorted by their numeric value, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an entry's textual number into a float sort key.

        The number is text such as ``'320'`` or ``'1.5万'``. The first decimal
        number in it is parsed, and multiplied by 10000 when the text
        contains ``万`` (ten thousand).

        Args:
            anchor (dict): an entry with a ``'number'`` string.

        Returns:
            float: the numeric value, or ``0.0`` when the text has no digits.
        """
        # Match the fractional part as well, so '1.5万' is read as 1.5, not 1.
        found = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank<N>:<name>   :<number>`` line per entry."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank) + ':' + anchor['name'] + '   ' + ':' + anchor['number'])

    def go(self):
        """Entry point: fetch, analyse, refine, sort and print the ranking."""
        htmls = self.__fetch__content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Create the crawler and run the full fetch -> parse -> sort -> print pipeline.
spider=Spider()
spider.go()

Python入门 笔记 第五天 实战：原生爬虫

Python入门笔记第五天实战：原生爬虫