from urllib import request
class Spider():
    """Minimal crawler that downloads the category page stored in ``url``."""

    # Address of the category listing page to download.
    url = 'https://www.panda.tv/cate/lol'

    def __fetch__content(self):
        """Download ``url`` and return the page body as text.

        Returns:
            The response body decoded as UTF-8.

        The response is opened in a ``with`` block so the connection is
        released even if reading or decoding raises.
        """
        with request.urlopen(self.url) as response:
            raw = response.read()  # bytes, exactly as sent by the server
        return str(raw, encoding='utf-8')

    def go(self):
        """Entry point of the crawler: fetch the page once."""
        self.__fetch__content()
# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
# Part 2 (二): extract the video-info blocks from the page with a regular expression.
import re
from urllib import request
class Spider():
    """Crawler that downloads the category page and extracts its video-info blocks."""

    # Address of the category listing page to download.
    url = 'https://www.panda.tv/cate/lol'
    # One complete <div class="video-info"> element. ``[\s\S]*?`` is a
    # non-greedy run of any character, newlines included; the raw string keeps
    # the backslashes that the regex needs.
    root_pattern = r'<div class="video-info">[\s\S]*?</div>'

    def __fetch__content(self):
        """Download ``url`` and return the page body decoded as UTF-8 text.

        The response is opened in a ``with`` block so the connection is
        released even if reading or decoding raises.
        """
        with request.urlopen(self.url) as response:
            raw = response.read()  # bytes, exactly as sent by the server
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Return every video-info element found in ``htmls``.

        Args:
            htmls: The page source as text.

        Returns:
            A list of the matched ``<div class="video-info">...</div>``
            substrings, in document order (empty when nothing matches).
        """
        return re.findall(self.root_pattern, htmls)

    def go(self):
        """Entry point of the crawler: fetch the page, then analyse it."""
        htmls = self.__fetch__content()
        self.__analysis(htmls)
# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
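# Part 3 (三): parse the streamer names and viewer counts, clean them, sort them and print the ranking.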
"""Rank the streamers listed on one category page by their viewer count."""
import re
from urllib import request


class Spider():
    """Scrape a category page and print its streamers ordered by viewer count.

    ``go`` runs the whole pipeline: fetch -> analyse -> refine -> sort -> show.
    """

    # Address of the category listing page to download.
    url = 'https://www.panda.tv/cate/hearthstone?pdt=1.c_lol.psbar-ca1.2.2g65c8rk97k'
    # ``[\s\S]*?`` is a non-greedy run of any character, newlines included.
    # Raw strings keep the backslashes that the regexes need.
    # Content of one <div class="video-info"> element.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Streamer name: the text between </i> and the next </span>.
    name_pattern = r'</i>([\s\S]*?)</span>'
    # Viewer-count label, e.g. "3200" or "1.5万".
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch__content(self):
        """Download ``url`` and return the page body decoded as UTF-8 text.

        The response is opened in a ``with`` block so the connection is
        released even if reading or decoding raises.
        """
        with request.urlopen(self.url) as response:
            raw = response.read()  # bytes, exactly as sent by the server
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Collect the raw name/number matches of every video-info block.

        Args:
            htmls: The page source as text.

        Returns:
            A list with one ``{'name': [...], 'number': [...]}`` dict per
            video-info block; each value is the list returned by
            ``re.findall`` and may be empty.
        """
        return [
            {
                'name': re.findall(self.name_pattern, html),
                'number': re.findall(self.number_pattern, html),
            }
            for html in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Lazily reduce each raw entry to ``{'name': str, 'number': str}``.

        The first name match is stripped of surrounding whitespace and the
        first number match is kept as is. Entries in which either list is
        empty are skipped rather than raising ``IndexError``.

        Returns:
            An iterator over the cleaned dicts.
        """
        return (
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        )

    def __sort(self, anchors):
        """Return the entries as a list, highest viewer count first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an entry's viewer-count label into a number for sorting.

        The first decimal number in the label is parsed as a float (the label
        may carry a fractional part, e.g. "1.5万") and multiplied by 10000
        when the label contains the unit "万". A label without any digits
        sorts as ``0.0``.
        """
        text = anchor['number']
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank<N>:<name> :<number>`` line per entry, N from 1."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank) + ':' + anchor['name'] + ' ' + ':' + anchor['number'])

    def go(self):
        """Entry point of the crawler: run the full pipeline once."""
        htmls = self.__fetch__content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()