# ---- Section 1: scraping images (抓取图片) ----
# 导包
import requests
import re
from lxml import etree
import os
# 定义请求类
class PearVideo(object):
    """Scraper for pearvideo.com thumbnail images with a simple file cache."""

    def get_countent(self, url, type):
        """Fetch the HTML of `url`, caching it to a local file.

        type == 'index' caches to test_pear.html, anything else to
        inner_pear.html.  Returns the page HTML as a string.
        """
        if type == 'index':
            file_name = 'test_pear.html'
        else:
            file_name = 'inner_pear.html'
        # BUG FIX: the cache check must test the file we actually read/write,
        # not the hard-coded index file name.
        if not os.path.exists(file_name):
            # Send the HTTP request
            r = requests.get(url)
            # Decode the body explicitly as UTF-8
            html = r.content.decode("utf-8")
            # Write the cache file with an explicit encoding
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
            return html
        else:
            # Cache hit: return the saved page
            with open('./' + file_name, encoding='utf-8') as f:
                return f.read()

    def get_xpath(self, html):
        """Extract thumbnail URLs from the page HTML and download each image."""
        # BUG FIX: the raw HTML string must be parsed before .xpath() exists.
        html = etree.HTML(html)
        # The image URL is embedded in the div's inline style attribute.
        html_data_img = html.xpath('//div[@class="img"]/@style')
        # BUG FIX: the literal parentheses of url(...) must be escaped;
        # the original pattern created two nested groups, so findall()
        # returned tuples instead of URL strings.
        regex = re.compile(r"background-image: url\((.+?)\);")
        img_list = []
        for item in html_data_img:
            found = regex.findall(item)
            if found:
                img_list.append(found[0])
        # BUG FIX: download every image to its own numbered file; the
        # original wrote them all to test_pear.png and then called
        # exit(-1), killing the process after the first image.
        for index, img_url in enumerate(img_list):
            r = requests.get(img_url)
            with open('./test_pear_%d.png' % index, 'wb') as f:
                f.write(r.content)
        print(img_list)
if __name__ == "__main__":
    # Instantiate the scraper
    pearvideo = PearVideo()
    # BUG FIX: the original passed the BUILTIN `type` as the type argument,
    # so the 'index' branch (and its cache file name) never matched.
    html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
    pearvideo.get_xpath(html)
# ---- Section 2: scraping a video (抓取视频) ----
# 导包
import requests
import re
from lxml import etree
import os
# 定义请求类
class PearVideo(object):
    """Scraper for pearvideo.com videos with a simple file cache."""

    def get_countent(self, url, type):
        """Fetch the HTML of `url`, caching it to a local file.

        type == 'index' caches to test_pear.html, anything else to
        inner_pear.html.  Returns the page HTML as a string.
        """
        if type == 'index':
            file_name = 'test_pear.html'
        else:
            file_name = 'inner_pear.html'
        # BUG FIX: the cache check must test the file we actually read/write,
        # not the hard-coded index file name.
        if not os.path.exists(file_name):
            # Send the HTTP request
            r = requests.get(url)
            # Decode the body explicitly as UTF-8
            html = r.content.decode("utf-8")
            # Write the cache file with an explicit encoding
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
            return html
        else:
            # Cache hit: return the saved page
            with open('./' + file_name, encoding='utf-8') as f:
                return f.read()

    def get_xpath(self, html):
        """Find detail-page links in `html`, fetch one and download its video."""
        # Parse the raw HTML so xpath queries work
        html = etree.HTML(html)
        # Collect the relative links of the video detail pages
        html_data_url = html.xpath("//div[@class='actcontbd']/a/@href")
        url_list = ['https://www.pearvideo.com/' + item for item in html_data_url]
        print(url_list)
        # BUG FIX: guard the hard-coded index so a short (or empty) result
        # list does not raise IndexError.
        if len(url_list) <= 8:
            print('not enough detail pages found:', len(url_list))
            return
        url_page = url_list[8]
        # Fetch the detail page (cached as inner_pear.html)
        inner_html = self.get_countent(url_page, 'inner')
        # The real video address lives in inline JavaScript (srcUrl="...")
        regex = re.compile('srcUrl="(.+?)"')
        matches = regex.findall(inner_html)
        # BUG FIX: guard against a missing match instead of IndexError.
        if not matches:
            print('no video URL found on', url_page)
            return
        # Download the video; append mode so partial runs accumulate
        r = requests.get(matches[0])
        with open("./test_pear.mp4", 'ab') as f:
            f.write(r.content)
if __name__ == "__main__":
    # Instantiate the scraper
    pearvideo = PearVideo()
    # BUG FIX: the original passed the BUILTIN `type` as the type argument,
    # so the 'index' branch (and its cache file name) never matched.
    html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
    pearvideo.get_xpath(html)
# ---- Section 3: multithreaded crawler skeleton (多线程爬虫结构) ----
# 导包
import threading
import requests
import time
# Module-level list that collects the worker Thread objects created below
threads = []
# Timestamp helper used by the progress prints below.
def get_time():
    """Return the current wall-clock time as a human-readable string."""
    return time.ctime()
# Worker body: fetch one URL and report the HTTP status.
def get_content(url):
    """GET `url` and print the response status code (runs in a thread)."""
    # ROBUSTNESS: a timeout keeps a dead server from hanging the worker
    # thread forever (requests.get blocks indefinitely without one).
    r = requests.get(url, timeout=10)
    print(r.status_code)
# Build 20 worker threads, all pointed at the same fetch function;
# they are started later inside the __main__ guard.
for _ in range(20):
    worker = threading.Thread(target=get_content,
                              args=('https://www.pearvideo.com',))
    threads.append(worker)
if __name__ == "__main__":
    print('开始于:%s' % get_time())
    # BUG FIX: the original called t.join() inside the same loop as
    # t.start(), blocking on each thread before starting the next one —
    # which serialises the requests and defeats the point of threading.
    # Start every thread first, then join them all.
    for t in threads:
        # Daemon threads die with the main thread; setDaemon() is
        # deprecated in favour of the .daemon attribute.
        t.daemon = True
        t.start()
        # Progress timestamp for each launched thread
        print('这一个执行到:%s' % get_time())
    # Wait for all workers to finish
    for t in threads:
        t.join()
    print('结束于:%s' % get_time())
# ---- Section 4: multithreaded video downloads (多线程爬取视频) ----
# 导入requests网络请求模块
import requests
# 导入lxml标签匹配模块
from lxml import etree
# 导入re 正则匹配模块
import re
#导入系统路径模块
import os
# 导入进程模块
import multiprocessing
import threading
# Module-level list of video detail-page URLs, filled by Pipa()
mylist = []
# Low-level request helper shared by the listing and detail fetches.
def Data(url):
    """GET `url` and return the raw response body as bytes."""
    # ROBUSTNESS: a timeout prevents one stalled request from hanging a
    # worker thread forever (requests.get blocks indefinitely without one).
    test = requests.get(url, timeout=10)
    # Return the undecoded byte stream; callers decode/parse as needed
    return test.content
# Collect every video detail-page URL from the category listing page
# into the module-level `mylist`.
def Pipa():
    """Scrape category_10 and append each detail-page URL to `mylist`."""
    # Download the category page (raw bytes)
    page = Data('https://www.pearvideo.com/category_10')
    # Parse it so xpath queries work
    tree = etree.HTML(page)
    # One <li> per video card in the category list
    cards = tree.xpath('//*[@id="categoryList"]/li')
    # Build the absolute detail-page URL for each card and store it
    mylist.extend(
        'https://www.pearvideo.com/' + str(card.xpath('./div/a/@href')[0])
        for card in cards
    )
# Download the video behind one detail-page URL into ./video/.
def xiangqing(url):
    """Fetch the detail page at `url`, extract the video URL and save the file."""
    # Derive the output file name from the last path segment of the URL
    video_name = str(url).split('/')[-1] + '.mp4'
    print(video_name)
    # Download the detail page; the real video address is embedded in
    # inline JavaScript, so xpath cannot reach it — a regex is used instead
    page = str(Data(url))
    pattern = 'srcUrl="(.*?)",vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",videoCDN="//video.pearvideo.com";'
    video_url = re.findall(pattern, page)[0]
    # Fetch the actual video bytes from the extracted address
    video_bytes = Data(video_url)
    # Make sure the output directory exists before writing
    save_dir = "video/"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Write the video to disk in binary mode
    with open(save_dir + video_name, "wb") as out:
        out.write(video_bytes)
if __name__ == '__main__':
    # Populate mylist with the detail-page URLs
    Pipa()
    # BUG FIX: the original started each thread and immediately joined it,
    # which downloads the videos one at a time.  Start every worker first,
    # then wait for them all.
    workers = []
    for i in mylist:
        t = threading.Thread(target=xiangqing, args=(i,))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()