人工智能—爬虫

抓取图片

# 导包
import requests
import re
from lxml import etree
import os


# 定义请求类
# Scraper class: fetches the pearvideo.com index page and downloads the
# preview images referenced from inline style attributes.
class PearVideo(object):

  # Fetch a page, caching the HTML to a local file.
  # url:  page address to request
  # type: 'index' caches to test_pear.html, anything else to inner_pear.html
  # Returns the page HTML as a str (from the network or from the cache).
  def get_countent(self, url, type):

    if type == 'index':
      file_name = 'test_pear.html'
    else:
      file_name = 'inner_pear.html'

    # Only hit the network when the selected cache file is missing.
    # BUG FIX: the original always tested 'test_pear.html' regardless of
    # which cache file was chosen above.
    if not os.path.exists(file_name):

      # Send the HTTP request
      r = requests.get(url)

      # Decode the response body
      html = r.content.decode("utf-8")
      print(html)

      # Cache to disk with an explicit encoding
      with open('./' + file_name, 'w', encoding='utf-8') as f:
        f.write(html)
      return html

    else:
      # Cache hit: read the stored file and return its contents
      with open('./' + file_name, encoding='utf-8') as f:
        contents = f.read()
      return contents

  # Extract the image URLs from the index-page HTML and download them.
  # html: raw HTML text as returned by get_countent
  def get_xpath(self, html):
    # BUG FIX: xpath() only exists on a parsed tree, not on a str, so the
    # HTML must be converted first (the companion video scraper does this).
    html = etree.HTML(html)

    # Match the inline style attributes that carry the image URLs
    html_data_img = html.xpath('//div[@class="img"]/@style')
    print(html_data_img)

    # Pull the bare URL out of each style value.
    # BUG FIX: the parentheses must be escaped — unescaped they create two
    # regex groups, so findall() returned tuples instead of URL strings.
    regex = re.compile(r'background-image: url\((.+?)\);')
    img_list = []
    for item in html_data_img:
      matches = regex.findall(item)
      # Skip style values that carry no background image (avoids IndexError)
      if matches:
        img_list.append(matches[0])

    # Download every image.
    # BUG FIX: the original wrote one fixed filename (each download
    # overwrote the last) and called exit(-1) after the first image.
    for index, item in enumerate(img_list):
      r = requests.get(item)
      with open('./test_pear_{}.png'.format(index), 'wb') as f:
        f.write(r.content)

    print(img_list)
 

if __name__ == "__main__":
    # Create the scraper
    pearvideo = PearVideo()
    # Fetch (or load the cached) index page.
    # BUG FIX: the original passed the *builtin* `type` instead of the
    # string 'index', so the 'index' branch never ran and the wrong
    # cache file name was selected.
    html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
    # print(html)
    pearvideo.get_xpath(html)
 


抓取视频

# 导包
import requests
import re
from lxml import etree
import os


# 定义请求类
# Scraper class: fetches the pearvideo.com index page, follows one inner
# (detail) page, and downloads the video it references.
class PearVideo(object):

  # Fetch a page, caching the HTML to a local file.
  # url:  page address to request
  # type: 'index' caches to test_pear.html, anything else to inner_pear.html
  # Returns the page HTML as a str (from the network or from the cache).
  def get_countent(self, url, type):

    if type == 'index':
      file_name = 'test_pear.html'
    else:
      file_name = 'inner_pear.html'

    # Only hit the network when the selected cache file is missing.
    # BUG FIX: the original always tested 'test_pear.html' regardless of
    # which cache file was chosen above.
    if not os.path.exists(file_name):

      # Send the HTTP request
      r = requests.get(url)

      # Decode the response body
      html = r.content.decode("utf-8")
      print(html)

      # Cache to disk with an explicit encoding
      with open('./' + file_name, 'w', encoding='utf-8') as f:
        f.write(html)
      return html

    else:
      # Cache hit: read the stored file and return its contents
      with open('./' + file_name, encoding='utf-8') as f:
        contents = f.read()
      return contents

  # Parse the index page, follow one inner page and download its video.
  # html: raw HTML text as returned by get_countent
  def get_xpath(self, html):
    # Parse the raw HTML text into an element tree
    html = etree.HTML(html)

    # Match the links to the inner (detail) pages
    html_data_url = html.xpath("//div[@class='actcontbd']/a/@href")
    print(html_data_url)

    # Build absolute inner-page URLs from the relative hrefs
    url_list = []
    for item in html_data_url:
      item = 'https://www.pearvideo.com/' + item
      url_list.append(item)
    print(url_list)

    # Fetch one inner page (the 9th link, as in the original;
    # assumes the index page yields at least 9 links)
    url_page = url_list[8]
    inner_html = self.get_countent(url_page, 'inner')

    # The real video address only appears inside a JS assignment, not in
    # any tag, so it is extracted with a regular expression.
    regex = re.compile('srcUrl="(.+?)"')
    print(regex.findall(inner_html))

    # Download the video; append mode so repeated runs accumulate bytes
    r = requests.get(regex.findall(inner_html)[0])
    with open("./test_pear.mp4", 'ab') as f:
      # BUG FIX: this write was not indented under the with-block in the
      # original, which was a syntax error.
      f.write(r.content)



if __name__ == "__main__":
  # Create the scraper
  pearvideo = PearVideo()
  # Fetch (or load the cached) index page.
  # BUG FIX: the original passed the *builtin* `type` instead of the
  # string 'index', so the 'index' branch never ran and the wrong
  # cache file name was selected.
  html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
  # print(html)
  pearvideo.get_xpath(html)
 

多线程爬虫结构

# 导包
import threading
import requests
import time

# Thread container: holds the worker threads built below so the main
# block can start and join them.
threads = []


# 定义计时器方法
# Timer helper used to stamp the start/progress/end messages below.
def get_time():
  """Return the current local time as a human-readable string."""
  return time.ctime()

# 定义抓取方法
# Fetch helper: request the given URL and report the HTTP status code.
def get_content(url):
  """GET *url* and print the response status code."""
  response = requests.get(url)
  print(response.status_code)


# 定义多线程
# Prepare 20 worker threads, each fetching the same page once.
for _ in range(20):
  worker = threading.Thread(
      target=get_content,
      args=('https://www.pearvideo.com',),
  )
  threads.append(worker)



if __name__ == "__main__":
  print('开始于:%s' % get_time())

  # Start every prepared thread asynchronously.
  for t in threads:
    # Daemon threads are killed when the main thread exits, so each one
    # must be joined below.  (`t.daemon = True` replaces the deprecated
    # setDaemon() call.)
    t.daemon = True
    t.start()

  # Timestamp after all threads were launched
  print('这一个执行到:%s' % get_time())

  # BUG FIX: the original called t.join() once after the loop, which
  # only joined the *last* thread — the other daemon workers could be
  # killed before their request finished.
  for t in threads:
    t.join()

  print('结束于:%s' % get_time())
 
 
 
多线程爬取视频
# 导入requests网络请求模块
import requests
# 导入lxml标签匹配模块
from lxml import etree
# 导入re 正则匹配模块
import re
#导入系统路径模块
import os
# 导入进程模块
import multiprocessing
import threading
 
 
# Holds the detail-page video URLs collected by Pipa().
mylist = []

# 请求函数
# Request helper: fetch *url* and hand back the raw response body.
def Data(url):
  """GET *url* and return the response body as bytes."""
  resp = requests.get(url)
  # Return the body as a binary stream
  return resp.content

# 匹配标签函数
# Collect the detail-page URLs of the category listing into `mylist`.
def Pipa():
  """Scrape category_10 and append each video's detail URL to mylist."""
  # Fetch the category page via the shared request helper
  page = Data('https://www.pearvideo.com/category_10')

  # Parse the raw bytes into an element tree
  tree = etree.HTML(page)

  # Every <li> under the category list holds one video entry
  for node in tree.xpath('//*[@id="categoryList"]/li'):
    # Build the absolute detail-page address and record it
    href = str(node.xpath('./div/a/@href')[0])
    mylist.append('https://www.pearvideo.com/' + href)

# 定义好写入方法
# Download the video behind one detail-page URL into the video/ folder.
# url: absolute address of a pearvideo detail page
def xiangqing(url):

    # Use the last URL segment as the local file name
    name = str(url).split('/')[-1] + '.mp4'
    print(name)

    # Fetch the detail page
    res = Data(url)

    # The real video address lives inside inline JavaScript, not in any
    # tag, so xpath cannot reach it; match the srcUrl assignment instead.
    url = re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",videoCDN="//video.pearvideo.com";',
str(res))[0]

    # Fetch the video bytes themselves
    res = Data(url)

    # Target directory for downloads
    path = "video/"

    # BUG FIX: os.makedirs() was not indented under the if in the
    # original, which was a syntax error.
    if not os.path.exists(path):
        # Create the directory on first use
        os.makedirs(path)

    # Write the video to disk
    with open(path + name, "wb") as f:
      f.write(res)


if __name__ == '__main__':
  # Build the list of detail-page URLs
  Pipa()

  # BUG FIX: the original joined each thread immediately after starting
  # it, which serialised the downloads and defeated the threading.
  # Start all workers first, then wait for all of them.
  workers = []
  for i in mylist:
    t = threading.Thread(target=xiangqing, args=(i,))
    t.start()
    workers.append(t)
  for t in workers:
    t.join()
 
原文地址:https://www.cnblogs.com/chengdongzi/p/10490744.html