Web Crawler Data Parsing Methods

1. Data Parsing

1. What data parsing is for

  It enables us to build focused crawlers.

2. Ways to implement data parsing

  1. Regular expressions (re)
  2. bs4
  3. xpath
  4. pyquery

3. The general principle of data parsing

  Question: where is the data that a focused crawler extracts actually stored?

    It is stored inside the relevant tags and in the attributes of those tags.

  1. Locate the tag

  2. Extract its text or its attributes (a minimal sketch follows below)
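
To make these two steps concrete, here is a minimal sketch using bs4 (one of the parsing methods listed above) on a made-up HTML snippet; the tag names and attribute values are purely illustrative:

from bs4 import BeautifulSoup

# A made-up page fragment, for illustration only
html = '<div class="item"><a href="/detail/1">first item</a></div>'
soup = BeautifulSoup(html, "lxml")

a_tag = soup.find("a")   # step 1: locate the tag
print(a_tag.text)        # step 2a: extract its text -> "first item"
print(a_tag["href"])     # step 2b: extract its attribute -> "/detail/1"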

Differences between the requests module and the urllib module

# Scraping an image
# With the requests module
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = "http://www.xiaohuar.com/d/file/20190814/48f590249bcbc6a82aec5bf2a63ae54f.jpg"
img_data = requests.get(url, headers=headers).content  # bytes-type data
with open("tupiao.jpg", "wb") as fp:
    fp.write(img_data)

# With the urllib module
from urllib import request

url = "http://pic25.nipic.com/20121112/9252150_150552938000_2.jpg"
request.urlretrieve(url, filename="./tu.jpg")
# The urllib module cannot spoof the User-Agent

2. Ways to Implement Data Parsing

1. Regular Expressions

Regex reference notes: https://www.cnblogs.com/zangyue/p/12044575.html
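
Before the full crawler below, here is a minimal sketch of how re.findall with the re.S flag pulls an attribute value out of multi-line HTML; the snippet and its tag structure are made up for illustration:

import re

# Made-up HTML fragment spanning multiple lines
page_text = '''
<div class="thumb">
    <img src="//pic.example.com/a.jpg" alt="demo">
</div>
'''

# re.S lets "." also match newlines, so the pattern can span multiple lines
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_src_list = re.findall(ex, page_text, re.S)
print(img_src_list)  # ['//pic.example.com/a.jpg']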

# Scrape all the images from pages 1-3 of the Qiushibaike picture section
# First use a general-purpose crawl to fetch the page source of the first 3 pages

import re
import os

import requests
from urllib import request

dirname = "./imgLibs"

# Create the image directory if it does not exist yet
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.qiushibaike.com/pic/page/%d/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

for page in range(1, 4):
    new_url = format(url % page)
    # Page source of the current page
    page_text = requests.get(url=new_url, headers=headers).text

    # Focused crawling on top of the general crawl
    # (parse the image addresses out of each page's source)
    # Regular expression
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        src = "https:" + src  # the scraped src is protocol-relative (starts with //)
        img_name = src.split('/')[-1]
        img_path = dirname + "/" + img_name
        request.urlretrieve(src, filename=img_path)
        print(img_name, "downloaded successfully")

2. bs4 Parsing

1. How bs4 parsing works

  - Instantiate a BeautifulSoup object and load the page source data to be parsed into that object

  - Call the BeautifulSoup object's methods and attributes to locate tags and extract data

2. Environment setup
pip3 install bs4
pip3 install lxml
3. Instantiating BeautifulSoup
# BeautifulSoup(fp, 'lxml'): load the data of a locally stored HTML document into the instantiated BeautifulSoup object
# BeautifulSoup(page_text, 'lxml'): load page source data fetched from the internet into the instantiated BeautifulSoup object
4. Tag-locating operations
# Tag-locating operations
    Tag locating: soup.tagName  # locates the first occurrence of the tagName tag
    Attribute locating: soup.find("tagName", attrName="value")
    Attribute locating: soup.find_all("tagName", attrName="value")  # the return value is a list
    Selector locating: soup.select("selector")
        Hierarchical selectors: > means one level, a space means multiple levels

# Extracting text
    .string: gets only the direct (immediate) text content
    .text: gets all the text content

# Extracting attributes
    tagName["attrName"]

Example of tag-locating operations:

from bs4 import BeautifulSoup

fp = open("./text.html", "r", encoding="utf-8")
soup = BeautifulSoup(fp, "lxml")

# Locate the first occurrence of a tag: soup.tagName
print(soup.div)

# Attribute locating: soup.find('tagName', attrName='value')
print(soup.find("div", class_="song"))
print(soup.find("a", id="feng"))

# Attribute locating: soup.find_all('tagName', attrName='value')
print(soup.find_all("div", class_="song"))

# Selector locating: soup.select("selector")
print(soup.select("#feng"))
# > means one level
soup.select('.tang > ul > li')
# a space means multiple levels
soup.select('.tang li')

# Extracting text
# .string: gets only the direct text content
a_tag = soup.select("#feng")[0]
print(a_tag.string)
# .text: gets all the text content
div = soup.div
print(div.text)
div = soup.find("div", class_="song")
print(div.text)

# Extracting attributes
a_tag = soup.select("#feng")[0]
print(a_tag["href"])

A crawling example based on bs4

# Requirement
    Scrape the full text of Romance of the Three Kingdoms (chapter titles + chapter content)
    http://www.shicimingju.com/book/sanguoyanyi.html

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

fp = open("./sanguo.txt", "w", encoding="utf-8")
main_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=main_url, headers=headers).text
# Parse out the chapter titles and the URLs of the chapter detail pages
soup = BeautifulSoup(page_text, "lxml")
a_list = soup.select(".book-mulu>ul>li>a")  # returns a list of a tags
for a in a_list:
    title = a.string
    data_url = "http://www.shicimingju.com" + a["href"]
    data_page_text = requests.get(url=data_url, headers=headers).text
    # Parse the chapter content out of the detail page
    soup = BeautifulSoup(data_page_text, "lxml")
    content = soup.find('div', class_="chapter_content").text
    fp.write(title + ":" + content + "\n")
    print(title, "downloaded successfully")
fp.close()

3. xpath Parsing

1. How xpath parsing works

  1. Instantiate an etree object and load the page source to be parsed into that object

  2. Use the etree object's xpath method together with different forms of xpath expressions to locate tags and extract data

2. Environment setup
pip3 install lxml
3. Instantiating an etree object
etree.parse("test.html")  # load a locally stored HTML file
etree.HTML(page_text)     # load page source fetched from the internet
4. xpath expressions
# xpath expressions: the return value of the xpath method is always a list
    - A leading / means: the xpath expression locates tags starting from the root tag, level by level
    - A leading // means: the xpath expression can locate tags starting from any position
    - A non-leading /: represents a single level
    - A non-leading //: spans multiple levels
    - Attribute locating: //tagName[@attrName="value"]
    - Index locating: //tagName[index]   # indexing starts at 1

# Extracting text
    /text(): direct text content
    //text(): all text content

# Extracting attributes
/@attrName

Examples of xpath expressions

from lxml import etree

tree = etree.parse('./test.html')

# A leading / means: the xpath expression locates tags from the root tag, level by level
tree.xpath('/html/head/title')

# A leading // means: the xpath expression can locate tags starting from any position
tree.xpath("//title")
tree.xpath('//p')

# A non-leading / represents: a single level
tree.xpath('/html/body/div')

# A non-leading // represents: multiple levels
tree.xpath('/html/body//p')

# Attribute locating: //tagName[@attrName="value"]
tree.xpath('//div[@class="song"]')

# Index locating: //tagName[index]  (indexing starts at 1)
tree.xpath('//li[7]')

# Extracting text:
# /text(): direct text content
tree.xpath('//a[@id="feng"]/text()')[0]
# //text(): all text content
tree.xpath('//div[@class="song"]//text()')

# Extracting attributes
tree.xpath('//a[@id="feng"]/@href')

xpath crawler examples

# Requirement
  Scrape the joke content and author names from the Qiushibaike text section

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://www.qiushibaike.com/text/'
page_text = requests.get(url, headers=headers).text

# Parse the content
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="content-left"]/div')
for div in div_list:
    author = div.xpath('./div[1]/a[2]/h2/text()')[0]  # local (relative) parsing within this div
    content = div.xpath('./a[1]/div/span/text()')
    content = ''.join(content)
    print(author, content)

import os

import requests
from lxml import etree

dirname = "./meinvlibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = "http://pic.netbian.com/4kmeinv/index_%d.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
for page in range(1, 3):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = format(url % page)
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        img_src = "http://pic.netbian.com" + a.xpath("./img/@src")[0]
        img_name = a.xpath("./b/text()")[0]
        img_name = img_name.encode('iso-8859-1').decode('gbk')  # fix the garbled file name (the page is GBK-encoded)
        img_data = requests.get(url=img_src, headers=headers).content
        imgPath = dirname + '/' + img_name + '.jpg'
        with open(imgPath, "wb") as fp:
            fp.write(img_data)
            print(img_name, 'downloaded successfully!!!')

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
page_text = requests.get('https://www.aqistudy.cn/historydata/', headers=headers).text
tree = etree.HTML(page_text)
cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')  # the | union makes the xpath expression more general
print(cities)
# Scrape and save the free resume templates from http://sc.chinaz.com/jianli/free.html

"""
    从这里爬取免费的简历模板 
    第一页:'http://sc.chinaz.com/jianli/free.html'
    第其它页:f'http://sc.chinaz.com/jianli/free_{i}.html'
    
"""

import os

import requests
from lxml import etree

dirName = "./resume"
if not os.path.exists(dirName):
    os.mkdir(dirName)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    'Connection': 'close'
}
for i in range(1, 11):
    if i == 1:
        new_url = "http://sc.chinaz.com/jianli/free.html"
    else:
        new_url = f"http://sc.chinaz.com/jianli/free_{i}.html"
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@id="container"]//a[@class="title_wl"]')
    for a in a_list:
        title = a.xpath('./text()')[0]
        title = title.encode('iso-8859-1').decode('utf8')
        print(title)
        detail_path = a.xpath('./@href')[0]
        page_detail = requests.get(url=detail_path, headers=headers).text
        tree = etree.HTML(page_detail)
        download_url = tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]
        filename = dirName + '/' + title + '.jpg'
        img_data = requests.get(url=download_url, headers=headers).content
        with open(filename, "wb") as fp:
            fp.write(img_data)

When crawling a large amount of data, you may run into the following error:

HTTPConnectionPool error
Causes:
    1. Sending high-frequency requests in a short period of time gets the IP banned
    2. The connection resources in the HTTP connection pool are exhausted
Solutions:
    1. Use a proxy
    2. Add Connection: "close" to the headers (a sketch of both fixes follows below)
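
A minimal sketch of both fixes with requests; the proxy address below is a placeholder for illustration, not a working proxy:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    # Ask the server to close the connection after each request,
    # so idle keep-alive connections do not exhaust the pool
    'Connection': 'close'
}

# Placeholder proxy for illustration only; substitute a real proxy address
proxies = {
    'https': 'https://127.0.0.1:8888'
}

page_text = requests.get('http://sc.chinaz.com/jianli/free.html',
                         headers=headers, proxies=proxies).text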

To be continued

Original article: https://www.cnblogs.com/zangyue/p/12161303.html