# 1、XPath 解析网页源文件
from urllib import request
from lxml import etree
# URL of the product listing page to scrape
url = "http://www.dfenqi.cn/Product/Index"
# Request headers; a browser User-Agent is sent with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}
# Build the request object
req = request.Request(url, headers=headers)
# Create the HTTP handler
httpHandler = request.HTTPHandler()
# Create the opener that uses the handler
opener = request.build_opener(httpHandler)
# Send the request and read the page source. The with-block closes the
# response (and its underlying connection) as soon as the body has been
# read, instead of leaving it open for the rest of the script.
with opener.open(req) as response:
    html = response.read().decode('utf-8')
# XPath selecting the text nodes of each <p> inside the listing items
xpath = "//div[@class='liebiao']/ul/li/p/text()"
# Alternative: select the class attribute values instead of the text
# xpath = "//div[@class='liebiao']/ul/li/p/@class"
# Parse the HTML into an element tree that can be queried
selector = etree.HTML(html)
# Run the XPath query; the result is a list of matches
goodsList = selector.xpath(xpath)
# Print the product titles
for goods in goodsList:
    print(goods)
# 2、XPath 解析源文件，并下载图片至本地
from urllib import request
from lxml import etree
import os
class Spilder():
    """Fetch a page, extract image URLs from it with XPath, and save images locally.

    All requests go through one shared opener and carry the same
    browser User-Agent header.
    """

    def __init__(self, pageUrl):
        """Store the target URL and build the shared request objects.

        :param pageUrl: URL of the page to crawl
        """
        # URL of the page to crawl
        self.pageUrl = pageUrl
        # Request headers sent with every request
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        # Handler for the requests
        self.httpHandler = request.HTTPHandler()
        # Opener used to send the requests
        self.opener = request.build_opener(self.httpHandler)

    def _fetch(self, url):
        """Request ``url`` with the configured headers and return the body.

        :param url: absolute URL to request
        :return: the response body as bytes
        """
        req = request.Request(url, headers=self.headers)
        # The with-block closes the response once the body has been read,
        # so connections are not left open across many downloads.
        with self.opener.open(req) as response:
            return response.read()

    def loadPage(self):
        """Request the page at ``self.pageUrl``.

        :return: the page source as bytes
        """
        return self._fetch(self.pageUrl)

    def getImageUrls(self, html, xpath):
        """Parse the page source and evaluate an XPath expression on it.

        :param html: page source
        :param xpath: XPath expression string
        :return: list of matches returned by the XPath query
        """
        selector = etree.HTML(html)
        return selector.xpath(xpath)

    def loadImage(self, url):
        """Download an image.

        :param url: image URL; may be absolute, or relative to ``self.pageUrl``
        :return: the image data as bytes
        """
        # Imported locally so this method does not depend on a file-level
        # import being added.
        from urllib.parse import urljoin

        # src attributes scraped from a page can be relative ("/img/a.jpg")
        # or protocol-relative ("//cdn/a.jpg"); Request rejects those with
        # "unknown url type". Resolving against the page URL handles them
        # and leaves absolute URLs unchanged.
        return self._fetch(urljoin(self.pageUrl, url))

    def writeImage(self, img, imgName):
        """Write image data into an ``image`` subfolder of the current directory.

        The folder is created if it does not exist yet.

        :param img: image data (bytes)
        :param imgName: file name of the image
        :return: None
        """
        folderName = os.path.join(os.path.abspath(os.curdir), "image")
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(folderName, exist_ok=True)
        # Write to the same path that was just created rather than a
        # separately spelled relative path.
        with open(os.path.join(folderName, imgName), 'wb') as f:
            f.write(img)
if __name__ == "__main__":
    # Listing page to crawl and the XPath selecting each image's src attribute.
    pageUrl = "http://www.dfenqi.cn/Product/Index"
    imgXpath = "//div[@class='liebiao']/ul/li/div/a/img/@src"

    spilder = Spilder(pageUrl)
    pageSource = spilder.loadPage()
    imgUrls = spilder.getImageUrls(pageSource, imgXpath)

    # Download every image in document order; files are numbered from 1
    # (img1.jpg, img2.jpg, ...).
    for index, imgUrl in enumerate(imgUrls, start=1):
        imgData = spilder.loadImage(imgUrl)
        spilder.writeImage(imgData, 'img%s.jpg' % index)