Simple Crawler Architecture

Crawler architecture
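
In the usual formulation, a simple crawler has five cooperating parts: a scheduler that drives the whole process, a URL manager that tracks which URLs are pending and which have already been crawled, a downloader that fetches a page's raw HTML, a parser that extracts both the wanted data and newly discovered URLs, and an output step that collects the results.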

Run flow
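
One pass of the loop takes a pending URL from the URL manager, downloads the page, parses it, feeds the newly found URLs back to the manager, and collects the extracted data; the loop stops when no URLs remain or a page limit is reached. Below is a minimal sketch of that loop; the helper names download, parse, and crawl are illustrative, not from the original post.

from urllib.request import urlopen
from bs4 import BeautifulSoup

def download(url):
    # Downloader: fetch the raw HTML of one page.
    return urlopen(url).read().decode('utf-8')

def parse(html):
    # Parser: extract some data and the newly discovered URLs.
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.title.string if soup.title else ''
    # Keep absolute links only; a real crawler would resolve
    # relative hrefs with urllib.parse.urljoin.
    new_urls = {a['href'] for a in soup.find_all('a', href=True)
                if a['href'].startswith('http')}
    return data, new_urls

def crawl(seed_url, max_pages=10):
    pending, seen, results = {seed_url}, set(), []   # URL manager state
    while pending and len(seen) < max_pages:         # scheduler loop
        url = pending.pop()
        seen.add(url)
        data, new_urls = parse(download(url))
        results.append(data)                         # output collector
        pending |= new_urls - seen                   # feed new URLs back
    return results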

Web page parsers
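
Common choices for parsing a downloaded page in Python are regular expressions (fuzzy, string-level matching), the built-in html.parser, lxml, and BeautifulSoup, a third-party library that can use either html.parser or lxml underneath and builds the page into a DOM tree that can be searched node by node.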

Web page parsers: BeautifulSoup syntax
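
Usage follows three steps: create a BeautifulSoup object from the HTML string, search for nodes with find (first match) or find_all (all matches) by tag name, attribute filters, or string content, any of which may be a compiled regular expression, and then read each matched node's name, attributes, and text. Both examples below walk through this pattern.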

Simple parsing example 1

from bs4 import BeautifulSoup
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "html.parser")

print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('Get the link for Lacie')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regular-expression match')
link_node = soup.find('a', href=re.compile(r"ill"))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the text of the p paragraph')
p_node = soup.find('p', class_="title")
print(p_node.name, p_node.get_text())
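
Note that find returns None when nothing matches, so the chained attribute accesses above would raise an AttributeError on a miss. A small defensive variant, reusing the soup object from the example (the pattern no-such-pattern is illustrative):

link_node = soup.find('a', href=re.compile(r"no-such-pattern"))  # no match
if link_node is not None:
    print(link_node.name, link_node['href'], link_node.get_text())
else:
    print('no matching link')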

Simple parsing example 2

from bs4 import BeautifulSoup as bs
import re


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.net/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Parse with the html.parser parser
soup = bs(html_doc,"html.parser")

print(soup.prettify())

# Get the title tag and its content
print(soup.title) 

# Get the text inside the title tag
print(soup.title.string) 

# Get the name of the parent tag
print(soup.title.parent.name) 

# Get the first p tag and its content
print(soup.p)

# Get the value of the p tag's class attribute
print(soup.p['class'])

# Get the first a tag and its content
print(soup.a)

'''
soup.tag returns only the first of all matching tags in the document.
'''
# Get all a tags and their content
print(soup.find_all('a'))

# Get the tag whose id is link1, with its content
print(soup.find(id='link1'))

# Get the text of the tag whose id is link1
print(soup.find(id='link1').string)

# Get the URL and text of every a tag
for link in soup.find_all('a'):
    print('URL: ' + link.get('href') + '  text: ' + link.string)

# Get the first p tag whose class attribute is story, with its content
print(soup.find("p",{"class":"story"}))


# Get only the text under the p tag whose class attribute is story
print(soup.find("p",{"class":"story"}).get_text())


# Get every tag whose name starts with b
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)


# Get all a tags whose href contains http://example.com/
print(soup.find_all("a", href=re.compile(r"http://example.com/")))
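
The same queries can also be written as CSS selectors through select(), which is part of BeautifulSoup's public API and returns a list of matches; a short sketch against the same soup object:

print(soup.select('p.title b'))       # the b tag inside <p class="title">
print(soup.select('a#link2'))         # the a tag with id="link2"
print(soup.select('a[href^="http://example.com/"]'))  # href prefix match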

Comprehensive example: crawling Wikipedia entries

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import the required packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode('utf-8')

# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")

# Find links whose href starts with /wiki/
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))

# Print the name and URL of every entry
for url in listUrls:
    # Filter out links ending in .jpg or .JPG
    if not re.search(r'\.(jpg|JPG)$', url["href"]):
        # Print the link text and its URL
        print(url.get_text() + '<--->' + url['href'])
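
The /wiki/ hrefs above are site-relative, and many entries repeat on the main page. A small extension, assuming the same listUrls from the example, that deduplicates with a set (echoing the URL-manager idea from the architecture) and resolves each href to an absolute URL:

from urllib.parse import urljoin

seen = set()
for url in listUrls:
    href = url["href"]
    if re.search(r'\.(jpg|JPG)$', href) or href in seen:
        continue
    seen.add(href)
    # urljoin resolves the site-relative href against the page's base URL
    print(url.get_text() + '<--->' + urljoin("https://en.wikipedia.org/wiki/Main_Page", href))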

  

Original post: https://www.cnblogs.com/luoye00/p/5785495.html