Day 8: Python

Class notes:
1. BeautifulSoup, a parsing library
2. MongoDB, a storage backend
3. requests-html, a request library

BeautifulSoup
1. What is bs4, and why use it?
bs4 is a parsing library that wraps an underlying HTML/XML parser (html.parser, lxml, ...) and exposes a simple API for navigating and searching documents, so you don't have to hand-write regular expressions.


Homework:
1. From Wandoujia, scrape the screenshot image URLs in each app's description and the user comments.

2. Insert the scraped Wandoujia data into MongoDB (a minimal layout sketch follows below):
- create a wandoujia database
- store the index-page data in a collection named index
- store the detail-page data in a collection named detail
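
A minimal sketch of the storage layout the homework asks for, assuming pymongo is installed and MongoDB is running locally on the default port (the document fields are placeholders):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['wandoujia']  # the wandoujia database, created lazily on first write

# index-page data goes into the index collection
db['index'].insert_one({'img': '...', 'down_num': '...'})
# detail-page data goes into the detail collection
db['detail'].insert_one({'name': '...', 'download_url': '...'})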





1. bs4, a parsing library
'''
pip3 install beautifulsoup4  # install bs4
pip3 install lxml            # install the lxml parser
'''


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Import BeautifulSoup from the bs4 package
from bs4 import BeautifulSoup

# Instantiate BeautifulSoup to get a soup object
# Argument 1: the text to parse
# Argument 2: the parser to use (html.parser, lxml, ...)
soup = BeautifulSoup(html_doc, 'lxml')

print(soup)
print('*' * 100)
print(type(soup))
# Pretty-print the document
html = soup.prettify()
print(html)
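
A side note on the second argument: lxml has to be pip-installed, while Python's built-in html.parser works with no extra dependency (slower, but always available). A quick check against the same document:

# Same document parsed with the standard-library parser instead of lxml
soup2 = BeautifulSoup(html_doc, 'html.parser')
print(soup2.title.text)  # The Dormouse's story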


2. bs4: navigating the document tree

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<p>shen</p><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

'''
    1. Direct attribute access
    2. Get a tag's name
    3. Get a tag's attributes
    4. Get a tag's text content
    5. Nested selection
    6. Children and descendants
    7. Parent and ancestors
    8. Siblings
'''

# 1. Direct attribute access
# print(soup.p)  # find the first p tag
# print(soup.a)  # find the first a tag

# 2. Get a tag's name
# print(soup.head.name)  # the name of the head tag

# 3. Get a tag's attributes
# print(soup.a.attrs)  # all attributes of the a tag, as a dict
# print(soup.a.attrs['href'])  # the href attribute of the a tag

# 4. Get a tag's text content
# print(soup.p.text)  # $37

# 5. Nested selection
# print(soup.html.head)

# 6. Children and descendants
# print(soup.body.children)  # all direct children of body; returns an iterator
#                            # e.g. <list_iterator object at 0x000002738557A780>
# print(list(soup.body.children))  # convert the iterator to a list

# print(soup.body.descendants)  # all descendants; returns a generator
#                               # e.g. <generator object descendants at 0x0000026693F27468>
# print(list(soup.body.descendants))

# 7. Parent and ancestors
# print(soup.p.parent)  # the parent node of the first p tag
# print(soup.p.parents)  # all ancestors of the first p tag; returns a generator
#                        # e.g. <generator object parents at 0x0000012F95BC7468>
# print(list(soup.p.parents))

# 8. Siblings
# The next sibling
# print(soup.p.next_sibling)
# All following siblings; returns a generator
# print(soup.p.next_siblings)
# print(list(soup.p.next_siblings))

# The previous sibling
print(soup.a.previous_sibling)  # the node right before the first a tag
# All siblings above the first a tag
print(soup.a.previous_siblings)  # returns a generator
print(list(soup.a.previous_siblings))

3. bs4: searching the document tree

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<p>shen</p><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""
'''
Searching the document tree:
    find()      returns the first match
    find_all()  returns all matches

Searching by tag and by attribute:

    Tag filters:
        - String filter: exact string match
            name   match by tag name
            attrs  match by attributes
            text   match by text content

        - Regex filter:
            match with a compiled re pattern

        - List filter:
            match anything contained in the list

        - Bool filter:
            True matches any value

        - Method filter:
            custom logic, e.g. attributes that must be present
            and attributes that must be absent

    Attribute shortcuts:
        - class_
        - id
'''

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# String filter
# name
# p_tag = soup.find(name='p')
# print(p_tag)  # find a tag by the name string 'p'
# Find all tags named p
# tag_s1 = soup.find_all(name='p')
# print(tag_s1)


# attrs
# Find the first tag whose class is sister
# p = soup.find(attrs={"class": "sister"})
# print(p)
# Find all tags whose class is sister
# tag_s2 = soup.find_all(attrs={"class": "sister"})
# print(tag_s2)


# text
# text = soup.find(text="$37")
# print(text)


# Combining filters
# Find one a tag with id link2 and text Lacie
# a_tag = soup.find(name="a", attrs={"id": "link2"}, text="Lacie")
# print(a_tag)


# Regex filter
# import re
# # name
# p_tag = soup.find(name=re.compile('p'))
# print(p_tag)


# List filter
# import re
# # name
# tags = soup.find_all(name=['p', 'a', re.compile('html')])
# print(tags)

# Bool filter
# True matches any value
# Find a p tag that has an id attribute
# p = soup.find(name='p', attrs={"id": True})
# print(p)

# Method filter
# Match a tags that carry both an id and a class attribute
# def have_id_class(tag):
#     if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
#         return tag
#
# tag = soup.find(name=have_id_class)
# print(tag)
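
The class_ and id attribute shortcuts listed in the docstring above can stand in for the attrs dict; a quick illustration against the same html_doc:

# class_ (trailing underscore, because class is a Python keyword) == attrs={"class": ...}
a_tags = soup.find_all(class_='sister')
print(a_tags)
# Keyword arguments filter on attributes directly
a_tag = soup.find(id='link2')
print(a_tag)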

4. Scraping Wandoujia app data

'''
Index page:
    icon URLs, etc.
    https://www.wandoujia.com/category/6001

    32 pages in total
'''

import re
import requests
from bs4 import BeautifulSoup

# 1. Send a request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse
# Parse a detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')
    # print(soup)

    # App name
    name = soup.find(name="span", attrs={"class": "title"}).text
    # print(name)

    # Positive-rating percentage
    love = soup.find(name='span', attrs={"class": "love"}).text
    # print(love)

    # Number of comments
    commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
    # print(commit_num)

    # Editor's review
    commit_content = soup.find(name='div', attrs={"class": "con"}).text
    # print(commit_content)

    # App download link
    download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']
    # print(download_url)

    # data = {'name': name}
    # table.insert(data)

    print(
        f'''
        ===============tank============
        App name: {name}
        Positive rating: {love}
        Comments: {commit_num}
        Editor's review: {commit_content}
        Download link: {download_url}
        '''
    )


# Parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')

    # Get the li tag of every app
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        print('*' * 1000)
        # print(app)
        # Icon URL
        # Get the data-original attribute of the first img tag
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Number of downloads
        # Get the text of the span tag whose class is install-count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size
        # Use a text regex to find the span whose text is digits + MB (\d+ matches one or more digits)
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)

        # Detail-page URL
        # Get the href attribute of the first a tag (links to the detail page)
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # 3. Request the app's detail page
        response = get_page(detail_url)
        # print(response.text)
        # print('tank')

        # 4. Parse the detail page
        parse_detail(response.text)


def main():
    for line in range(1, 33):
        url = f'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=ql8VkarJqaE7VAYNAEe2JueZ'

        # 1. Request the app-list API
        response = get_page(url)
        # print(response.text)
        print('*' * 1000)
        # Deserialize the JSON response into a dict
        data = response.json()
        # Extract the HTML fragment holding the app tags
        app_li = data['data']['content']
        # print(app_li)
        # 2. Parse the app tags
        parse_index(app_li)


if __name__ == '__main__':
    main()
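
The commented-out data/table.insert lines inside parse_detail hint at where persistence belongs; a minimal sketch of writing the detail data into the collection the homework names, assuming pymongo and a local MongoDB server:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
detail_table = client['wandoujia']['detail']

# Inside parse_detail, after the fields have been extracted:
# detail_table.insert_one({
#     'name': name,
#     'love': love,
#     'commit_num': commit_num,
#     'commit_content': commit_content,
#     'download_url': download_url,
# })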

5. MongoDB storage

from pymongo import MongoClient

# 1. Connect to the MongoDB server
# Argument 1: host
# Argument 2: port
client = MongoClient('localhost', 27017)
print(client)

# 2. Select the shen_db database (created lazily if it doesn't exist)
# print(client['shen_db'])

# 3. Select a collection (also created lazily)
# print(client['shen_db']['people'])

# 4. Insert data into the shen_db database
data1 = {
    'name': 'shen',
    'age': 20,
    'sex': 'female'
}
data2 = {
    'name': 'lu',
    'age': 21,
    'sex': 'female'
}
data3 = {
    'name': 'liu',
    'age': 22,
    'sex': 'female'
}
client['shen_db']['people'].insert_many([data1, data2, data3])

# 5. Query data
# Fetch all documents (returns a cursor)
data_s = client['shen_db']['people'].find()
print(data_s)
# Iterate over the cursor to print every document
for data in data_s:
    print(data)

# Fetch a single document
data = client['shen_db']['people'].find_one()
print(data)

# Officially recommended insert methods
# Insert one document: insert_one
# client['shen_db']['people'].insert_one()
# Insert many documents: insert_many
# client['shen_db']['people'].insert_many()
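
find() also accepts a filter document; a short extension of the example above using a standard MongoDB comparison operator:

# Query with a filter: all people older than 20
for doc in client['shen_db']['people'].find({'age': {'$gt': 20}}):
    print(doc)

# A filtered find_one works the same way
print(client['shen_db']['people'].find_one({'name': 'shen'}))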


 
 
Original article: https://www.cnblogs.com/shendongnian/p/11062121.html