Python crawler practice: scraping Wandoujia "Casual & Puzzle" game apps

'''
Home page:
    icon URL, download count, size, detail-page URL

Detail page:
    game name, rating, comment count, editor's review, download URL,
    description, user comments, 1-5 screenshot URLs

The list API is paginated (32 pages in total):

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
'''
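
Before writing the full crawler, it is worth probing what that paginated API returns. A minimal sketch (assuming the ctoken above is still valid; as the code below relies on, the response is JSON whose data.content field carries an HTML fragment of app cards):

import requests

url = ("https://www.wandoujia.com/wdjweb/api/category/more"
       "?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B")
payload = requests.get(url).json()
print(list(payload))                     # top-level keys; the script only needs 'data'
print(payload['data']['content'][:200])  # a peek at the embedded HTML
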
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re

# Connect to MongoDB
client = MongoClient('localhost', 27017)
# Collection for home-page records
index_col = client['wandoujia']['index']
# Collection for detail-page records
detail_col = client['wandoujia']['detail']

# 1. Send a request
def get_page(url):
    response = requests.get(url)
    return response

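
Note that get_page sends a bare GET: no headers, no timeout. Wandoujia, like many sites, may throttle or block clients without a browser-like User-Agent, and a stalled connection would hang the crawl. A hardened variant could look like this sketch; the name get_page_safe, the header string, and the 10-second timeout are illustrative, not part of the original script:

def get_page_safe(url):
    # Pose as a regular browser and never wait more than 10 seconds
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    return response
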
# 2. Parsing
# Parse the detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')
    # print(soup)

    # App name
    try:
        name = soup.find(name="span", attrs={"class": "title"}).text
    except Exception:
        name = None
    # print(name)

    # Rating
    try:
        love = soup.find(name='span', attrs={"class": "love"}).text
    except Exception:
        love = None
    # print(love)

    # Comment count
    try:
        commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
    except Exception:
        commit_num = None
    # print(commit_num)

    # Editor's review
    try:
        commit_content = soup.find(name='div', attrs={"class": "con"}).text
    except Exception:
        commit_content = None
    # print(commit_content)

    # App download link
    try:
        download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']
    except Exception:
        download_url = None
    # print(download_url)

    print('''
        ============= tank ==============
        app name: {name}
        rating: {love}
        comment count: {commit_num}
        editor's review: {commit_content}
        download link: {download_url}
        ============= end ==============
        '''.format(name=name, love=love, commit_num=commit_num,
                   commit_content=commit_content, download_url=download_url)
          )

    # Assemble the record; any field may be None if the page lacked it
    detail_data = {
        'name': name,
        'love': love,
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url,
    }

    # If love has no value, store a placeholder
    if not love:
        detail_data['love'] = "no likes yet, sad"

    # If download_url has no value, store a placeholder
    if not download_url:
        detail_data['download_url'] = "no installer"

    # Insert the detail-page record
    detail_col.insert_one(detail_data)
    print('{} app data inserted!'.format(name))


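The five try/except lookups in parse_detail all repeat the same find-or-None pattern. A small helper would make that explicit; safe_find below is a sketch, not part of the original script:

def safe_find(soup, name, cls, attr=None):
    # Return the tag's text (or one of its attributes), or None when the tag is missing
    tag = soup.find(name=name, attrs={"class": cls})
    if tag is None:
        return None
    return tag.attrs.get(attr) if attr else tag.text

# e.g. name = safe_find(soup, 'span', 'title')
#      download_url = safe_find(soup, 'a', 'normal-dl-btn', attr='href')
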
# Parse the home page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')

    # Grab every app's li tag
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # print(app)

        # Icon URL
        # Take the data-original attribute of the first img tag
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count
        # Text of the span tag whose class is install-count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size
        # Regex match on the text: the span containing digits + "MB" (\d+ matches digits),
        # searched within this app's card so each app gets its own size
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)

        # Detail-page URL
        # Take the href attribute of the first a tag
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # Assemble the record
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url,
        }

        # Insert the home-page record
        index_col.insert_one(index_data)
        print('Home-page data inserted!')

        # 3. Request the app's detail page
        response = get_page(detail_url)

        # 4. Parse the detail page
        parse_detail(response.text)


def main():
    for line in range(1, 33):
        url = "https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B".format(line=line)

        # 1. Request the app list API
        response = get_page(url)
        # print(response.text)
        print('*' * 1000)
        # Deserialize the JSON body into a dict
        data = response.json()

        # Pull the HTML fragment holding the app tags out of the response
        app_li = data['data']['content']
        # print(app_li)
        # 2. Parse the app tags
        parse_index(app_li)

    # Close the MongoDB client once every page has been processed
    client.close()


if __name__ == '__main__':
    main()
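
After a run finishes, the stored documents can be spot-checked from a separate Python shell. A minimal sketch against the same two collections (count_documents and find_one are standard pymongo calls):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
print(client['wandoujia']['index'].count_documents({}))  # one record per app card
print(client['wandoujia']['detail'].find_one())          # a sample detail record
client.close()
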
Original post: https://www.cnblogs.com/lweiser/p/11066408.html