python 简书用户爬虫

python 简书用户爬虫 广度优先

 1 # python
 2 # -*- coding: utf-8 -*-
 3 """
 4 __title__ = ''
 5 __author__ = 'wlc'
 6 __mtime__ = '2017/10/15'
 7 """
 8 import re
 9 import time
10 import math
11 import csv
12 import requests
13 from bs4 import BeautifulSoup
14 from collections import deque
15 import sys
16 #python 默认递归限制为900
17 sys.setrecursionlimit(10000)
18 
19 #建立一个csv文件保存信息
20 path = 'dataCollection/userInfo.csv'
21 csvFile = open(path, 'a+', newline='', encoding='utf-8')
22 writer = csv.writer(csvFile)
23 writer.writerow(('id','name','following','follower','article','word','like'))
24 
25 #全局变量用来存储userid 和关注的人数
26 idContainer = set()
27 #用来放置用户的链接使用双向队列
28 linkDeque  = deque()
29 
class jianshu(object):
    """Breadth-first crawler over jianshu.com "following" lists.

    Starting from one user, it pages through that user's followees,
    writes each followee's profile counters to the module-level CSV
    ``writer``, queues unseen followees on ``linkDeque``, and then
    crawls them in turn. Visited users are tracked in ``idContainer``.
    """

    def __init__(self):
        # URL template: one page (10 entries) of a user's followee list.
        self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
        # Captures (user id, user name) from the anchor of a list entry.
        self.idPattern = re.compile(r'<a class="name" href="/u/(.*?)">(.*?)</a>')
        # Captures the (following, follower, article) counters.
        # NOTE: the scraped original had "(d+)"; restored to r"(\d+)".
        self.metalPattern = re.compile(r'<span>关注 (\d+)</span><span>粉丝 (\d+)</span><span>文章 (\d+)</span>')
        # Captures (words written, likes received); missing for some users.
        self.meta = re.compile(r'写了 (\d+) 字,获得了 (\d+) 个喜欢')
        # Browser-like User-Agent so the site serves the regular HTML.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}

    def createRequest(self, userId, page):
        """Fetch one followee-list page for userId and return its HTML text."""
        url = self.url.format(userId=userId, page=page)
        return requests.get(url, headers=self.header).text

    def pageResponse(self, requ):
        """Parse one page of HTML and return the list of per-user HTML snippets."""
        bsOBJ = BeautifulSoup(requ, 'lxml')
        userContainer = bsOBJ.find_all('ul', {'class': 'user-list'})[0]
        # Drop the bare-newline text nodes between <li> elements.
        return [str(user) for user in userContainer.contents if user != '\n']

    def parserUserInfo(self, user):
        """Extract one user's counters from an HTML snippet, write a CSV row,
        and return the tuple (id, name, following, follower, article, word, like)."""
        userId, name = re.findall(self.idPattern, user)[0]
        followingNum, followerNum, articleNum = re.findall(self.metalPattern, user)[0]
        try:
            wordNum, likeNum = re.findall(self.meta, user)[0]
        except IndexError:
            # Some profiles omit the word/like summary line entirely.
            wordNum, likeNum = 0, 0
        content = (userId, name, followingNum, followerNum, articleNum, wordNum, likeNum)
        writer.writerow(content)
        return content

    def getUserList(self, userId, following):
        """Crawl all followees of userId (who follows `following` people),
        then recurse into any queued users not yet visited."""
        idContainer.add((userId, following))
        # 10 followees per page; round up to cover a partial last page.
        pageCount = math.ceil(int(following) / 10)
        for pg in range(1, pageCount + 1):
            html = self.createRequest(userId, pg)
            for user in self.pageResponse(html):
                content = self.parserUserInfo(user)
                linkDeque.append((content[0], content[2]))
            time.sleep(1)  # be polite between page fetches
        # Drain the queue with popleft(): the original iterated the deque
        # while the recursion appended to it, which raises
        # "deque mutated during iteration" at runtime.
        while linkDeque:
            deq = linkDeque.popleft()
            if deq not in idContainer:
                self.getUserList(deq[0], deq[1])
                print("what")
81 jianshu = jianshu().getUserList('1562c7f16a04',45)
原文地址:https://www.cnblogs.com/wlc297984368/p/7673777.html