QQ空间爬虫--获取好友信息

QQ空间网页版:https://user.qzone.qq.com/

登录后进入“设置”,在“权限设置”中把“谁能看我的空间”设为“好友可见”,然后开始构造爬虫。

(1)获取Cookie

两种方式:

第一种:在 Chrome 中按 F12 打开开发者工具,手动查找并复制 Cookie

第二种:用 selenium 模拟登录获取

 1 from selenium import webdriver
 2 from time import sleep
 3 import json
 4 
 5 QQ_num = ''
 6 QQ_s = ''
 7 driver = webdriver.Firefox()
 8 driver.get('https://user.qzone.qq.com/' + QQ_num + '/main')
 9 driver.switch_to.frame('login_frame')
10 #找到账号密码登陆的地方
11 driver.find_element_by_id('switcher_plogin').click()
12 driver.find_element_by_id('u').send_keys(QQ_num)
13 driver.find_element_by_id('p').send_keys(QQ_s)
14 driver.find_element_by_id('login_button').click()
15 #保存本地的cookie
16 sleep(1)
17 cookies = driver.get_cookies()
18 cookie_dic = {}
19 for cookie in cookies:
20     if 'name' in cookie and 'value' in cookie:
21         cookie_dic[cookie['name']] = cookie['value']
22     with open('cookie_dict.txt', 'w') as f:
23         json.dump(cookie_dic, f)
View Code

(2)构造链接

这一步参考了知乎上别人的思路,关键在于请求参数 g_tk 的构造:它由 Cookie 中的 p_skey 计算得到(见下面代码中的 get_gtk)。

(3)获取所有好友信息(主要是昵称和QQ号)

 1 import urllib
 2 import requests
 3 import csv
 4 import json
 5 import re
 6 
 7 class Qzone:
 8 
 9     #算出来gtk
10     def get_gtk(self):
11         p_skey = cookie['p_skey']
12         h = 5381
13         for i in p_skey:
14             h += (h << 5) + ord(i)
15         g_tk = h & 2147483647
16         return g_tk
17 
18     #得到uin
19     def get_uin(self):
20         uin = cookie['ptui_loginuin']
21         return uin
22 
23         # 得到好友qq
24     def get_qq(self):
25         qq_list = []
26         friend_list = self.get_friend()
27 
28         csvfile = open('friends.csv', 'w', newline='')
29         csv_write = csv.writer(csvfile, dialect='excel')
30 
31         for friend in friend_list:
32             csv_write.writerow(friend)
33             qq_list.append(friend[1])
34         csvfile.close()
35         return qq_list
36 
37         # 找出好友列表
38     def get_friend(self):
39         url_friend = 'https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/tfriend/friend_ship_manager.cgi?'
40         g_tk = self.get_gtk()
41         uin = self.get_uin()
42         data = {
43             'uin': uin,
44              'do': 1,
45             'g_tk': g_tk
46         }
47         data_encode = urllib.parse.urlencode(data)
48         url_friend += data_encode
49         res = requests.get(url_friend, headers=header, cookies=cookie)
50         friend_json = re.findall('((.*))', res.text, re.S)[0]
51         friend_dict = json.loads(friend_json)
52         friend_result_list = []
53          # 循环将好友的姓名qq号存入字典中
54         for friend in friend_dict['data']['items_list']:
55             friend_result_list.append([friend['name'], friend['uin']])
56         # 得到的好友字典是{name: qqNum}格式的
57         return friend_result_list
58         
if __name__ == '__main__':
    qzone = Qzone()

    # Kept as a module-level global so it is easy to reach from anywhere.
    relationships = []
    # Request headers, exposed as the module-level global `header`.
    # "Accept-Language" is the real header name; "Accepted-Language" is not
    # a standard HTTP header.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    # Load the cookies saved by the login script into the global `cookie`.
    with open('cookie_dict.txt', 'r') as f:
        cookie = json.load(f)
    # Fetch the friend list; get_qq() also writes it to friends.csv.
    qq_list = qzone.get_qq()
View Code
原文地址:https://www.cnblogs.com/flyuz/p/9606828.html