Python爬虫QQ空间

#coding:utf-8

import time
from selenium import webdriver
from lxml import etree

import sys
# Python 2-only workaround: `reload` is a builtin only on Python 2, and
# reloading `sys` brings back `setdefaultencoding`, which the interpreter
# removes from the module at start-up. Switching the process-wide default
# codec to UTF-8 makes implicit str/unicode conversions use UTF-8 --
# presumably so the scraped (unicode) text can be written to a plain file
# without raising UnicodeEncodeError. These two lines fail on Python 3.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Account settings -- fill these in before running the script.
friend = ''  # Target QQ number; the target's Qzone must allow visitors
user = ''  # Your QQ number
pw = ''  # Your QQ password

# Start Chrome through the local chromedriver binary and open the login page.
driver = webdriver.Chrome(executable_path='/Users/jiwu/Downloads/chromedriver')
driver.maximize_window()
driver.get("http://i.qq.com")

# The login form is inside the "login_frame" iframe; click the switcher to
# reach the account/password inputs, fill them in, and submit.
driver.switch_to.frame("login_frame")
driver.find_element_by_id("switcher_plogin").click()
for field_id, value in (("u", user), ("p", pw)):
    driver.find_element_by_id(field_id).send_keys(value)
driver.find_element_by_id("login_button").click()

# Return to the top-level document, then open the friend's "/311" page.
driver.switch_to.default_content()
driver.get("http://user.qzone.qq.com/{}/311".format(friend))

# Walk the friend's feed page by page. For every page: scroll the outer
# document, parse the list inside the "app_canvas_frame" iframe, print each
# entry and append its text to qq_word.txt, then follow the "next page" link
# until none is present.
next_num = 0  # numeric suffix of the pager link id: pager_next_0, pager_next_1, ...
try:
    while True:
        # Scroll down in growing steps with a pause after each one --
        # presumably to give lazily loaded entries time to render.
        for i in range(1, 6):
            height = 20000 * i
            driver.execute_script("window.scrollBy(0," + str(height) + ")")
            time.sleep(4)

        # The entries live inside an iframe; parse its current HTML with lxml.
        driver.switch_to.frame("app_canvas_frame")
        selector = etree.HTML(driver.page_source)
        divs = selector.xpath('//*[@id="msgList"]/li/div[3]')

        with open('qq_word.txt', 'a') as f:
            for div in divs:
                names = div.xpath('./div[2]/a/text()')
                contents = div.xpath('./div[2]/pre/text()')
                times = div.xpath('./div[4]/div[1]/span/a/text()')
                # Each XPath query returns a list; use the first hit or ''.
                qq_name = names[0] if names else ''
                qq_content = contents[0] if contents else ''
                qq_time = times[0] if times else ''
                print(qq_name, qq_time, qq_content)
                # One entry per line (the original literal was split across
                # two physical lines, which is a syntax error).
                f.write(qq_content + "\n")

        # Stop when the page no longer mentions the next pager link.
        next_id = 'pager_next_' + str(next_num)
        if next_id not in driver.page_source:
            break

        driver.find_element_by_id(next_id).click()
        next_num += 1

        # Leave the iframe so the next round scrolls the outer document.
        driver.switch_to.parent_frame()
finally:
    # Always close the browser and stop the chromedriver process, even if
    # scraping fails part-way through.
    driver.quit()

生成词云:

#coding:utf-8

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba

def create_word_cloud(filename, output_path='py_book.png'):
    """Render a word cloud from ``<filename>.txt``, save it, and display it.

    Args:
        filename: Path of the input text file *without* the ``.txt``
            extension.
        output_path: Where the rendered image is written. Defaults to
            ``py_book.png``, the name the script has always used.

    The input is read as UTF-8, segmented with jieba in full mode, and the
    resulting tokens are handed to WordCloud as one space-separated string.
    """
    # Local import keeps the block self-contained; io.open accepts
    # ``encoding=`` on both Python 2 and Python 3.
    import io

    # Close the file deterministically and decode explicitly instead of
    # relying on the platform's default codec.
    with io.open("{}.txt".format(filename), encoding="utf-8") as source:
        text = source.read()

    # cut_all=True is jieba's full mode; join the tokens with spaces so
    # WordCloud receives space-delimited words.
    wl = " ".join(jieba.cut(text, cut_all=True))

    wc = WordCloud(
        background_color="white",
        max_words=2000,
        # macOS system font path; NOTE(review): a font with CJK glyphs is
        # presumably required for Chinese text -- adjust on other systems.
        font_path='/System/Library/Fonts/PingFang.ttc',
        height=1200,
        width=1600,
        max_font_size=100,
        random_state=30,
    )

    myword = wc.generate(wl)

    # Save before showing: plt.show() can block until the window is closed,
    # and the image should not be lost if the viewer is interrupted.
    wc.to_file(output_path)

    plt.imshow(myword)
    plt.axis("off")
    plt.show()

# Script entry point: build the cloud from qq_word.txt in the working directory.
if __name__ == "__main__":
    create_word_cloud("qq_word")

原文地址:https://www.cnblogs.com/131li/p/8933562.html