Web Crawler Final Assignment

I scraped the Tencent Comics (腾讯漫画) site. It hosts a very large number of domestic Chinese comics, and I wanted to get a picture of how they are doing:

URL: http://ac.qq.com/Comic/all/search/time/page/1

Implementation:

1. Grab the "author" (作者), "popularity" (人气), and "status" (状态) elements from each comic's detail page
def getCartoonDetail(cartoonUrl):
    resd = requests.get(cartoonUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    cartoons = {}

    p1 = soupd.select('.works-intro-digi')[0]
    p2 = soupd.select('.works-score.clearfix')[0]
    p3 = soupd.select('.works-cover.ui-left')[0]
    # author: drop the "作者:" label and the non-breaking spaces
    cartoons['作者'] = p1.select('.first')[0].text.replace('作者:', '', 1).replace('\xa0', ' ')
    # popularity (left disabled, as in the original run)
    # cartoons['人气'] = p2.select('.ui-text-orange')[0].text.replace('\xa0', ' ')
    # status (serializing / completed)
    cartoons['状态'] = p3.select('.works-intro-status')[0].text.replace('\xa0', ' ')
    writeCartoonDetail(cartoons['作者'] + ' ' + cartoons['状态'] + '\n')
    return cartoons
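Each select(...)[0] above assumes the element is present; if Tencent ever changes the page markup, the script stops with an IndexError. As a defensive variant (my own addition, with a hypothetical helper name safeText), the lookups can be guarded:

def safeText(parent, selector, default=''):
    # return the text of the first match, or a default when nothing matches
    hits = parent.select(selector)
    return hits[0].text if hits else default

# e.g. cartoons['状态'] = safeText(p3, '.works-intro-status').replace('\xa0', ' ')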

2. Extract every comic from a catalogue list page

def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    cartoonlist = []

    for cartoon in soup.select('.ret-works-info'):
        if len(cartoon.select('.ret-works-title.clearfix')) > 0:
            a = cartoon.select('a')[0].attrs['href']  # site-relative link to the detail page
            url = 'http://ac.qq.com' + a
            cartoonlist.append(getCartoonDetail(url))
    return cartoonlist
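The href attributes on the list page are site-relative, which is why the code prepends 'http://ac.qq.com'. The standard library's urljoin performs the same join and also copes if the site ever mixes in absolute links; a small sketch (the example href is illustrative, not taken from a live page):

from urllib.parse import urljoin

# urljoin('http://ac.qq.com', '/Comic/comicInfo/id/12345') -> 'http://ac.qq.com/Comic/comicInfo/id/12345'
url = urljoin('http://ac.qq.com', a)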

3. Read the total number of list pages from the last list page

def getPageN():
    res = requests.get('http://ac.qq.com/Comic/all/search/time/page/2300')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    p = soup.select('.ret-page-wr.mod-page')[0]  # pagination bar on the last page
    n = 2300  # hard-coded; the parsed version below is left for reference
    # n = int(soup.select('.current')[0].text)
    return n
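The hard-coded 2300 only holds while the catalogue stays that size. Assuming the pagination bar exposes its page links as plain numbers (an assumption about the markup, not verified against the live page), the count could be parsed instead of guessed:

def getPageNParsed():
    # hypothetical variant: take the largest numeric link in the pagination bar
    res = requests.get('http://ac.qq.com/Comic/all/search/time/page/1')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pager = soup.select('.ret-page-wr.mod-page')[0]
    nums = [int(a.text) for a in pager.select('a') if a.text.strip().isdigit()]
    return max(nums) if nums else 1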

4. Collect the details and save them to an Excel file and a plain text file

def writeCartoonDetail(content):
    # append one comic's line to the running text file
    with open('t.txt', 'a', encoding='utf-8') as f:
        f.write(content)

cartoontotal = []
pageUrl = 'http://ac.qq.com/Comic/all/search/time/page/1'
cartoontotal.extend(getListPage(pageUrl))
print(cartoontotal)

n = getPageN()
for i in range(2, n + 1):
    pageUrl = 'http://ac.qq.com/Comic/all/search/time/page/{}'.format(i)
    cartoontotal.extend(getListPage(pageUrl))

# note: the keys are overwritten on every pass, so this dict
# ends up holding only the last comic's author and status
cartoonsList = {}
for c in cartoontotal:
    cartoonsList['作者'] = c['作者']
    cartoonsList['状态'] = c['状态']
print(cartoonsList)

df = pandas.DataFrame(cartoontotal)
print(df)
df.to_excel('t.xlsx', encoding='utf-8')
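writeCartoonDetail reopens t.txt for every single comic, which over a couple of thousand list pages means tens of thousands of open/close cycles. A variant of mine (same output, not the original code) collects the lines and writes them once at the end:

lines = [c['作者'] + ' ' + c['状态'] for c in cartoontotal]
with open('t.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines))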

  

5. Install wordcloud, read the scraped data, and turn it into a word cloud

def ToDict():
    # segment the scraped text with jieba and count word frequencies
    with open('t.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    stringList = list(jieba.cut(text))
    title_dict = {}
    for i in set(stringList):
        title_dict[i] = stringList.count(i)
    return title_dict
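Counting with stringList.count(i) rescans the whole word list once per distinct word, which is quadratic in the number of words. collections.Counter from the standard library builds the same frequency dict in a single pass; a drop-in sketch (ToDictFast is my name, not the original's):

from collections import Counter

def ToDictFast():
    # one-pass frequency count over the jieba-segmented words
    with open('t.txt', 'r', encoding='utf-8') as f:
        return dict(Counter(jieba.cut(f.read())))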
 
 
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

title_dict = ToDict()
font = r'C:\Windows\Fonts\simhei.ttf'  # a Chinese font, or the words render as boxes
backgroud_Image = plt.imread(r'C:\Users\Administrator\Desktop\text.jpg')  # mask shape
wc = WordCloud(background_color='white', max_words=500, font_path=font, mask=backgroud_Image)
wc.generate_from_frequencies(title_dict)
plt.imshow(wc)
plt.axis("off")
plt.show()
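plt.show() only displays the figure on screen. To keep the image, the WordCloud object can write itself straight to disk via its to_file method (the filename here is just an example):

wc.to_file('cartoon_wordcloud.png')  # save the rendered cloud as a PNG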

  

Interpretation: The word cloud shows that most comics on Tencent Comics are still in serialization. Quite a few well-known novelists also appear by name, such as 天蚕土豆 (Tian Can Tu Dou) and 唐家三少 (Tang Jia San Shao), which suggests that famous novels are crossing over into comics and drawing large audiences. Domestic industry backing for comics is also strong: studios such as 触漫 and 开源动漫 have plenty of works on the platform.

Problems and fixes: the word cloud caused the most trouble. PyCharm first told me to install Visual C++ Build Tools, but my campus network is so slow that the download never finished, so I switched to a second approach:

Download wordcloud-1.4.1-cp36-cp36m-win32.whl, open cmd, cd into the directory holding the file, and run:

pip install wordcloud-1.4.1-cp36-cp36m-win32.whl

pip install wordcloud

which completes the installation.

One more catch: while scraping, some elements carry a class made of two words; in the selector the words must be chained with a dot before the element can be matched:

soupd.select('.works-score.clearfix')[0]
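In the HTML this element is written as class="works-score clearfix", i.e. two classes on one element; CSS selects it by chaining the class names with dots and no space in between:

# <div class="works-score clearfix"> ... </div>
soupd.select('.works-score.clearfix')   # matches: both classes on the same element
soupd.select('.works-score .clearfix')  # no match here: the space means "descendant"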

Complete code:

import requests
import re
from bs4 import BeautifulSoup
import pandas
import jieba
 
def getCartoonDetail(cartoonUrl):
    resd = requests.get(cartoonUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    cartoons = {}

    p1 = soupd.select('.works-intro-digi')[0]
    p2 = soupd.select('.works-score.clearfix')[0]
    p3 = soupd.select('.works-cover.ui-left')[0]
    # author: drop the "作者:" label and the non-breaking spaces
    cartoons['作者'] = p1.select('.first')[0].text.replace('作者:', '', 1).replace('\xa0', ' ')
    # popularity (left disabled, as in the original run)
    # cartoons['人气'] = p2.select('.ui-text-orange')[0].text.replace('\xa0', ' ')
    # status (serializing / completed)
    cartoons['状态'] = p3.select('.works-intro-status')[0].text.replace('\xa0', ' ')
    writeCartoonDetail(cartoons['作者'] + ' ' + cartoons['状态'] + '\n')
    return cartoons
 
def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    cartoonlist = []

    for cartoon in soup.select('.ret-works-info'):
        if len(cartoon.select('.ret-works-title.clearfix')) > 0:
            a = cartoon.select('a')[0].attrs['href']  # site-relative link to the detail page
            url = 'http://ac.qq.com' + a
            cartoonlist.append(getCartoonDetail(url))
    return cartoonlist
 
def getPageN():
    res = requests.get('http://ac.qq.com/Comic/all/search/time/page/2300')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    p = soup.select('.ret-page-wr.mod-page')[0]  # pagination bar on the last page
    n = 2300  # hard-coded; the parsed version below is left for reference
    # n = int(soup.select('.current')[0].text)
    return n
 
def writeCartoonDetail(content):
    # append one comic's line to the running text file
    with open('t.txt', 'a', encoding='utf-8') as f:
        f.write(content)

cartoontotal = []
pageUrl = 'http://ac.qq.com/Comic/all/search/time/page/1'
cartoontotal.extend(getListPage(pageUrl))
print(cartoontotal)

n = getPageN()
for i in range(2, n + 1):
    pageUrl = 'http://ac.qq.com/Comic/all/search/time/page/{}'.format(i)
    cartoontotal.extend(getListPage(pageUrl))

# note: the keys are overwritten on every pass, so this dict
# ends up holding only the last comic's author and status
cartoonsList = {}
for c in cartoontotal:
    cartoonsList['作者'] = c['作者']
    cartoonsList['状态'] = c['状态']
print(cartoonsList)

df = pandas.DataFrame(cartoontotal)
print(df)
df.to_excel('t.xlsx', encoding='utf-8')
 
def ToDict():
    # segment the scraped text with jieba and count word frequencies
    with open('t.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    stringList = list(jieba.cut(text))
    title_dict = {}
    for i in set(stringList):
        title_dict[i] = stringList.count(i)
    return title_dict
  
  
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

title_dict = ToDict()
font = r'C:\Windows\Fonts\simhei.ttf'  # a Chinese font, or the words render as boxes
backgroud_Image = plt.imread(r'C:\Users\Administrator\Desktop\text.jpg')  # mask shape
wc = WordCloud(background_color='white', max_words=500, font_path=font, mask=backgroud_Image)
wc.generate_from_frequencies(title_dict)
plt.imshow(wc)
plt.axis("off")
plt.show()

  
