Web Crawler Final Project

1. Pick a topic you are interested in (no two students may choose the same one).

2. Write a crawler in Python to collect data on that topic from the web.

3. Run text analysis on the crawled data and generate a word cloud.

4. Explain and interpret the text-analysis results.

5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.

6. Finally, submit all the crawled data together with the crawler and data-analysis source code.

The topic I chose is to crawl the authors of all news articles in the Industry (业界) section of Leiphone (雷锋网) and work out which author has published the most articles there.

1. First, get the URL of each news item. The code is as follows:

# Collect the detail-page URL of every news item on one list page
# and fetch each item's details via getListPage().
def getNewsUrl(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')
    allnewsList = []
    for news in items:
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList

2. Using the URLs obtained above, get the details of each news item. The code is as follows:

# Parse one news detail page and return its fields;
# the author name is also appended to the txt file for later counting.
def getListPage(NewUrl):
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []

    new = {}
    new['题目'] = article.select('h1')[0].text.strip()
    new['作者'] = article.select('a')[0].text            # the first <a> in the template is the author
    new['时间'] = article.select('.time')[0].text.strip()
    new['导语'] = article.select('.article-lead')[0].text.strip()
    a = article.select('p')
    # print(new)
    newList.append(new)
    # Body paragraphs (the last two <p> tags are not part of the article body).
    content = ' '.join(p.text for p in a[:-2])

    title = article.select('a')[0].text                  # author name again
    writeNewsContent(title)                              # append it to the txt file

    return newList

3. Get the total number of pages.

# Read the page count from the pagination bar of the category page.
def getPageN():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n

n = getPageN()

4. Get the information for all news items.

# Walk every list page and collect all news items.
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)

5. Write every article's author into a txt file.

# Append a piece of text (here, the author name) to gzccNews.txt.
def writeNewsContent(title):
    with open('gzccNews.txt', 'a', encoding='utf-8') as f:
        f.write(title)

6. Analyze the information in the txt file.

import jieba

# Read the crawled author names.
f = open('gzccNews.txt', 'r', encoding='utf-8')
text = f.read()
f.close()

# Register the author names as whole words so jieba does not split them.
jieba.add_word('归〇')
jieba.add_word('新智造')
jieba.add_word('刘芳平')
jieba.add_word('吕倩')
jieba.add_word('木子')
jieba.add_word('李诗')
jieba.add_word('Jennings_Zhu')
jieba.add_word('包永刚')
jieba.add_word('王金许')
jieba.add_word('李赓')
jieba.add_word('Dude')
jieba.add_word('温晓桦')
jieba.add_word('李雨晨')
jieba.add_word('思颖')
jieba.add_word('李智勇')
jieba.add_word('咲甜')
jieba.add_word('陈伊莉')
jieba.add_word('彭赛琼')
jieba.add_word('camel')
jieba.add_word('赵青晖')
jieba.add_word('Alter')
jieba.add_word('聊IT')
jieba.add_word('大公司日报')
jieba.add_word('又田')
jieba.add_word('跃斌')
jieba.add_word('奕欣')
jieba.add_word('张驰')


# Punctuation to strip, and single-character fragments / partial names to ignore.
punctuation = ''',。‘’“”:;()!?、 '''
a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
for i in punctuation:
    text = text.replace(i, '')

# Segment the cleaned text, drop the fragments listed in `a`,
# then count how often each remaining token occurs in the text.
tempwords = list(jieba.cut(text))

count = {}
words = list(set(tempwords) - a)

for w in words:
    count[w] = text.count(w)

countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)

# Save the 20 most frequent names to b.txt.
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
f.close()
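A side note on the counting step: text.count() matches substrings, so a short name contained in a longer one (for example 木子 inside 木子李) can be over-counted. Counting the segmented tokens directly with collections.Counter would avoid this; the following is only a sketch of that alternative, not what the script above does:

from collections import Counter

# Count the segmented tokens themselves instead of substring occurrences.
token_counts = Counter(w for w in tempwords if w not in a)
for name, freq in token_counts.most_common(20):
    print(name, freq)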

Problems encountered and solutions

1. Crawling the data was not particularly hard; the only issue was that site notices sometimes showed up in the news list. The fix was to collect all the similar elements into one list and then remove the elements containing those notices; a sketch of that filtering is shown below.
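A minimal sketch of that workaround, reusing getListPage from step 2 and assuming (this is an assumption, not verified against the site) that notice entries simply lack the .img block with two <a> tags that normal news items have:

def getNewsUrl(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    allnewsList = []
    for news in soup.select('.list')[0].select('li'):
        links = news.select('.img a')      # normal news items have two <a> tags here
        if len(links) < 2:                 # assumed marker of a notice entry: skip it
            continue
        allnewsList.append(getListPage(links[1].attrs['href']))
    return allnewsList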

2. Because I was not very familiar with how to install the word cloud library, and the guides found online were all over the place, in this assignment I again relied on jieba to count out the 20 authors who published the most articles. A rough sketch of how the word cloud could have been generated is given below for reference.
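Assuming the wordcloud package could be installed (pip install wordcloud) and a font that can render Chinese characters were available (simhei.ttf here is only a placeholder path), a minimal sketch for turning the frequency dict from step 6 into a word cloud might look like this:

from wordcloud import WordCloud

# `count` is the {author: frequency} dict built in step 6;
# font_path must point to a font that supports Chinese characters.
wc = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white')
wc.generate_from_frequencies(count)
wc.to_file('authors.png')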

Conclusion:

Analyzing the counting results shows that 木子 published the most articles, so we can conclude that this author probably has many distinctive insights into the industry, and reading his articles is a good way to get to know those insights.

All the crawled data, plus the crawler and data-analysis source code:

new.py

import requests
import re
from datetime import datetime
from bs4 import BeautifulSoup
import openpyxl
import pandas




# getNewsUrl('https://www.leiphone.com/category/sponsor/page/1')

# Append a piece of text (here, the author name) to gzccNews.txt.
def writeNewsContent(title):
    with open('gzccNews.txt', 'a', encoding='utf-8') as f:
        f.write(title)
# Get news details: parse one detail page and return its fields;
# the author name is also appended to the txt file for later counting.
def getListPage(NewUrl):
    res = requests.get(NewUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    article = soup.select('.article-template')[0]
    newList = []

    new = {}
    new['题目'] = article.select('h1')[0].text.strip()
    new['作者'] = article.select('a')[0].text            # the first <a> in the template is the author
    new['时间'] = article.select('.time')[0].text.strip()
    new['导语'] = article.select('.article-lead')[0].text.strip()
    a = article.select('p')
    # print(new)
    newList.append(new)
    # Body paragraphs (the last two <p> tags are not part of the article body).
    content = ' '.join(p.text for p in a[:-2])

    title = article.select('a')[0].text                  # author name again
    writeNewsContent(title)                              # append it to the txt file

    return newList

# Get detail-page URLs: collect the URL of every news item on one list page
# and fetch each item's details via getListPage().
def getNewsUrl(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    items = soup.select('.list')[0].select('li')
    allnewsList = []
    for news in items:
        NewUrl = news.select('.img')[0].select('a')[1].attrs['href']
        newsList = getListPage(NewUrl)
        print(newsList)
        allnewsList.append(newsList)
    return allnewsList

# Get the total page count from the pagination bar of the category page.
def getPageN():
    res = requests.get('https://www.leiphone.com/category/sponsor')
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'html.parser')
    n = int(soup.select('.pages')[0].select('a')[4].text)
    return n

n = getPageN()

# Walk every list page and collect all news items.
allnewList = []
for i in range(1, n + 1):
    newsurl = 'https://www.leiphone.com/category/sponsor/page/{}'.format(i)
    print(newsurl)
    newsList = getNewsUrl(newsurl)
    allnewList.append(newsList)

jieba.py (note: saving this script as jieba.py may make `import jieba` pick up the script itself instead of the jieba package, so a different file name such as analysis.py is safer)

import jieba

# Read the crawled author names (here from a.txt; the crawler above writes gzccNews.txt).
f = open('a.txt', 'r', encoding='utf-8')
text = f.read()
f.close()

# Register the author names as whole words so jieba does not split them.
jieba.add_word('归〇')
jieba.add_word('新智造')
jieba.add_word('刘芳平')
jieba.add_word('吕倩')
jieba.add_word('木子')
jieba.add_word('李诗')
jieba.add_word('Jennings_Zhu')
jieba.add_word('包永刚')
jieba.add_word('王金许')
jieba.add_word('李赓')
jieba.add_word('Dude')
jieba.add_word('温晓桦')
jieba.add_word('李雨晨')
jieba.add_word('思颖')
jieba.add_word('李智勇')
jieba.add_word('咲甜')
jieba.add_word('陈伊莉')
jieba.add_word('彭赛琼')
jieba.add_word('camel')
jieba.add_word('赵青晖')
jieba.add_word('Alter')
jieba.add_word('聊IT')
jieba.add_word('大公司日报')
jieba.add_word('又田')
jieba.add_word('跃斌')
jieba.add_word('奕欣')
jieba.add_word('张驰')


# Punctuation to strip, and single-character fragments / partial names to ignore.
punctuation = ''',。‘’“”:;()!?、 '''
a = {'\n', '子', '张', '秀琴', '李秀', '归', '〇', '亚', '金', '峰', '亮', '恒', '赓', '程', '弢', '木子李', '三', '大'}
for i in punctuation:
    text = text.replace(i, '')

# Segment the cleaned text, drop the fragments listed in `a`,
# then count how often each remaining token occurs in the text.
tempwords = list(jieba.cut(text))

count = {}
words = list(set(tempwords) - a)

for w in words:
    count[w] = text.count(w)

countList = list(count.items())
countList.sort(key=lambda x: x[1], reverse=True)
print(countList)

# Save the 20 most frequent names to b.txt.
f = open('b.txt', 'a', encoding='utf-8')
for i in range(20):
    f.write(countList[i][0] + ':' + str(countList[i][1]) + '\n')
f.close()

  

Original post: https://www.cnblogs.com/cktcom/p/8974768.html