A Complete Course Project

1. Pick a topic of personal interest.

  Crawl the current-affairs page of Sohu News and work out what today's news keywords are.

2. Analyzing the Sohu page

  The URL of the Sohu listing page is http://www.sohu.com/c/8/1460.html. We need the link of every news item on that page, so we filter the a tags with a regular expression and pull out each article's href.

Collecting each article's link:

# Fetch the first page of the Sohu News current-affairs section and
# pick out the link of every article on it (imports are listed in section 7)
html = requests.get('http://www.sohu.com/c/8/1460.html')
html.encoding = 'utf-8'
soup = BeautifulSoup(html.text, 'html.parser')
news_list = soup.find_all('a', href=re.compile(r'//www.sohu.com/'))
news_urls = set()                       # a set removes duplicate links
for url in news_list:
    news_url = url['href']
    news_urls.add(news_url)
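The href values matched here are protocol-relative (they begin with //), which is why the download loop in the next section prepends 'http:'. A more general normalization, sketched below with urllib.parse.urljoin (not part of the original script), would also tolerate absolute and relative links; if this variant is used, the later loop should call requests.get(url) directly:

from urllib.parse import urljoin

# Hypothetical alternative: resolve every href against the listing page URL.
# Protocol-relative links such as //www.sohu.com/a/... become full http URLs.
base = 'http://www.sohu.com/c/8/1460.html'
news_urls = {urljoin(base, a['href']) for a in news_list if a.has_attr('href')}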

3. Visiting each captured link and downloading the page

  On each article page, the title sits in an h1 inside a div with class text-title, and the article body sits in an article tag with class article. The downloaded text of every article is appended to the string news.

news=""

for url in news_urls:
    html = requests.get('http:' + url)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text,'html.parser')
    title=soup.find('div',class_='text-title').find('h1')
    content=soup.find('article',class_='article')
    news = news + content.get_text()
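A note on robustness (not in the original script): requests.get can hang or fail on an individual article, and a page with a different layout makes find return None, which crashes the .find('h1') or .get_text() call. A hedged variant of the loop, with an assumed 10-second timeout:

for url in news_urls:
    try:
        html = requests.get('http:' + url, timeout=10)   # assumed timeout value
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'html.parser')
        content = soup.find('article', class_='article')
        if content is not None:       # skip pages that don't match the expected layout
            news = news + content.get_text()
    except requests.RequestException:
        continue                      # skip articles that fail to download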

4. Filtering and analyzing the collected news text

# Strip ASCII characters (letters, digits, most punctuation); note that
# range(32, 125) stops short of '}' and '~', which is why '}' reappears below
ex = [chr(x) for x in range(32, 125)]
for ch in ex:
    news = news.replace(ch, '')
# Strip full-width Chinese punctuation; most characters in this list were
# garbled in the original post, so common Chinese punctuation is assumed here
exchi = [',', '。', '、', ':', ';', '?', '!', '“', '”', '‘', '’', '《', '》', '(', ')', '【', '】', '{', '}']
for ch in exchi:
    news = news.replace(ch, '')
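An equivalent, more compact cleanup (an alternative sketch, not the original approach) keeps only Chinese characters with a single regex substitution over the basic CJK block:

import re

# Hypothetical one-liner: drop every character that is not a CJK ideograph
news = re.sub(r'[^\u4e00-\u9fa5]', '', news)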

print(news,"






")
# Word-frequency analysis of the news text

ls = []                      # all words kept after segmentation
words = jieba.lcut(news)     # jieba segments the Chinese text into words
counts = {}
for word in words:
    if len(word) == 1:       # skip single-character tokens
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
        ls.append(word)

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending
for i in range(10):
    word, count = items[i]
    print("{:<10}{:>5}".format(word, count))

# Stopword list: whitespace, layout characters, and leftover punctuation;
# several entries were garbled in the original post, so Chinese punctuation
# is assumed for them ('\u3000' is the full-width space)
ex = ['\n', '\u3000', ' ', '{', '}', '原标题',
      ',', '。', '、', ':', ';', '?', '!', '“', '”', '《', '》']
# Rebuild the list instead of removing while iterating, which skips elements
ls = [ke for ke in ls if ke not in ex]
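Note that this stopword pass runs only after the top-10 table was already printed, so entries like 原标题 can still show up there. A small reordering (not in the original) filters before counting:

# Hypothetical reordering: drop stopwords and single characters before counting,
# so they never reach the frequency table or the word cloud
ls = [w for w in jieba.lcut(news) if len(w) > 1 and w not in ex]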

5. Generating the word cloud

wz = open('ms.txt', 'w+', encoding='utf-8')    # persist the word list to disk
wz.write(str(ls))
wz.close()

wz = open('ms.txt', 'r', encoding='utf-8').read()
background_image = plt.imread('apple.jpg')
wc = WordCloud(background_color='white',    # background color
               mask=background_image,       # shape mask taken from the image
               max_words=2000,              # maximum number of words displayed
               stopwords=STOPWORDS,         # built-in stopword set
               font_path='C:/Users/Windows/fonts/msyh.ttf',    # a Chinese font is required, or CJK text will not render
               max_font_size=200,           # largest font size
               random_state=30,             # number of random color schemes
               )

wc.generate(wz)
image_colors = ImageColorGenerator(background_image)
wc.recolor(color_func = image_colors)
plt.imshow(wc)
plt.axis('off')
plt.show()
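plt.show() only displays the figure. To keep the image on disk, the wordcloud library can write it out directly (to_file is part of the WordCloud API; the filename below is an assumption):

wc.to_file('wordcloud.png')    # hypothetical output filename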

6. Output

  The word-frequency results produced by jieba.

  The rendered word cloud.

7. Full program source

import re
import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

# Fetch the first page of the Sohu News current-affairs section and
# pick out the link of every article on it
html = requests.get('http://www.sohu.com/c/8/1460.html')
html.encoding = 'utf-8'
soup = BeautifulSoup(html.text, 'html.parser')
news_list = soup.find_all('a', href=re.compile(r'//www.sohu.com/'))
news_urls = set()                       # a set removes duplicate links
for url in news_list:
    news_url = url['href']
    news_urls.add(news_url)

# Visit each captured link in turn and download the article text
news = ""    # accumulated text of all downloaded articles

for url in news_urls:
    html = requests.get('http:' + url)    # hrefs are protocol-relative, so prepend the scheme
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'html.parser')
    title = soup.find('div', class_='text-title').find('h1')    # extracted but not used below
    content = soup.find('article', class_='article')
    news = news + content.get_text()

# Strip ASCII characters (letters, digits, most punctuation); note that
# range(32, 125) stops short of '}' and '~', which is why '}' reappears below
ex = [chr(x) for x in range(32, 125)]
for ch in ex:
    news = news.replace(ch, '')
# Strip full-width Chinese punctuation; most characters in this list were
# garbled in the original post, so common Chinese punctuation is assumed here
exchi = [',', '。', '、', ':', ';', '?', '!', '“', '”', '‘', '’', '《', '》', '(', ')', '【', '】', '{', '}']
for ch in exchi:
    news = news.replace(ch, '')

# Word-frequency analysis of the news text

ls = []                      # all words kept after segmentation
words = jieba.lcut(news)     # jieba segments the Chinese text into words
counts = {}
for word in words:
    if len(word) == 1:       # skip single-character tokens
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
        ls.append(word)

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending
for i in range(10):
    word, count = items[i]
    print("{:<10}{:>5}".format(word, count))

# Stopword list: whitespace, layout characters, and leftover punctuation;
# several entries were garbled in the original post, so Chinese punctuation
# is assumed for them ('\u3000' is the full-width space)
ex = ['\n', '\u3000', ' ', '{', '}', '原标题',
      ',', '。', '、', ':', ';', '?', '!', '“', '”', '《', '》']
# Rebuild the list instead of removing while iterating, which skips elements
ls = [ke for ke in ls if ke not in ex]


# Generate the word cloud
wz = open('ms.txt', 'w+', encoding='utf-8')    # persist the word list to disk
wz.write(str(ls))
wz.close()

wz = open('ms.txt', 'r', encoding='utf-8').read()

background_image = plt.imread('apple.jpg')
wc = WordCloud(background_color='white',    # background color
               mask=background_image,       # shape mask taken from the image
               max_words=2000,              # maximum number of words displayed
               stopwords=STOPWORDS,         # built-in stopword set
               font_path='C:/Users/Windows/fonts/msyh.ttf',    # a Chinese font is required, or CJK text will not render
               max_font_size=200,           # largest font size
               random_state=30,             # number of random color schemes
               )

wc.generate(wz)
image_colors = ImageColorGenerator(background_image)
wc.recolor(color_func = image_colors)
plt.imshow(wc)
plt.axis('off')
plt.show()
Original post: https://www.cnblogs.com/xypbk/p/7768200.html