信息化领域热词分类分析及解释

一 项目名称:

信息化领域热词分类分析及解释

二 功能设计:

1) 数据采集:要求定期自动从网络中爬取信息领域的相关热词;
2) 数据清洗:对热词信息进行数据清洗,并采用自动分类技术生成信息领域热词目录;
3) 热词解释:针对每个热词名词自动添加中文解释(参照百度百科或维基百科);
4) 热词引用:对近期引用热词的文章或新闻进行标记,生成超链接目录,用户可以点击访问;
5) 数据可视化展示:
① 用字符云或热词图进行可视化展示;
② 用关系图标识热词之间的紧密程度。
6) 数据报告:可将所有热词目录和名词解释以 WORD 版报告形式导出。

三 项目源码:

python:

热点新闻爬取

import requests
import re
import xlwt
# Recommended-news listing page on cnblogs. The crawl loop further down
# rebinds `url` to each paginated variant of this address.
url = 'https://news.cnblogs.com/n/recommend'
# Request headers shared by every fetch; only a desktop-Chrome User-Agent is set.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
def get_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request with the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded page text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a request error).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(e)
        return None

    if response.status_code != 200:
        print('获取网页失败')
        return None

    print('获取网页成功')
    print(response.encoding)
    return response.text
# Crawl the paginated news list and store every headline in column 0 of an
# .xls sheet (row 0 holds the column title).
f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, '博客最热新闻')  # header cell: first row, first column

# NOTE(review): pages are requested as page=0..99; whether page=0 duplicates
# page=1 on the server side cannot be told from here -- confirm.
urls = ['https://news.cnblogs.com/n/recommend?page={}'.format(i) for i in range(100)]

# Compiled once instead of on every page; captures the headline link text.
title_pattern = re.compile(
    '<h2 class="news_entry">.*?<a href=".*?" target="_blank">(.*?)</a>', re.S)

temp = 0  # number of headlines written so far (row offset below the header)
num = 0   # number of pages processed so far
for url in urls:
    print(url)
    page = get_page(url)
    # get_page yields None when the request fails; treat that page as empty
    # instead of letting the regex raise TypeError and abort the whole crawl.
    items = title_pattern.findall(page) if page is not None else []
    print(len(items))
    print(items)
    for offset, title in enumerate(items, start=1):
        sheet01.write(temp + offset, 0, title)
    temp += len(items)
    num += 1
    print("已打印完第"+str(num)+"")
print("打印完!!!")
f.save('Hotword.xls')

热词拆分:

import jieba
import pandas as pd
import re
import mysql.connector
from collections import Counter
if __name__ == '__main__':
    # Segment the crawled headlines with jieba, count the words and write the
    # 100 most frequent ones (and their counts) to two parallel text files.
    #
    # NOTE(review): the crawler step saves 'Hotword.xls', while this step reads
    # a GBK-encoded 'Hotword.txt' -- presumably the sheet is exported to text
    # by hand in between; confirm.
    with open("Hotword.txt", "r", encoding='GBK') as filehandle:
        mystr = filehandle.read()
    seg_list = jieba.cut(mystr)  # accurate mode is the default
    print(seg_list)
    # NOTE(review): the empty entries look like punctuation characters lost in
    # transit; they are harmless because tokens shorter than 2 characters are
    # discarded below anyway.
    stopwords = ['', '', '', '', '-', '.', '', '', '', '(', ')', '', '', '使用', '实现', '', '', '', '', '', '',
           '', '', '中国', '发布', '公司', '首次', '全球', '正式', '2019', '2020', '亿美元', '20', '10', '&#', '首个', '正在', '最大',
           '成为', '第一', '', '', '如何', '人类', '什么', '一个', '宣布', '可能', '推出', '没有', '地球', '到底', '回应', '50', '100', '可以',
           '开始', '这个', '问题', '为什么', '我们', '背后', '终于', '重磅', '160', '国内', '需要', '为何', '亿元', '发现', '成功', '最强', '不是', '人生']
    stopword_set = set(stopwords)  # O(1) membership tests inside the loop
    c = Counter()
    for x in seg_list:
        if x in stopword_set:
            continue
        # Keep only multi-character tokens; the newline check mirrors the
        # original filter.
        if len(x) > 1 and x != '\n':
            c[x] += 1

    print('\n词频统计结果:')
    # Both files are written line-for-line in parallel: line N of
    # finalnumber.txt is the count of the word on line N of finalwords.txt.
    with open("finalwords.txt", 'w', encoding='utf-8') as f, \
            open("finalnumber.txt", 'w', encoding='utf-8') as f2:
        for (k, v) in c.most_common(100):  # the 100 most frequent words
            print("%s:%d" % (k, v))
            f.write(k + '\n')
            f2.write(str(v) + '\n')

解释关联及导出

import requests
import re
import xlwt
import linecache
import mysql.connector
# Baidu Baike home page. The per-word item URLs are built separately further
# down, and the lookup loop rebinds `url` to each of them.
url = 'https://baike.baidu.com/'
# Request headers shared by every fetch; only a desktop-Chrome User-Agent is set.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
# NOTE(review): the database host, user and password are hard-coded here;
# consider loading them from the environment or a config file. The connection
# is opened as soon as this line executes.
mydb = mysql.connector.connect(host='localhost', user='root', password='123456', database='python',charset='utf8')
mycursor = mydb.cursor()
# Collects one (words, num, message, url) tuple per hot word that has an
# explanation; consumed by the bulk insert at the end of the script.
lst=[]
def get_page(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request with the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole run.

    Returns:
        The page text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or a request error).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort lookup: report the failure and let the caller move on.
        print(e)
        return None

    # Force UTF-8 rather than relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print('获取网页失败')
        return None

    print('获取网页成功')
    return response.text
# Look up every hot word on Baidu Baike, keep the page's meta description as
# its explanation, and bulk-insert the results into the `hotwords` table.

# finalwords.txt and finalnumber.txt are parallel files: line N of the second
# is the frequency of the word on line N of the first.
with open('finalwords.txt', 'r', encoding='utf-8') as fopen, \
        open('finalnumber.txt', 'r', encoding='utf-8') as fopen2:
    lines = [line.strip() for line in fopen]
    counts = [line.strip() for line in fopen2]

urls = ['https://baike.baidu.com/item/{}'.format(word) for word in lines]
description_pattern = re.compile('<meta name="description" content="(.*?)">', re.S)

i = 0  # number of words for which an explanation was found
# Walk word, count and URL in lockstep. The earlier version indexed the word
# files with the success counter `i`, so after the first word without an
# explanation every later row paired the wrong word with the URL and text.
for words, num, url in zip(lines, counts, urls):
    print(url)
    page = get_page(url)
    # get_page yields None when the request fails; treat that as "no match"
    # instead of letting the regex raise TypeError.
    items = description_pattern.findall(page) if page is not None else []
    if items:
        message = items[0]
        print(message)
        lst.append((words, num, message, url))
        i += 1
    print("总爬取完毕数量:" + str(i))
print("打印完!!!")
print(lst)

tuple_lst = tuple(lst)
sql = "insert into hotwords (words,num,message,url) values (%s,%s,%s,%s)"
if tuple_lst:  # nothing to insert when no explanation was found
    mycursor.executemany(sql, tuple_lst)
    mydb.commit()

java:

RcServlet

package com.servlet;

import java.io.IOException;
import java.util.Map;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.dao.Dao;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

@WebServlet("/RcServlet")
public class RcServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public RcServlet() {
        super();
        // TODO Auto-generated constructor stub
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        this.doPost(request, response);
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");
        response.setContentType("text/html;charset=utf-8");
        Map<String, Integer>sortMap=Dao.getrc();
        JSONArray json =new JSONArray();
        int k=0;
        for (Map.Entry<String, Integer> entry : sortMap.entrySet()) 
        {
            JSONObject ob=new JSONObject();
            ob.put("name", entry.getKey());
            ob.put("value", entry.getValue());
           
                json.add(ob);
                k++;
            if(k==100)
                break;
        }
        System.out.println(json.toString());
        
        response.getWriter().write(json.toString());
    }
}

ClickServlet

package com.servlet;

import java.io.IOException;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.bean.Data;
import com.dao.Dao;

/**
 * Servlet implementation class ClickServlet
 */
@WebServlet("/ClickServlet")
public class ClickServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public ClickServlet() {
        super();
        // TODO Auto-generated constructor stub
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        doPost(request, response);
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("UTF-8");
        response.setContentType("text/html;charset=utf-8");
        String words=request.getParameter("words");
        Dao dao = new Dao();
        List<Data> list=null;
        list=dao.list(words);
        System.out.println(list);
        request.setAttribute("list",list); 
        request.getRequestDispatcher("RC.jsp").forward(request, response);
    }

}

RC.jsp

<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>

<%request.setCharacterEncoding("utf-8"); 
response.setCharacterEncoding("utf-8");%>
<!DOCTYPE html>
<html>
<head>
<%-- One charset declaration only, matching the page's UTF-8 pageEncoding.
     The earlier markup also declared ISO-8859-1 and closed <head> twice. --%>
<meta charset="UTF-8">
<title>热词云</title>
<link type="text/css" rel="stylesheet" href="css/style.css">
<script src="js/jquery-3.4.1.min.js"></script>
<script src="js/echarts.min.js"></script>
<script src="js/echarts-cloud.js"></script>
<style>
    /* Word-cloud canvas, docked on the right. */
    #main {
        width: 30%;
        height: 500px;
        border: 1px solid #ddd;
        float: right;
    }

    /* Scrollable explanation table, docked on the left. */
    #table {
        overflow-x: auto;
        overflow-y: auto;
        width: 70%;
        height: 500px;
        float: left;
        /* NOTE(review): "dp" is not a CSS unit, so browsers ignore the two
           declarations below. They are kept as-is to preserve the current
           layout; switch to px only if the extra spacing is really wanted. */
        margin-top: 100dp;
        padding-top: 100dp;
    }
</style>
</head>

<body >
<br>
<h1>热词云</h1>
<br>
<br>
<br>

<%-- Explanation list for the clicked hot word; "list" is set by ClickServlet.
     The message and URL are crawled from external sites, so both go through
     c:out (XML-escaped) instead of being written into the page verbatim. --%>
<div id="table">
  <table id='gradient-style' >
    <tr>
      <th align="center">热词简介</th>
    </tr>
    <c:forEach var="item" items="${list}">
      <tr>
        <td><a href="<c:out value="${item.url}"/>"><c:out value="${item.message}"/></a></td>
      </tr>
    </c:forEach>
  </table>
</div>


  <div id="main">
  
  </div>
  <script type="text/javascript">

    var dt;
   
            $.ajax({
                url : "RcServlet",
                async : true,
                type : "POST",
                data : {        
                },
                dataType : "json",
                contentType: 'application/x-www-form-urlencoded; charset=UTF-8',
                success : function(data) {
                    dt = data;
                    
                     var mydata = new Array(0);
                     for (var i = 0; i < dt.length; i++) {
                          var d = {};
                          
                          d["name"] = dt[i].name;
                         
                          d["value"] = dt[i].value;
                          mydata.push(d);
                      }
                     var myChart = echarts.init(document.getElementById('main'));
                     //设置点击效果
                    
                     
                     
                     myChart.setOption({
                         title: {
                             text: ''
                         },
                         tooltip: {},
                         series: [{
                             type : 'wordCloud',  //类型为字符云
                                 shape:'smooth',  //平滑
                                 gridSize : 8, //网格尺寸
                                 size : ['50%','50%'],
                                 //sizeRange : [ 50, 100 ],
                                 rotationRange : [-45, 0, 45, 90], //旋转范围
                                 textStyle : {
                                     normal : {
                                         fontFamily:'微软雅黑',
                                         color: function() {
                                             return 'rgb(' + 
                                                 Math.round(Math.random() * 255) +
                                          ', ' + Math.round(Math.random() * 255) +
                                          ', ' + Math.round(Math.random() * 255) + ')'
                                                }
                                         },
                                     emphasis : {
                                         shadowBlur : 5,  //阴影距离
                                         shadowColor : '#333'  //阴影颜色
                                     }
                                 },
                                 left: 'center',
                                 top: 'center',
                                 right: null,
                                 bottom: null,
                                 '100%',
                                 height:'100%',
                                 data:mydata
                         }]
                     });
                     
                     myChart.on('click', function (params) {
                         var url = "ClickServlet?words=" + params.name;
                         window.location.href = url;
                       });
                     
                    alert("成功!");
                   
   
                },
                error : function() {
                    alert("请求失败");
                },
           });
    
         
       


</script>
    

</body>
</html>

四 运行截图:

点击热词:

点击热词解释:

原文地址:https://www.cnblogs.com/wendi/p/13535274.html