热词云

我的队友是徐姣美,这是她的博客:https://home.cnblogs.com/u/xjmm/

首先从下面这个网站上爬取相关信息:https://blog.csdn.net/u014636245/article/details/91426736

爬虫我是用 Python 写的,下面是我的代码:

  1 import requests
  2 from bs4 import BeautifulSoup
  3 import bs4
  4 # -*- coding: UTF-8 -*
  5 from urllib.request import urlopen
  6 from pdfminer.pdfinterp import PDFResourceManager, process_pdf
  7 from pdfminer.converter import TextConverter
  8 from pdfminer.layout import LAParams
  9 from io import StringIO
 10 from pyhanlp import *
 11 import time
 12 
 13 import requests
 14 import json
 15 from pymysql import *
 16 
 17 #连接数据库的方法
 18 def connectDB():
 19     try:
 20         db=connect(host='localhost',port=3306,user='root',password='123456',db='python')
 21         print("数据库连接成功")
 22         return db
 23     except Exception as e:
 24         print(e)
 25     return NULL
 26 
  27 db = connectDB()  # shared module-level connection used by insertInformation()
 28 
 29 #向数据库中插入数据的方法
 30 def insertInformation(title,abstract,keywords,href):
 31     cursor=db.cursor()
 32     try:
 33         cursor.execute("insert into new_table(title,abstract,keywords,href) values('%s','%s','%s','%s')" % (title,abstract,keywords,href))
 34         print("插入成功")
 35         db.commit()
 36         cursor.close()
 37         return True
 38     except Exception as e:
 39         print(e)
 40         db.rollback()
 41     return False
 42 
  43 list_href=[]  # href of every <a> collected by getDataFromHtml()
  44 list_title=[]  # text of the same <a> elements, index-aligned with list_href
 45 
 46 def getHtmlText(url):
 47     r = requests.get(url)
 48     r.raise_for_status()
 49     r.encoding = r.apparent_encoding
 50     html = r.text
 51     return html
 52 
 53 
 54 
 55 def getDataFromHtml(list,html):
 56     bs = BeautifulSoup(html, "lxml")
 57     for td in bs.tbody.find_all("td"):
 58         if isinstance(td,bs4.element.Tag):
 59             for a in td.find_all("a"):
 60                 list_href.append(a['href'])
 61                 list_title.append(a.text)
 62 
 63 def showAll(list):
 64     for univ in list:
 65         print(univ)
 66 
 67 
 68 def readPDF(pdfFile):
 69     rsrcmgr = PDFResourceManager()
 70     retstr = StringIO()
 71     laparams = LAParams()
 72     device = TextConverter(rsrcmgr, retstr, laparams=laparams)
 73     process_pdf(rsrcmgr, device, pdfFile)
 74     device.close()
 75     content = retstr.getvalue()
 76     retstr.close()
 77     return content
 78 
 79 if __name__ == '__main__':
 80     url = "https://blog.csdn.net/u014636245/article/details/91426736"
 81     try:
 82         html = getHtmlText(url)
 83         getDataFromHtml(list,html)
 84         for i in range(0,len(list_title)):
 85             print(i)
 86             pdfFile = urlopen(list_href[i])
 87             # 远程
 88             outputString = readPDF(pdfFile)
 89             if "Abstract" in outputString:
 90                 document = ""
 91                 if "1. Introduction" in outputString and "Abstract" in outputString:
 92                     document = outputString[outputString.index("Abstract"):outputString.index("1. Introduction")]
 93                 elif "1.Introduction" in outputString and "Abstract" in outputString:
 94                     document = outputString[outputString.index("Abstract"):outputString.index("1.Introduction")]
 95                 else :
 96                     document = outputString[outputString.index("Abstract"):outputString.index("Abstract")+800]
 97                 # print(document)
 98                 keywords = HanLP.extractKeyword(document, 10)
 99                 print(keywords)
100                 str = ""
101                 for k in keywords:
102                     str+=k+" "
103                 pdfFile.close()
104                 insertInformation(list_title[i],document,str,list_href[i])
105             time.sleep(0.1)
106     except Exception as e:
107         print(e)
108         print("爬取失败")
py

然后爬取结束后是这个样子

爬取到的记录有很多条,每条记录的 keywords 字段里有 10 个关键词;

然后将它们从数据库中取出来放在数组中,再进行排序,找出出现次数最多的词;

不要忘记将介词等无用词去掉;

排序最简单的办法是使用 Map:

// Sort the map entries by their Integer value.
List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
// Collections.sort needs a Comparator to order the entries by value.
Comparator<Map.Entry<String, Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() {
    public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
        return left.getValue().compareTo(right.getValue());
    }
};
// Ascending order: the largest values end up at the tail of the list.
Collections.sort(list, comparator);
String ten[] = new String[50];
int shu[] = new int[50];
// Never read past the start of the list when there are fewer than 50 entries.
int top = Math.min(50, list.size());
for (int i = 0; i < top; i++) {
    // Walk from the tail so entries are emitted from highest to lowest value.
    Map.Entry<String, Integer> entry = list.get(list.size() - i - 1);
    ten[i] = entry.getKey();
    shu[i] = entry.getValue();

    Tu tu = new Tu();
    tu.name = ten[i];
    tu.value = shu[i];
    list_tu.add(tu);
    System.out.println(entry.getKey() + ":" + entry.getValue());
}

然后设置一个点击事件,把结果转换成 JSON 格式返回:

// Serialize list_tu to JSON and send it as the response body.
Gson gson = new Gson();
String json = gson.toJson(list_tu);
// Set the content type and charset before getWriter() so non-ASCII
// characters are not written in the container's default encoding.
// NOTE(review): assumes `response` is the servlet's HttpServletResponse.
response.setContentType("application/json;charset=UTF-8");
response.getWriter().write(json);

然后使用echarts设计热词云

 1 <%@ page language="java" contentType="text/html; charset=UTF-8"
 2     pageEncoding="UTF-8"%>
 3 <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
 4 <!DOCTYPE html>
 5 <html>
 6 <head>
 7 <meta charset="UTF-8">
 8 <title>Insert title here</title>
 9 <link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
10 <script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
11 <script type="text/javascript" src="js/echarts.min.js"></script>
12 <script type="text/javascript" src="js/china.js"></script>
13 <script src="js/bootstrap.min.js" type="text/javascript"></script>
14 <script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
15 <script src='js/echarts-wordcloud.js'></script>
16 </head>
17 <body>
18 <div id="main" style=" 100%;height: 400px"></div>
19 <div>
20   <table class="table" style=" 100%;align-content: center;" >
21     <tr>
22       <th align="center">论文连接</th>
23     </tr>
24     <c:forEach var="item" items="${list}">
25       <tr>
26         <td><a href="${item.lianjie }">${item.title}</a></td>
27       </tr>
28     </c:forEach>
29   </table>
30 </div>
31 <script>
32   var chart = echarts.init(document.getElementById('main'));
33   var dt;
34   $.ajax({
35     url : "PaperServlet_",
36     async : false,
37     type : "POST",
38     success : function(data) {
39       dt = data;
40      // alert(dt[0].title);
41     },
42     error : function() {
43       alert("请求失败");
44     },
45     dataType : "json"
46   });
47   var mydata = new Array(0);
48   for (var i = 0; i < dt.length; i++) {
49       var d = {};
50       
51       d["name"] = dt[i].name;
52       //alert(dt[i].name);
53       d["value"] = dt[i].value;
54       mydata.push(d);
55   }
56   var option = {
57     tooltip: {},
58     series: [ {
59       type: 'wordCloud',
60       gridSize: 2,
61       sizeRange: [20, 50],
62       rotationRange: [-90, 90],
63       shape: 'pentagon',
64        600,
65       height: 300,
66       drawOutOfBound: true,
67       textStyle: {
68         normal: {
69           color: function () {
70             return 'rgb(' + [
71               Math.round(Math.random() * 160),
72               Math.round(Math.random() * 160),
73               Math.round(Math.random() * 160)
74             ].join(',') + ')';
75           }
76         },
77         emphasis: {
78           shadowBlur: 10,
79           shadowColor: '#333'
80         }
81       },
82       data: mydata
83     } ]
84   };
85 
86   chart.setOption(option);
87   chart.on('click', function (params) {
88       var url = "ClickServlet?geunjian=" + params.name;
89       window.location.href = url;
90     });
91   window.onresize = chart.resize;
92 </script>
93 </body>
94 </html>
View Code

然后点击热词后携带此热词到servlet,再从数据库中找出论文的关键字中包含此热词的论文列表

 1 import java.io.IOException;
 2 import java.sql.SQLException;
 3 import java.util.ArrayList;
 4 import java.util.List;
 5 
 6 import javax.servlet.ServletException;
 7 import javax.servlet.annotation.WebServlet;
 8 import javax.servlet.http.HttpServlet;
 9 import javax.servlet.http.HttpServletRequest;
10 import javax.servlet.http.HttpServletResponse;
11 
12 import com.me.dao.LWDao;
13 import com.me.domain.LunWen;
14 
15 /**
16  * Servlet implementation class ClickServlet
17  */
18 @WebServlet("/ClickServlet")
19 public class ClickServlet extends HttpServlet {
20     private static final long serialVersionUID = 1L;
21     LWDao dao = new LWDao();
22     
23     public ClickServlet() {
24         super();
25         // TODO Auto-generated constructor stub
26     }
27 
28     protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
29         String geunjian = request.getParameter("geunjian");
30         System.out.println(geunjian);
31         List<LunWen> guan = new ArrayList<LunWen>();
32         try {
33             guan = dao.login(geunjian);
34         } catch (SQLException e) {
35             e.printStackTrace();
36         }
37         for(int i=0;i<guan.size();i++) {
38             if(guan.get(i).getLianjie()!=null) {
39                 String ss = guan.get(i).getLianjie().substring(6,guan.get(i).getLianjie().length());
40                 guan.get(i).setLianjie("http://openaccess.thecvf.com/"+ss);
41             }
42             
43         }
44         request.setAttribute("list", guan);
45         System.out.println(guan.size());
46         request.getRequestDispatcher("lw.jsp").forward(request, response);
47     }
48 
49     /**
50      * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
51      */
52     protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
53         // TODO Auto-generated method stub
54         doGet(request, response);
55     }
56 
57 }
View Code

然后嘞,运行一下就可以了

大佬的博客写的非常的详细可以参考一哈  https://www.cnblogs.com/20183544-wangzhengshuai/p/12702137.html

原文地址:https://www.cnblogs.com/1234yyf/p/12715824.html