顶会热词统计

根据CVPR论文生成热点词汇云图 

1、用 Python 爬取论文并存入数据库;

2、分析、查找关键词,并按出现次数对其排序;

3、生成热词汇云图;

 一、python爬取数据

import requests
import pymysql
from bs4 import BeautifulSoup
 
# Scrape the CVPR 2019 open-access index, fetch each paper's abstract page,
# and store title / link / abstract / keywords rows in the `lunwen` table.

# Keyword arguments only: PyMySQL 1.0+ no longer accepts a positional host.
db = pymysql.connect(host='127.0.0.1',
                     port=3306,
                     user='root',
                     password='123',
                     db='mytest',
                     charset='utf8')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
url = "http://openaccess.thecvf.com/CVPR2019.py"
# Actually send the browser-like User-Agent; it used to be defined but unused.
html = requests.get(url, headers=headers)

soup = BeautifulSoup(html.content, 'html.parser')

# Collect the <a> elements whose text is exactly "pdf"; each href is used
# below to derive the paper name.
pdfs = soup.find_all(name="a", string="pdf")

lis = []
for i, pdf in enumerate(pdfs):
    # Last path segment of the href, minus extension and the fixed suffix.
    pdf_name = pdf["href"].split('/')[-1]
    name = pdf_name.split('.')[0].replace("_CVPR_2019_paper", "")
    link = "http://openaccess.thecvf.com/content_CVPR_2019/html/" + name + "_CVPR_2019_paper.html"

    html1 = requests.get(link, headers=headers)
    soup1 = BeautifulSoup(html1.content, 'html.parser')
    weizhi = soup1.find('div', attrs={'id': 'abstract'})
    # Computed fresh for every paper so that a page without an abstract div
    # gets an empty abstract instead of silently reusing the previous one.
    jianjie = weizhi.get_text() if weizhi else ""
    print("这是第"+str(i)+"条数据")

    # The underscore-separated parts of the name are stored as a
    # comma-separated keyword string.
    keywords = ','.join(name.split('_'))

    lis.append({
        'title': name,
        'link': link,
        'abstract': jianjie,
        'keywords': keywords,
    })

cursor = db.cursor()
try:
    for info in lis:
        # Column list, e.g. '`title`, `link`, ...' (backticks keep names such
        # as `abstract` safe to use as identifiers).
        cols = ", ".join('`{}`'.format(k) for k in info.keys())
        # Named placeholders, e.g. '%(title)s, %(link)s, ...'
        val_cols = ', '.join('%({})s'.format(k) for k in info.keys())
        res_sql = "insert into lunwen(%s) values(%s)" % (cols, val_cols)

        # The dict is passed as the parameter mapping so the driver does the
        # value escaping.
        cursor.execute(res_sql, info)
        db.commit()
        print("成功")
finally:
    # Release the cursor and the connection even if an insert fails.
    cursor.close()
    db.close()

 二、分析、查找关键词

 借助Map存储关键词, key为关键词,value为出现的次数。遍历到相同的关键词value值+1,然后根据value值排序。

dao层:

package dao;
 
 
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.Collectors;
import  Bean.copy.*;
import jdbc.Util;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
 
import com.sun.xml.internal.ws.policy.privateutil.PolicyUtils.Collections;
 
public class Dao {
    public static Map<String,Integer> getrc()
    {
        String sql="select * from lunwen";
        Map<String, Integer>map= new HashMap<String, Integer>();
        Map<String, Integer>results= new LinkedHashMap<String, Integer>();
        Connection con=null;
        Statement state=null;
        ResultSet rs=null;
        con=Util.getConn();
        try {
            state=con.createStatement();
            rs=state.executeQuery(sql);
            while(rs.next())
            {
                String keywords=rs.getString("keywords");
                String[] split = keywords.split(",");
                for(int i=0;i<split.length;i++)
                {
                    if(map.get(split[i])==null)
                    {
                        map.put(split[i],0);
                    }
                    else
                    {
                        map.replace(split[i], map.get(split[i])+1);
                    }
                }
            }
        } catch (SQLException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        Util.close(rs, state, con);
        map.entrySet()                
        .stream()               
        .sorted((p1, p2) -> p2.getValue().compareTo(p1.getValue()))                
        .collect(Collectors.toList())
        .forEach(ele -> results.put(ele.getKey(), ele.getValue()));
 
       
        return results;
    }
    
    
    public List<Data> list(String keywords) { // 查询所有信息
 
 
        List<Data> list = new ArrayList<Data>(); // 创建集合
        Connection conn = Util.getConn();
        String sql = "select * from lunwen where keywords like "+"'%"+keywords+"%'"; // SQL查询语句
 
        try {
 
            PreparedStatement pst = conn.prepareStatement(sql);
 
            ResultSet rs = pst.executeQuery();
            
            Data data = null;
            
            while (rs.next()) {
 
 
                String title = rs.getString("title");
                
                String link = rs.getString("link");
                
                String as= rs.getString("abstract");
                
 
                
 
                    data = new Data(title,link,as,keywords);
                
                list.add(data);
 
            }
 
            rs.close(); // 关闭
 
            pst.close(); // 关闭
 
        } catch (SQLException e1) {
 
            e1.printStackTrace(); // 抛出异常
 
        }
 
        return list; // 返回一个集合
 
    }
 
 
 
 
}

  servlet层:

package servlet;
 
import java.io.IOException;
import java.util.Map;
 
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
 
import dao.Dao;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
 
@WebServlet("/RcServlet")
public class RcServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public RcServlet() {
        super();
        // TODO Auto-generated constructor stub
    }
 
    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
 
        this.doPost(request, response);
    }
 
    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");
        Map<String, Integer>sortMap=Dao.getrc();
        JSONArray json =new JSONArray();
        int k=0;
        for (Map.Entry<String, Integer> entry : sortMap.entrySet()) 
        {
            JSONObject ob=new JSONObject();
            ob.put("name", entry.getKey());
            ob.put("value", entry.getValue());
            if(!(entry.getKey().equals("for")||entry.getKey().equals("and")||entry.getKey().equals("With")||entry.getKey().equals("of")||entry.getKey().equals("in")||entry.getKey().equals("From")||entry.getKey().equals("A")||entry.getKey().equals("to")||entry.getKey().equals("a")||entry.getKey().equals("the")||entry.getKey().equals("by")))
            {
                json.add(ob);
                k++;
            }
            if(k==10)
                break;
        }
        System.out.println(json.toString());
        
        response.getWriter().write(json.toString());
    
    }
 
}

  三、生成热词汇云图

<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>

<%request.setCharacterEncoding("utf-8"); 
response.setCharacterEncoding("utf-8");%>
<%--
  Hot-word cloud page.
  Left pane: a table of paper links rendered from the "list" attribute.
  Right pane: a word cloud drawn from the JSON returned by RcServlet;
  clicking a word navigates to ClickServlet with that word as "keywords".
--%>
<!DOCTYPE html>
<html>
<head>
<%-- Single charset declaration, matching the page encoding. --%>
<meta charset="UTF-8">
<title>热词云</title>
<link type="text/css" rel="stylesheet" href="css/style.css">
<script src="js/jquery-3.4.1.min.js"></script>
<script src="js/echarts.min.js"></script>
<script src="js/echarts-cloud.js"></script>
<style>
    /* Word-cloud container on the right. */
    #main {
        width: 30%;
        height: 500px;
        border: 1px solid #ddd;
        float: right;
    }

    /* Scrollable paper list on the left. */
    #table {
        overflow-x: auto;
        overflow-y: auto;
        width: 70%;
        height: 500px;
        float: left;
        /* NOTE(review): "dp" is not a CSS unit, so browsers ignore the two
           declarations below; switch to px if the spacing is wanted. */
        margin-top:100dp;
        padding-top:100dp;
    }
</style>
</head>

<body >
<br>
<h1>热词云</h1>
<br>
<br>
<br>

<div id="table">
  <table id='gradient-style' >
    <tr>
      <th align="center">论文连接</th>
    </tr>
    <c:forEach var="item" items="${list}">
      <tr>
        <%-- c:out escapes the scraped values before they reach the markup. --%>
        <td><a href="<c:out value='${item.link}'/>"><c:out value='${item.title}'/></a></td>
      </tr>
    </c:forEach>
  </table>
</div>


  <div id="main">

  </div>
  <script type="text/javascript">

    $.ajax({
        url : "RcServlet",
        async : true,
        type : "POST",
        data : {},
        dataType : "json",
        success : function(data) {
            // Copy only the fields the chart needs.
            var mydata = [];
            for (var i = 0; i < data.length; i++) {
                mydata.push({
                    name : data[i].name,
                    value : data[i].value
                });
            }

            var myChart = echarts.init(document.getElementById('main'));

            myChart.setOption({
                title : {
                    text : ''
                },
                tooltip : {},
                series : [{
                    type : 'wordCloud',  // word-cloud series
                    shape : 'smooth',  // smooth
                    gridSize : 8,  // grid size
                    size : ['50%', '50%'],
                    //sizeRange : [ 50, 100 ],
                    rotationRange : [-45, 0, 45, 90],  // rotation range
                    textStyle : {
                        normal : {
                            fontFamily : '微软雅黑',
                            // Random RGB colour for each word.
                            color : function() {
                                return 'rgb(' +
                                    Math.round(Math.random() * 255) +
                                    ', ' + Math.round(Math.random() * 255) +
                                    ', ' + Math.round(Math.random() * 255) + ')'
                            }
                        },
                        emphasis : {
                            shadowBlur : 5,  // shadow blur
                            shadowColor : '#333'  // shadow colour
                        }
                    },
                    left : 'center',
                    top : 'center',
                    right : null,
                    bottom : null,
                    // The "width" key was missing here, which made the whole
                    // object literal a syntax error.
                    width : '100%',
                    height : '100%',
                    data : mydata
                }]
            });

            // Clicking a word opens the papers that contain it; the word is
            // URL-encoded so special characters survive the query string.
            myChart.on('click', function (params) {
                var url = "ClickServlet?keywords=" + encodeURIComponent(params.name);
                window.location.href = url;
            });

            alert("成功!");
        },
        error : function() {
            alert("请求失败");
        }
    });

</script>


</body>
</html>
生成图:
 

原文地址:https://www.cnblogs.com/mxk123456/p/13085267.html