CVPR顶会热词统计

任务:

爬取CVPR2019年所有论文的题目,并提取题目中的关键字,做成按照热度显示大小的热词云。

代码:

爬虫:

# coding=utf-8
import pymysql
import requests
from lxml import etree
 
 
class Spider:
    """Crawler that stores CVPR 2019 paper titles, links and abstracts in MySQL.

    ``run`` first collects the URL of every paper page from the conference
    index (``getHtmlList``) and then downloads each page and inserts one row
    into the ``data`` table (``getContent``).
    """

    # Seconds to wait for the server before giving up on a single request,
    # so one stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the start URL, request headers and the MySQL connection."""
        self.url = "http://openaccess.thecvf.com/CVPR2019.py"
        self.header = {
            "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='abc456', db='paperdata',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.html_list = []

    def getHtmlList(self):
        """Fetch the index page and append every paper page URL to ``self.html_list``."""
        response = requests.get(self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        html_body = etree.HTML(response.text)
        for href in html_body.xpath("//dt[@class='ptitle']/a/@href"):
            self.html_list.append("http://openaccess.thecvf.com/" + href)

    def getContent(self, url):
        """Download one paper page and insert its title, PDF link and abstract.

        Any failure (network error, missing element, database error) is
        printed and the paper is skipped so that the crawl continues.

        :param url: absolute URL of a single paper page.
        """
        try:
            response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            body = etree.HTML(response.text)
            title = body.xpath("//div[@id='papertitle']/text()")[0]
            abstract = body.xpath("//div[@id='abstract']/text()")[0]
            down_url = body.xpath("//div[@id='content']//a/@href")[0].replace("../../", "http://openaccess.thecvf.com/")

            # Bind the values as parameters instead of formatting them into the
            # statement: titles and abstracts routinely contain quotes, which
            # would otherwise break the INSERT or allow SQL injection.
            sql = "insert into data values(%s,%s,%s,%s)"
            self.cursor.execute(sql, (0, str(title), str(down_url), str(abstract)))
            self.db.commit()
            # Report success only once the row has really been committed.
            print(title + "插入成功!")
        except Exception as e:
            print(e)

    def run(self):
        """Crawl the index, then process every collected paper page in order."""
        self.getHtmlList()
        for url in self.html_list:
            self.getContent(url)
 
 
if __name__ == '__main__':
    # Script entry point: build the crawler and process every paper page.
    Spider().run()
 
package dao;

import java.sql.SQLException;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import pojo.Data;
import utils.DataSourceUtils;

/**
 * Data-access object for the {@code data} table that stores the crawled papers.
 *
 * @author connor
 * @version 2020-04-15, 10:19:06 AM
 */
public class DataDao {

    /**
     * Reads every row of the {@code data} table.
     *
     * @return the rows converted to {@link Data} beans by a {@link BeanListHandler}
     * @throws SQLException if the query fails
     */
    public List<Data> getData() throws SQLException {
        String sql = "select * from data ";
        QueryRunner runner = new QueryRunner(DataSourceUtils.getDataSource());
        return runner.query(sql, new BeanListHandler<Data>(Data.class));
    }

    /**
     * Finds the papers whose {@code papername} column contains the given text.
     *
     * @param name text to look for; it is wrapped in {@code %} wildcards and bound
     *             as a statement parameter
     * @return the matching rows converted to {@link Data} beans
     * @throws SQLException if the query fails
     */
    public List<Data> getLink(String name) throws SQLException {
        String pattern = "%" + name + "%";
        String sql = "select * from data where papername like ?";
        QueryRunner runner = new QueryRunner(DataSourceUtils.getDataSource());
        return runner.query(sql, new BeanListHandler<Data>(Data.class), pattern);
    }

}
 
DataDao.java
package pojo; 


/**
 * Plain bean for one row of the {@code data} table: a paper's id, title,
 * download link and abstract.
 */
public class Data {
    /** Row identifier. */
    private int id;
    /** Paper title. */
    private String papername;
    /** Link stored for the paper. */
    private String paperlink;
    /** Paper abstract. */
    private String paperabstract;

    // ---- getters ----

    /** @return the row identifier */
    public int getId() {
        return this.id;
    }

    /** @return the paper title */
    public String getPapername() {
        return this.papername;
    }

    /** @return the link stored for the paper */
    public String getPaperlink() {
        return this.paperlink;
    }

    /** @return the paper abstract */
    public String getPaperabstract() {
        return this.paperabstract;
    }

    // ---- setters ----

    /** @param id the row identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param papername the paper title to store */
    public void setPapername(String papername) {
        this.papername = papername;
    }

    /** @param paperlink the link to store */
    public void setPaperlink(String paperlink) {
        this.paperlink = paperlink;
    }

    /** @param paperabstract the abstract to store */
    public void setPaperabstract(String paperabstract) {
        this.paperabstract = paperabstract;
    }
}
 
Data.java
package pojo; 

/**
 * One word-cloud entry: a word ({@code name}) and the number attached to it
 * ({@code value}).
 */
public class Word {
    /** The word itself. */
    private String name;
    /** The number stored for the word. */
    private int value;

    /** @return the word */
    public String getName() {
        return this.name;
    }

    /** @return the number stored for the word */
    public int getValue() {
        return this.value;
    }

    /** @param name the word to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param value the number to store for the word */
    public void setValue(int value) {
        this.value = value;
    }
}
 
Word.java
package service;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;


import dao.DataDao;
import pojo.Data;
import pojo.Word;


/**
 * Service layer between the servlets and {@link DataDao}: builds the word
 * frequencies for the word cloud and looks up papers by keyword.
 */
public class DataService {

    /** A word is reported only if it occurs more than this many times. */
    private static final int MIN_EXCLUSIVE_COUNT = 1;

    /** A word is reported only if it is longer than this many characters. */
    private static final int MIN_EXCLUSIVE_LENGTH = 4;

    /**
     * Counts how often each space-separated word occurs across all paper titles.
     *
     * <p>Only words longer than four characters that occur more than once are
     * returned. Titles that are {@code null} are skipped. The order of the
     * returned list is the iteration order of the underlying hash map and is
     * therefore unspecified.
     *
     * @return one {@link Word} per qualifying word, carrying its occurrence count
     * @throws SQLException if the titles cannot be read
     */
    public List<Word> getData() throws SQLException {
        DataDao dao = new DataDao();

        // Count tokens directly instead of first concatenating every title
        // into one huge, null-padded array.
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        for (Data data : dao.getData()) {
            String title = data.getPapername();
            if (title == null) {
                continue;
            }
            for (String token : title.split(" ")) {
                Integer seen = counts.get(token);
                counts.put(token, seen == null ? 1 : seen + 1);
            }
        }

        List<Word> wordList = new ArrayList<Word>();
        for (String token : counts.keySet()) {
            int count = counts.get(token);
            if (count > MIN_EXCLUSIVE_COUNT && token.length() > MIN_EXCLUSIVE_LENGTH) {
                Word word = new Word();
                word.setName(token);
                word.setValue(count);
                wordList.add(word);
            }
        }
        return wordList;
    }

    /**
     * Looks up the papers whose title contains the given text.
     *
     * @param name text to search for; passed unchanged to {@link DataDao#getLink(String)}
     * @return the matching papers
     * @throws SQLException if the lookup fails
     */
    public List<Data> getLink(String name) throws SQLException {
        DataDao dao = new DataDao();
        return dao.getLink(name);
    }

}
 
DataService.java
package servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import pojo.Data;
import service.DataService;

/**
 * Servlet mapped to {@code /clickFunction}: looks up the papers whose title
 * contains the {@code name} request parameter and forwards the result to
 * {@code papercloud.jsp} as the request attribute {@code dataList}.
 */
@WebServlet("/clickFunction")
public class ClickFunctionServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /**
     * @see HttpServlet#HttpServlet()
     */
    public ClickFunctionServlet() {
        super();
    }

    /**
     * Handles the keyword lookup.
     *
     * <p>When the {@code name} parameter is missing, or the lookup fails, an
     * empty list is forwarded, so the page always receives a non-null
     * {@code dataList}. A failed lookup is written to the servlet context log.
     *
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");
        response.setContentType("text/html;charset=UTF-8");
        String name = request.getParameter("name");
        List<Data> dataList = new java.util.ArrayList<Data>();
        // Without this guard a missing parameter would be searched as the literal text "null".
        if (name != null) {
            DataService service = new DataService();
            try {
                dataList = service.getLink(name);
            } catch (SQLException e) {
                request.getServletContext().log("Failed to look up papers for the requested keyword", e);
            }
        }
        request.setAttribute("dataList", dataList);
        request.getRequestDispatcher("papercloud.jsp").forward(request, response);
    }

    /**
     * Delegates to {@link #doGet(HttpServletRequest, HttpServletResponse)}.
     *
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        doGet(request, response);
    }

}
ClickFunctionServlet.java
package servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.google.gson.Gson;

import pojo.Word;
import service.DataService;

/**
 * Servlet mapped to {@code /getData}: writes the word frequencies computed by
 * {@link DataService#getData()} to the response as a JSON array.
 */
@WebServlet("/getData")
public class GetDataServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /**
     * @see HttpServlet#HttpServlet()
     */
    public GetDataServlet() {
        super();
    }

    /**
     * Serialises the word list with Gson and writes it to the response body.
     *
     * <p>If the data cannot be read, the failure is written to the servlet
     * context log and an empty JSON array is sent instead of {@code null}, so
     * the page script can still read {@code length} on the parsed result.
     *
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");
        // Kept as text/html: the page parses the body itself with JSON.parse.
        response.setContentType("text/html;charset=UTF-8");
        List<Word> wordList;
        DataService service = new DataService();
        try {
            wordList = service.getData();
        } catch (SQLException e) {
            request.getServletContext().log("Failed to compute the word frequencies", e);
            wordList = new java.util.ArrayList<Word>();
        }
        Gson gson = new Gson();
        String json = gson.toJson(wordList);
        response.getWriter().write(json);
    }

    /**
     * Delegates to {@link #doGet(HttpServletRequest, HttpServletResponse)}.
     *
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        doGet(request, response);
    }

}
GetDataServlet.java
package utils;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import javax.sql.DataSource;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * Static helpers around a shared c3p0 {@link DataSource}, plus a per-thread
 * connection used for manual transaction handling.
 */
public class DataSourceUtils {

    /** Pool shared by the whole application. */
    private static final DataSource dataSource = new ComboPooledDataSource();

    /** Connection bound to the current thread, if one has been requested. */
    private static final ThreadLocal<Connection> tl = new ThreadLocal<Connection>();

    /**
     * @return the shared data source
     */
    public static DataSource getDataSource() {
        return dataSource;
    }

    /**
     * Returns the connection bound to the current thread, taking a new one
     * from the pool and binding it on first use.
     *
     * @return the current thread's connection
     * @throws SQLException if no connection can be obtained
     */
    public static Connection getConnection() throws SQLException {
        Connection con = tl.get();
        if (con == null) {
            con = dataSource.getConnection();
            tl.set(con);
        }
        return con;
    }

    /**
     * Switches the current thread's connection to manual commit, binding a
     * connection first if necessary.
     *
     * @throws SQLException if the connection cannot be obtained or configured
     */
    public static void startTransaction() throws SQLException {
        Connection con = getConnection();
        if (con != null) {
            con.setAutoCommit(false);
        }
    }

    /**
     * Rolls back the current thread's connection. Does nothing when no
     * connection is bound, rather than opening one just to roll it back.
     *
     * @throws SQLException if the rollback fails
     */
    public static void rollback() throws SQLException {
        Connection con = tl.get();
        if (con != null) {
            con.rollback();
        }
    }

    /**
     * Commits the current thread's connection, closes it and unbinds it.
     * Does nothing when no connection is bound.
     *
     * <p>If the commit itself fails, the connection stays bound so the caller
     * can still call {@link #rollback()}.
     *
     * @throws SQLException if the commit or the close fails
     */
    public static void commitAndRelease() throws SQLException {
        Connection con = tl.get();
        if (con != null) {
            con.commit();
            release(con);
        }
    }

    /**
     * Closes the current thread's connection and unbinds it, so a later
     * {@link #getConnection()} does not hand out the closed connection.
     * Does nothing when no connection is bound.
     *
     * @throws SQLException if the close fails
     */
    public static void closeConnection() throws SQLException {
        Connection con = tl.get();
        if (con != null) {
            release(con);
        }
    }

    /**
     * Closes the statement if it is not {@code null}.
     *
     * @param st statement to close, may be {@code null}
     * @throws SQLException if the close fails
     */
    public static void closeStatement(Statement st) throws SQLException {
        if (st != null) {
            st.close();
        }
    }

    /**
     * Closes the result set if it is not {@code null}.
     *
     * @param rs result set to close, may be {@code null}
     * @throws SQLException if the close fails
     */
    public static void closeResultSet(ResultSet rs) throws SQLException {
        if (rs != null) {
            rs.close();
        }
    }

    /** Closes the connection and always clears the thread binding, even if the close throws. */
    private static void release(Connection con) throws SQLException {
        try {
            con.close();
        } finally {
            tl.remove();
        }
    }

}
DataSourceUtils.java
<?xml version="1.0" encoding="UTF-8"?>
<!-- c3p0 connection-pool settings. DataSourceUtils creates its pool with a
     no-argument ComboPooledDataSource, which presumably reads this
     default-config (confirm the file is deployed on the classpath root). -->
<c3p0-config>
    <default-config>
        <!-- Database account. NOTE(review): the password is kept in plain text here. -->
        <property name="user">root</property>
        <property name="password">0608</property>
        <!-- NOTE(review): the serverTimezone URL parameter suggests MySQL Connector/J 8,
             whose driver class is com.mysql.cj.jdbc.Driver; confirm against the bundled driver jar. -->
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <!-- Local paperdata schema, GMT+8 server time zone, UTF-8 text. -->
        <property name="jdbcUrl">jdbc:mysql://localhost:3306/paperdata?serverTimezone=GMT%2B8&amp;useUnicode=true&amp;characterEncoding=UTF-8</property>
    </default-config> 
</c3p0-config> 
c3p0-config.xml
<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- 引入Bootstrap核心样式文件 -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- 引入BootStrap核心js文件 -->
<script src="./js/bootstrap.js"></script>
<style>
html, body, #main {
    width: 100%;
    height: 100%;
    margin: 0;
}
</style>
</head>
<body>
    <div id="main"></div>
    <div>
        <table class="table table-hover">
            <thead>
                <tr>
                    <td style="font-size: 20px;">论文链接</td>
                </tr>
            </thead>
            <tbody>
                <%-- One row per matching paper. Titles and links come from crawled pages,
                     so they are written through c:out to escape any markup they contain. --%>
                <c:forEach items="${dataList}" var="data" varStatus="vs">
                    <tr>
                        <td><a href="<c:out value='${data.paperlink}'/>"><c:out value="${data.papername}"/></a></td>
                    </tr>
                </c:forEach>
            </tbody>
        </table>
    </div>
    <script>
        // Word cloud: load the word frequencies from the server, draw them, and
        // open the paper list for a word when it is clicked.
        var chart = echarts.init(document.getElementById('main'));
        var postURL = "/PaperData/getData";
        var mydata = new Array();
        var option = {
            tooltip : {},
            series : [ {
                type : 'wordCloud',
                gridSize : 2,
                sizeRange : [ 20, 50 ],
                rotationRange : [ -90, 90 ],
                shape : 'pentagon',
                width : 800,
                height : 600,
                drawOutOfBound : false,
                textStyle : {
                    normal : {
                        // Random dark colour for every word.
                        color : function() {
                            return 'rgb('
                                    + [ Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160) ]
                                            .join(',') + ')';
                        }
                    },
                    emphasis : {
                        shadowBlur : 10,
                        shadowColor : '#333'
                    }
                },
                data : mydata
            } ]
        };
        // Request the data asynchronously and draw once it has arrived, instead of
        // flipping the global ajax settings to a blocking request. The 'text' data
        // type keeps the body a string so it can be parsed here.
        $.post(postURL, {}, function(rs) {
            // A JSON null payload is treated as "no words".
            var dataList = JSON.parse(rs) || [];
            for (var i = 0; i < dataList.length; i++) {
                mydata.push({
                    name : dataList[i].name,
                    value : dataList[i].value
                });
            }
            chart.setOption(option);
        }, 'text');
        chart.on('click', function(params) {
            // Encode the word so characters such as & or # cannot break the query string.
            var url = "clickFunction?name=" + encodeURIComponent(params.name);
            window.location.href = url;
        });
    </script>
</body>
</html>
papercloud.jsp
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- 引入Bootstrap核心样式文件 -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- 引入BootStrap核心js文件 -->
<script src="./js/bootstrap.js"></script>
<style>
html, body, #main {
    width: 100%;
    height: 100%;
    margin: 0;
}
</style>
</head>
<body>
    <div id="main"></div>
    <div>
        <table class="table table-hover">
            <thead>
                <tr>
                    <td style="font-size: 20px;">论文链接</td>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td><a>www.baidu.com</a></td>
                </tr>
            </tbody>
        </table>
    </div>
    <script>
        // Word cloud: load the word frequencies from the server, draw them, and
        // open the paper list for a word when it is clicked.
        var chart = echarts.init(document.getElementById('main'));
        var postURL = "/PaperData/getData";
        var mydata = new Array();
        var option = {
            tooltip : {},
            series : [ {
                type : 'wordCloud',
                gridSize : 2,
                sizeRange : [ 20, 50 ],
                rotationRange : [ -90, 90 ],
                shape : 'pentagon',
                width : 800,
                height : 600,
                drawOutOfBound : false,
                textStyle : {
                    normal : {
                        // Random dark colour for every word.
                        color : function() {
                            return 'rgb('
                                    + [ Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160),
                                            Math.round(Math.random() * 160) ]
                                            .join(',') + ')';
                        }
                    },
                    emphasis : {
                        shadowBlur : 10,
                        shadowColor : '#333'
                    }
                },
                data : mydata
            } ]
        };
        // Request the data asynchronously and draw once it has arrived, instead of
        // flipping the global ajax settings to a blocking request. The 'text' data
        // type keeps the body a string so it can be parsed here.
        $.post(postURL, {}, function(rs) {
            // A JSON null payload is treated as "no words".
            var dataList = JSON.parse(rs) || [];
            for (var i = 0; i < dataList.length; i++) {
                mydata.push({
                    name : dataList[i].name,
                    value : dataList[i].value
                });
            }
            chart.setOption(option);
        }, 'text');
        chart.on('click', function(params) {
            // Encode the word so characters such as & or # cannot break the query string.
            var url = "clickFunction?name=" + encodeURIComponent(params.name);
            window.location.href = url;
        });
    </script>
</body>
</html>
papercloud.html

截图:

 

原文地址:https://www.cnblogs.com/Aming-/p/13085339.html