顶会热词统计

爬取CVPR2019年所有论文的题目,并提取题目中的关键字,做成按照热度显示大小的热词云。

代码:

 

# coding=utf-8
import pymysql
import requests
from lxml import etree
 
 class Spider:
    def __init__(self):
        self.url = "http://openaccess.thecvf.com/CVPR2019.py"
        self.header = {
            "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='abc456', db='paperdata',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.html_list = []
 
    def getHtmlList(self):
        response = requests.get(self.url, headers=self.header)
        html_body = etree.HTML(response.text)
        title = html_body.xpath("//dt[@class='ptitle']/a/@href")
        for item in title:
            self.html_list.append("http://openaccess.thecvf.com/" + item)
 
    def getContent(self, url):
        try:
            response = requests.get(url, headers=self.header)
            body = etree.HTML(response.text)
            title = body.xpath("//div[@id='papertitle']/text()")[0]
            abstract = body.xpath("//div[@id='abstract']/text()")[0]
            down_url = body.xpath("//div[@id='content']//a/@href")[0].replace("../../", "http://openaccess.thecvf.com/")
 
            sql = '''insert into data values({},"{}","{}","{}")'''.format(0, title, down_url, str(abstract))
            self.cursor.execute(sql)
            print(title + "插入成功!")
            self.db.commit()
        except Exception as e:
            print(e)
 
    def run(self):
        self.getHtmlList()
        for url in self.html_list:
            self.getContent(url)
 
 if __name__ == '__main__':
    spwder = Spider()
    spwder.run()
 
 

DataDao.java

 

package dao;

 

import java.sql.SQLException;

import java.util.List;

 

import org.apache.commons.dbutils.QueryRunner;

import org.apache.commons.dbutils.handlers.BeanListHandler;

 

import pojo.Data;

import utils.DataSourceUtils;

 

public class DataDao {

 

    public List<Data> getData() throws SQLException {

        QueryRunner queryRunner = new QueryRunner(DataSourceUtils.getDataSource());

        String sql = "select * from data ";

        List<Data> dataList = queryRunner.query(sql, new BeanListHandler<Data>(Data.class));

        return dataList;

       

       

    }

 

    public List<Data> getLink(String name) throws SQLException {

        QueryRunner queryRunner = new QueryRunner(DataSourceUtils.getDataSource());

        String sql = "select * from data where papername like ?";

        List<Data> dataList = queryRunner.query(sql, new BeanListHandler<Data>(Data.class),"%"+name+"%");

        return dataList;

    }

 

}

 

 

 

Data.java

 

package pojo;

 

 

/**
 * Bean holding one row of the {@code data} table: a paper's id, title, link and abstract.
 */
public class Data {

    // Row identifier.
    private int id;
    // Paper title.
    private String papername;
    // Link to the paper.
    private String paperlink;
    // Paper abstract.
    private String paperabstract;

    // ---- getters ----

    /** @return the row identifier */
    public int getId() {
        return this.id;
    }

    /** @return the paper title */
    public String getPapername() {
        return this.papername;
    }

    /** @return the link to the paper */
    public String getPaperlink() {
        return this.paperlink;
    }

    /** @return the paper abstract */
    public String getPaperabstract() {
        return this.paperabstract;
    }

    // ---- setters ----

    /** @param id the row identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @param papername the paper title */
    public void setPapername(String papername) {
        this.papername = papername;
    }

    /** @param paperlink the link to the paper */
    public void setPaperlink(String paperlink) {
        this.paperlink = paperlink;
    }

    /** @param paperabstract the paper abstract */
    public void setPaperabstract(String paperabstract) {
        this.paperabstract = paperabstract;
    }
}

 

 

 

Word.java

 

package pojo;

 

/**
 * One entry of the word cloud: a keyword and how many times it occurs.
 */
public class Word {

    // The keyword text.
    private String name;
    // Number of occurrences of the keyword.
    private int value;

    // ---- getters ----

    /** @return the keyword text */
    public String getName() {
        return this.name;
    }

    /** @return the number of occurrences */
    public int getValue() {
        return this.value;
    }

    // ---- setters ----

    /** @param name the keyword text */
    public void setName(String name) {
        this.name = name;
    }

    /** @param value the number of occurrences */
    public void setValue(int value) {
        this.value = value;
    }
}

 

 

DataService.java

 

package service;

 

import java.sql.SQLException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

 

import org.apache.commons.lang.ArrayUtils;

 

 

import dao.DataDao;

import pojo.Data;

import pojo.Word;

 

 

public class DataService {

 

    public List<Word> getData() throws SQLException {

        DataDao dao = new DataDao();

        List<Data>  dataList= dao.getData();

        List<Word> wordList = new ArrayList<Word>();

        String [] names = new String[100000];

        for(Data data:dataList) {

            String name = data.getPapername();

            String[] namestemp = name.split(" ");

            names = (String[]) ArrayUtils.addAll(namestemp, names);

        }

        HashMap<String, Integer> name_value = new HashMap<>();

       

        for(String name:names) {

            name_value.put(name, !name_value.containsKey(name)?1:name_value.get(name)+1);

        }

       

        for(String name:name_value.keySet()) {

            Word word = new Word();

            if(name!=null&&(name_value.get(name)>1)&&(name.length()>4)) {

                word.setName(name);

                word.setValue(name_value.get(name));

                wordList.add(word);

            }

 

        }

        return wordList;

    }

 

    public List<Data> getLink(String name) throws SQLException {

        DataDao dao = new DataDao();

        return dao.getLink(name);

    }

 

}

 

 

 

ClickFunctionServlet.java

 

package servlet;

 

import java.io.IOException;

import java.sql.SQLException;

import java.util.List;

 

import javax.servlet.ServletException;

import javax.servlet.annotation.WebServlet;

import javax.servlet.http.HttpServlet;

import javax.servlet.http.HttpServletRequest;

import javax.servlet.http.HttpServletResponse;

 

import pojo.Data;

import service.DataService;

 

/**

 * Servlet implementation class ClickFunctionServlet

 */

@WebServlet("/clickFunction")

public class ClickFunctionServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

      

    /**

     * @see HttpServlet#HttpServlet()

     */

    public ClickFunctionServlet() {

        super();

        // TODO Auto-generated constructor stub

    }

 

    /**

     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)

     */

    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        // TODO Auto-generated method stub

        request.setCharacterEncoding("utf-8");

        response.setContentType("text/html;charset=UTF-8");

        String name = request.getParameter("name");

        List<Data> dataList =null;

        DataService service = new DataService();

        try {

            dataList = service.getLink(name);

        } catch (SQLException e) {

            // TODO Auto-generated catch block

            e.printStackTrace();

        }

        request.setAttribute("dataList", dataList);

        request.getRequestDispatcher("papercloud.jsp").forward(request, response);

    }

 

    /**

     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)

     */

    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        // TODO Auto-generated method stub

        doGet(request, response);

    }

 

}

 

 

 

 

GetDataServlet.java

 

package servlet;

 

import java.io.IOException;

import java.sql.SQLException;

import java.util.List;

 

import javax.servlet.ServletException;

import javax.servlet.annotation.WebServlet;

import javax.servlet.http.HttpServlet;

import javax.servlet.http.HttpServletRequest;

import javax.servlet.http.HttpServletResponse;

 

import com.google.gson.Gson;

 

import pojo.Word;

import service.DataService;

 

/**

 * Servlet implementation class GetDataServlet

 */

@WebServlet("/getData")

public class GetDataServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

      

    /**

     * @see HttpServlet#HttpServlet()

     */

    public GetDataServlet() {

        super();

        // TODO Auto-generated constructor stub

    }

 

    /**

     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)

     */

    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        // TODO Auto-generated method stub

        request.setCharacterEncoding("utf-8");

        response.setContentType("text/html;charset=UTF-8");

        List<Word> wordList = null;

        DataService service = new DataService();

        try {

            wordList = service.getData();

        } catch (SQLException e) {

            // TODO Auto-generated catch block

            e.printStackTrace();

        }

        Gson gson = new Gson();

        String json = gson.toJson(wordList);

        response.getWriter().write(json);

    }

 

    /**

     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)

     */

    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {

        // TODO Auto-generated method stub

        doGet(request, response);

    }

 

}

 

DataSourceUtils.java

 

package utils;

 

import java.sql.Connection;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

 

import javax.sql.DataSource;

 

import com.mchange.v2.c3p0.ComboPooledDataSource;

 

public class DataSourceUtils {

 

    private static DataSource dataSource = new ComboPooledDataSource();

 

    private static ThreadLocal<Connection> tl = new ThreadLocal<Connection>();

 

 

    public static DataSource getDataSource() {

        return dataSource;

    }

 

   

    public static Connection getConnection() throws SQLException {

 

        Connection con = tl.get();

        if (con == null) {

            con = dataSource.getConnection();

            tl.set(con);

        }

        return con;

    }

 

   

    public static void startTransaction() throws SQLException {

        Connection con = getConnection();

        if (con != null) {

            con.setAutoCommit(false);

        }

    }

 

   

    public static void rollback() throws SQLException {

        Connection con = getConnection();

        if (con != null) {

            con.rollback();

        }

    }

 

    public static void commitAndRelease() throws SQLException {

        Connection con = getConnection();

        if (con != null) {

            con.commit();

            con.close();

            tl.remove();

        }

    }

 

 

    public static void closeConnection() throws SQLException {

        Connection con = getConnection();

        if (con != null) {

            con.close();

        }

    }

 

    public static void closeStatement(Statement st) throws SQLException {

        if (st != null) {

            st.close();

        }

    }

 

    public static void closeResultSet(ResultSet rs) throws SQLException {

        if (rs != null) {

            rs.close();

        }

    }

 

}

 

 

 

c3p0-config.xml

 

<?xml version="1.0" encoding="UTF-8"?>
<!-- Default c3p0 settings: credentials, JDBC driver and URL of the paperdata database. -->
<c3p0-config>
    <default-config>
        <property name="user">root</property>
        <property name="password">0608</property>
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <!-- The '&' between URL parameters must be written as '&amp;'; a raw '&' makes this file not well-formed XML. -->
        <property name="jdbcUrl">jdbc:mysql://localhost:3306/paperdata?serverTimezone=GMT%2B8&amp;useUnicode=true&amp;characterEncoding=UTF-8</property>
    </default-config>
</c3p0-config>

 

 

 

papercloud.jsp

 

<%@ page language="java" contentType="text/html; charset=UTF-8"

    pageEncoding="UTF-8"%>

<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>

<!DOCTYPE html>

<html>

<head>

<meta charset="UTF-8">

<title>论文云</title>

<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>

<script src="./js/echarts-wordcloud.js"></script>

<script src="./js/jquery-1.11.3.min.js"></script>

<!-- 引入Bootstrap核心样式文件 -->

<link href="css/bootstrap.css" rel="stylesheet">

<!-- 引入BootStrap核心js文件 -->

<script src="./js/bootstrap.js"></script>

<style>

html, body, #main {

    width: 100%;

    height: 100%;

    margin: 0;

}

</style>

</head>

<body>

    <div id="main"></div>

    <div>

        <table class="table table-hover">

            <thead>

                <tr>

                    <td style="font-size: 20px;">论文链接</td>

                </tr>

            </thead>

            <tbody>

                <c:forEach items="${dataList}" var="data" varStatus="vs">

                    <tr>

                        <td><a href="${data.paperlink}">${data.papername}</a></td>

                    </tr>

                </c:forEach>

            </tbody>

        </table>

    </div>

    <script>

        var chart = echarts.init(document.getElementById('main'));

        var postURL = "/PaperData/getData";

        var mydata = new Array();

        $.ajaxSettings.async = false;

        $.post(postURL, {}, function(rs) {

            var dataList = JSON.parse(rs);

            for (var i = 0; i < dataList.length; i++) {

                var d = {};

                d['name'] = dataList[i].name;

                d['value'] = dataList[i].value;

                mydata.push(d);

            }

        });

        $.ajaxSettings.async = true;

        var option = {

            tooltip : {},

            series : [ {

                type : 'wordCloud',

                gridSize : 2,

                sizeRange : [ 20, 50 ],

                rotationRange : [ -90, 90 ],

                shape : 'pentagon',

                width : 800,

                height : 600,

                drawOutOfBound : false,

                textStyle : {

                    normal : {

                        color : function() {

                            return 'rgb('

                                    + [ Math.round(Math.random() * 160),

                                            Math.round(Math.random() * 160),

                                            Math.round(Math.random() * 160) ]

                                            .join(',') + ')';

                        }

                    },

                    emphasis : {

                        shadowBlur : 10,

                        shadowColor : '#333'

                    }

                },

                data : mydata

            } ]

        };

        chart.setOption(option);

        chart.on('click', function(params) {

            var url = "clickFunction?name=" + params.name;

            window.location.href = url;

        });

    </script>

</body>

</html>

 

 

 

 

 

papercloud.html

 

<!DOCTYPE html>

<html>

<head>

<meta charset="UTF-8">

<title>论文云</title>

<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>

<script src="./js/echarts-wordcloud.js"></script>

<script src="./js/jquery-1.11.3.min.js"></script>

<!-- 引入Bootstrap核心样式文件 -->

<link href="css/bootstrap.css" rel="stylesheet">

<!-- 引入BootStrap核心js文件 -->

<script src="./js/bootstrap.js"></script>

<style>

html, body, #main {

    width: 100%;

    height: 100%;

    margin: 0;

}

</style>

</head>

<body>

    <div id="main"></div>

    <div>

        <table class="table table-hover">

            <thead>

                <tr>

                    <td style="font-size: 20px;">论文链接</td>

                </tr>

            </thead>

            <tbody>

                <tr>

                    <td><a>www.baidu.com</a></td>

                </tr>

            </tbody>

        </table>

    </div>

    <script>

        var chart = echarts.init(document.getElementById('main'));

        var postURL = "/PaperData/getData";

        var mydata = new Array();

        $.ajaxSettings.async = false;

        $.post(postURL, {}, function(rs) {

            var dataList = JSON.parse(rs);

            for (var i = 0; i < dataList.length; i++) {

                var d = {};

                d['name'] = dataList[i].name;

                d['value'] = dataList[i].value;

                mydata.push(d);

            }

        });

        $.ajaxSettings.async = true;

        var option = {

            tooltip : {},

            series : [ {

                type : 'wordCloud',

                gridSize : 2,

                sizeRange : [ 20, 50 ],

                rotationRange : [ -90, 90 ],

                shape : 'pentagon',

                width : 800,

                height : 600,

                drawOutOfBound : false,

                textStyle : {

                    normal : {

                        color : function() {

                            return 'rgb('

                                    + [ Math.round(Math.random() * 160),

                                            Math.round(Math.random() * 160),

                                            Math.round(Math.random() * 160) ]

                                            .join(',') + ')';

                        }

                    },

                    emphasis : {

                        shadowBlur : 10,

                        shadowColor : '#333'

                    }

                },

                data : mydata

            } ]

        };

        chart.setOption(option);

        chart.on('click', function(params) {

            var url = "clickFunction?name=" + params.name;

            window.location.href = url;

        });

    </script>

</body>

</html>

posted @ 2020-06-12 22:22  ICDTAD  阅读(207)  评论(0编辑  收藏  举报