nodeJS爬虫实例
项目概述
要求:选取3-5个新闻网站,1.对网站下每篇内容的作者,日期,来源,标题等结构化数据进行爬取,2.数据保存在数据库中,3.建立网站展示爬取数据,要求实现对爬取内容的分项搜索、以及所查关键词的时间热度分析。4.爬虫和网站后端均用node.js实现。
前后端设计
该网站前后端设计主要考量了以下几点:
1)组成部分:网站由登录页,查询页面,以及时间热度分析页面组成
2)功能:用户登录注册,词云,对内容或标题的搜索,对关键词的时间热度分析
3)数据可视化:使用echarts实现词云,实现折线图热度分析
4)后端数据来源:爬取了大约2000条新闻,来源为参考消息网,光明网,以及中国新闻网,数据存储于mysql中。
开始前的准备
一、新建项目文件夹
首先用express框架新建项目文件夹
1.安装node.js;如果当前npm镜像不是淘宝镜像,可以先切换镜像: npm config set registry https://registry.npm.taobao.org (注:该旧域名已停用,现在请改用 https://registry.npmmirror.com)
2.安装express:npm install express --save
3.安装express-generator生成器:npm i express-generator -g
4.输入express --view=ejs project 之后系统会新建project文件夹,文件夹中有如下项:
bin:存放启动脚本文件 bin/www:启动脚本文件,可修改端口号,等功能。 public:存放图片,css,js等静态文件 routes:存放路由模块文件 views:存放视图文件,使用的ejs模板引擎 app.js:入口文件,重要的配置文件 package.json:工程信息和安装依赖文件
二、安装mysql数据库
1.安装mysql 详细安装mysql教程: https://zhuanlan.zhihu.com/p/46905335?ivk_sa=1024320u
2.配置mysql环境变量 将mysql.exe所在路径放到全局变量Path下即可
3.安装数据库依赖项:
npm install
npm install mysql --save
4.当前文件夹打开PowerShell,输入:mysql -uroot -p回车;之后输入刚刚安装mysql数据库时的密码再次回车
5.创建表fetches,用于存放爬取的数据 复制以下粘贴到PowerShell 回车
-- Create the database shared by the crawler and the web site, then switch to it.
create database crawl;
use crawl;
-- One row per crawled article (written by the crawler, read by the search/analysis routes).
CREATE TABLE `fetches` (
`id_fetches` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(200) DEFAULT NULL,
`source_name` varchar(200) DEFAULT NULL,
`source_encoding` varchar(45) DEFAULT NULL,
`title` varchar(200) DEFAULT NULL,
`keywords` varchar(200) DEFAULT NULL,
`author` varchar(200) DEFAULT NULL,
`publish_date` date DEFAULT NULL,
`crawltime` datetime DEFAULT NULL,
`content` longtext,
`createtime` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id_fetches`),
-- NOTE(review): this index covers the same column as the primary key.
UNIQUE KEY `id_fetches_UNIQUE` (`id_fetches`),
-- At most one row per article URL.
UNIQUE KEY `url_UNIQUE` (`url`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
6.创建表user存放用户的账号及密码 复制以下粘贴到PowerShell 回车
-- Registered accounts; usernames are unique.
-- NOTE(review): the login route in routes\users.js compares `password` to the submitted value directly, i.e. it is stored as plain text.
CREATE TABLE `crawl`.`user`(
`id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`username` VARCHAR(45) NOT NULL,
`password` VARCHAR(45) NOT NULL,
`registertime` datetime DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY(`id`),
UNIQUE KEY `username_UNIQUE`(`username`))
ENGINE=InnoDB DEFAULT CHARSET=utf8;
7.创建表user_action存放用户操作日志 复制以下粘贴到PowerShell 回车
-- Request log for logged-in users; the column list matches the insert in dao\logDAO.js.
-- Note that `request_time` is stored as text (VARCHAR), not as a datetime.
CREATE TABLE `crawl`.`user_action` (
`id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`username` VARCHAR ( 45 ) NOT NULL,
`request_time` VARCHAR ( 45 ) NOT NULL,
`request_method` VARCHAR ( 20 ) NOT NULL,
`request_url` VARCHAR ( 300 ) NOT NULL,
`status` INT ( 4 ),
`remote_addr` VARCHAR ( 1000 ) NOT NULL,
PRIMARY KEY ( `id` )
) ENGINE = INNODB DEFAULT CHARSET = utf8;
三、为网页绑定mysql数据库
1.创建dao文件夹,内部创建四个js文件
logDAO.js用于存放用户操作日志
//该文件其实并没用到,最后选择使用express中间件 morgan记录日志(app.js中) var mysql = require('mysql'); var mysqlConf = require('../conf/mysqlConf'); var pool = mysql.createPool(mysqlConf.mysql); // 使用了连接池,重复使用数据库连接,而不必每执行一次CRUD操作就获取、释放一次数据库连接,从而提高了对数据库操作的性能。 // 记录用户操作 module.exports = { userlog :function (useraction, callback) { pool.query('insert into user_action(username,request_time,request_method,request_url,status,remote_addr) values(?, ?,?,?,?,?)', useraction, function (error, result) { if (error) throw error; callback(result.affectedRows > 0); }); }, };
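userDAO.js用于用户注册与登录时的账号读写(原文此处标题写作“newsDAO.js用于新闻查询”,但下方代码引用了userSqlMap并导出add/getByUsername,实为userDAO.js;routes\news.js所需的newsDAO.js代码在本文中未给出)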
var mysql = require('mysql'); var mysqlConf = require('../conf/mysqlConf'); var userSqlMap = require('./userSqlMap'); var pool = mysql.createPool(mysqlConf.mysql); // 使用了连接池,重复使用数据库连接,而不必每执行一次CRUD操作就获取、释放一次数据库连接,从而提高了对数据库操作的性能。 module.exports = { add: function (user, callback) { pool.query(userSqlMap.add, [user.username, user.password], function (error, result) { if (error) throw error; callback(result.affectedRows > 0); }); }, getByUsername: function (username, callback) { pool.query(userSqlMap.getByUsername, [username], function (error, result) { if (error) throw error; callback(result); }); }, };
userSqlMap.js
// SQL statements used by the user DAO, keyed by operation name.
var INSERT_USER_SQL = 'insert into user(username, password) values(?, ?)';
var SELECT_USER_BY_NAME_SQL = 'select username, password from user where username = ?';

var userSqlMap = {};
userSqlMap.add = INSERT_USER_SQL; // used when registering
userSqlMap.getByUsername = SELECT_USER_BY_NAME_SQL; // used when logging in
module.exports = userSqlMap;
2.在project文件夹下新建conf文件夹—>新建mysqlConf.js
// Connection settings handed to mysql.createPool() by the DAO modules.
var mysqlSettings = {};
mysqlSettings.host = 'localhost'; // local database; 127.0.0.1 works as well
mysqlSettings.user = 'root'; // database user
mysqlSettings.password = 'root'; // database password
mysqlSettings.database = 'crawl'; // schema name
mysqlSettings.connectionLimit = 10; // maximum number of pooled connections (10 is the default)

module.exports = { mysql: mysqlSettings };
3.在project文件夹下新建mysql.js文件
var mysql = require("mysql");
// Connection pool used by query() and query_noparam() in this file.
var poolOptions = {};
poolOptions.host = '127.0.0.1';
poolOptions.user = 'root';
poolOptions.password = 'root';
poolOptions.database = 'crawl';
var pool = mysql.createPool(poolOptions);
/**
 * Run a parameterized statement on a pooled connection.
 *
 * @param {string} sql - statement with `?` placeholders.
 * @param {Array} sqlparam - values for the placeholders.
 * @param {function} callback - called as (err, rows, fields); when no connection
 *   can be obtained it is called as (err, null, null).
 */
var query = function(sql, sqlparam, callback) {
    function onConnection(connErr, connection) {
        if (connErr) {
            callback(connErr, null, null);
            return;
        }
        connection.query(sql, sqlparam, function(queryErr, rows, fieldInfo) {
            // Hand the connection back to the pool before notifying the caller.
            connection.release();
            callback(queryErr, rows, fieldInfo);
        });
    }
    pool.getConnection(onConnection);
};
/**
 * Run a statement that has no placeholders on a pooled connection.
 *
 * @param {string} sql - complete SQL statement.
 * @param {function} callback - called as (err, rows, fields); when no connection
 *   can be obtained it is called as (err, null, null).
 */
var query_noparam = function(sql, callback) {
    function onConnection(connErr, connection) {
        if (connErr) {
            callback(connErr, null, null);
            return;
        }
        connection.query(sql, function(queryErr, rows, fieldInfo) {
            // Hand the connection back to the pool before notifying the caller.
            connection.release();
            callback(queryErr, rows, fieldInfo);
        });
    }
    pool.getConnection(onConnection);
};
exports.query = query;
exports.query_noparam = query_noparam;
爬取数据
一、编写爬虫代码
在project文件夹下新建crawler2.js(例子,名字不固定),写入针对目标网站的爬取代码,下面以‘参考消息网’的爬取代码为例。
我采取的爬取方式是,首先找到根网址,也就是一个新闻网站的首页,遍历首页中所有的a标签,如果该a标签中的网址符合子新闻网址的格式,则进入爬取具体内容,否则跳过,若该网址已经爬取过,则不再存入数据库中。并且设置定时,每天自动爬取两次。
var mysql = require('./mysql.js') //导入所需模块
var myRequest = require('request')
var myCheerio = require('cheerio')
var myIconv = require('iconv-lite')
require('date-utils');
var schedule = require('node-schedule');
var source_name = "参考消息网"; //声明来源,编码,以及根网址
var myEncoding = "utf-8";
var seedURL = 'http://www.cankaoxiaoxi.com/';
之后我们要对目标网站的具体内容进行分析,找到元素对应的标签,使用jquery进行绑定,比如在参考消息网的页面中,文章关键词对应的是keywords标签,题目标签是title,日期的标签id是pubtime_baidu等等。
// Selector expressions. Each one is kept as a string and evaluated with eval()
// by the crawl functions, where `$` is the cheerio handle of the page being parsed.
var seedURL_format = "$('a')"; // which elements of the seed page are scanned for links
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.articleText').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
// Article URLs look like /<7 letters>/<8 digits>/<7 digits>.shtml.
// The dot is escaped so that only a literal ".shtml" suffix is accepted.
var url_reg = /\/([a-zA-Z]{7})\/(\d{8})\/(\d{7})\.shtml/;
// Picks a date out of free text: y-m-d, y/m/d or y.m.d (same separator twice,
// 2- or 4-digit year) or the Chinese form "yyyy年m月d日".
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
// Send a browser-like User-Agent so the target site does not block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

/**
 * Fetch `url` asynchronously with the request module.
 *
 * @param {string} url - address to download.
 * @param {function} callback - passed straight to the request module; the crawl
 *   functions use it as (err, res, body).
 */
function request(url, callback) {
    // `encoding: null` leaves the body undecoded; the callers decode it with iconv-lite.
    // A proxy can be enabled by adding e.g. proxy: 'http://x.x.x.x:8080'.
    myRequest({ url: url, encoding: null, headers: headers, timeout: 10000 }, callback);
}
// Crawl schedule: twice a day, at 00:05 and 12:05.
var times = [0, 12]; // hours of the day at which the job runs
var times2 = 5; // minute of the hour at which the job runs
var rule = new schedule.RecurrenceRule();
rule.hour = times;
rule.minute = times2;
schedule.scheduleJob(rule, function runScheduledCrawl() {
    seedget();
});
// Also crawl once immediately when the script starts.
seedget();
/**
 * Download the seed (home) page, walk every element selected by `seedURL_format`
 * and crawl each link that matches `url_reg` and is not yet stored in `fetches`.
 * Failures are logged and the affected page or link is skipped.
 */
function seedget() {
    request(seedURL, function(err, res, body) { // read the seed page
        if (err || !body) {
            console.log('读种子页面出错:' + err);
            return;
        }
        var $;
        try {
            // Decode with iconv, then parse the html with cheerio.
            var html = myIconv.decode(body, myEncoding);
            $ = myCheerio.load(html, { decodeEntities: true });
        } catch (e) {
            console.log('读种子页面并转码出错:' + e);
            return;
        }
        var seedurl_news;
        try {
            // eval() runs a selector string defined in this file (not external input);
            // it relies on the local variable being named `$`.
            seedurl_news = eval(seedURL_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错:' + e);
            return;
        }
        seedurl_news.each(function(i, e) { // every <a> on the seed page
            var myURL = "";
            try {
                var href = $(e).attr("href");
                if (href == undefined) return;
                // Resolve absolute (http/https), protocol-relative and relative
                // links against the seed URL in one step.
                myURL = new URL(href, seedURL).href;
            } catch (e) {
                console.log('识别种子页面中的新闻链接出错:' + e);
                return;
            }
            if (!url_reg.test(myURL)) return; // not an article URL
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (qerr) {
                    // Without a working lookup we cannot tell whether the URL is new.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); // crawl the article page
            });
        });
    });
};
/**
 * Download one article page, extract its structured fields with the `*_format`
 * selector strings and insert the result into the `fetches` table.
 * Failures are logged and the article is skipped.
 *
 * To crawl a different site (for example China News), only the seed URL, the
 * source name and the selector strings defined earlier in this file need to change.
 *
 * @param {string} myURL - absolute URL of the article page.
 */
function newsGet(myURL) {
    request(myURL, function(err, res, body) { // read the article page
        if (err || !body) {
            console.log('读新闻页面出错:' + myURL + ' ' + err);
            return;
        }
        var $;
        try {
            var html_news = myIconv.decode(body, myEncoding); // decode with iconv
            $ = myCheerio.load(html_news, { decodeEntities: true }); // parse with cheerio
        } catch (e) {
            console.log('读新闻页面并转码出错:' + e);
            return;
        }
        console.log("转码读取成功:" + myURL);

        // The selector strings are evaluated with eval(); they are constants of this
        // file (not external input) and rely on the local variable being named `$`.
        var today = (new Date()).toFormat("YYYY-MM-DD");
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = today;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // encoding
        fetch.crawltime = new Date();

        try {
            if (keywords_format == "") fetch.keywords = source_name; // no keywords: fall back to the source name
            else fetch.keywords = eval(keywords_format);

            if (title_format == "") fetch.title = ""
            else fetch.title = eval(title_format); // title

            if (date_format != "") fetch.publish_date = eval(date_format); // publication date
            console.log('date: ' + fetch.publish_date);
            console.log(myURL);
            // Pages without a recognisable date keep the crawl date instead of crashing.
            var dateMatch = regExp.exec(fetch.publish_date);
            var parsedDate = null;
            if (dateMatch) {
                parsedDate = new Date(dateMatch[0].replace('年', '-').replace('月', '-').replace('日', ''));
            }
            if (parsedDate && !isNaN(parsedDate.getTime())) fetch.publish_date = parsedDate.toFormat("YYYY-MM-DD");
            else fetch.publish_date = today;

            if (author_format == "") fetch.author = source_name; // no author: fall back to the source name
            else fetch.author = eval(author_format);

            if (content_format == "") fetch.content = "";
            else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); // body text, with the author line removed

            if (source_format == "") fetch.source = fetch.source_name;
            else fetch.source = (eval(source_format) || "").replace("\r\n", ""); // source

            if (desc_format == "") fetch.desc = fetch.title;
            else fetch.desc = (eval(desc_format) || "").replace("\r\n", ""); // summary; the meta tag may be missing
        } catch (e) {
            console.log('解析新闻页面出错:' + myURL + ' ' + e);
            return;
        }

        // (The record could alternatively be written to a JSON file instead of the database.)
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUE(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}
二、运行爬虫
在命令行中输入node crawler2.js
就可以运行crawler2.js爬虫了
三、检查数据库中的数据
推荐使用navicat进行数据可视化
navicat下载地址:http://www.navicat.com.cn/products
爬取成功后的crawl库如图
建立网站
一、注册与登录页面
在这个页面我们主要实现以下功能
1.用户登录(账号存在)
2. 用户注册(账号不存在)
3. 将账号存入mysql中
project\public\index.html代码如下
<!DOCTYPE html>
<html ng-app="login" lang='en'>
<head>
<meta charset="utf-8" />
<title>Login</title>
<link rel="stylesheet" href="http://cdn.bootcss.com/bootstrap/3.3.0/css/bootstrap.min.css">
<script src="https://cdn.staticfile.org/jquery/3.2.1/jquery.min.js"></script>
<script src="https://cdn.staticfile.org/popper.js/1.12.5/umd/popper.min.js"></script>
<script src="https://cdn.staticfile.org/twitter-bootstrap/4.1.0/js/bootstrap.min.js"></script>
<!-- <script src="../node_modules/angular/angular.min.js"></script>-->
<script src="https://cdn.staticfile.org/angular.js/1.4.6/angular.min.js"></script>
<link rel="stylesheet" type="text/css" href="stylesheets/style.css">
<!-- <link rel="stylesheet" type="text/css" href="stylesheets/index.css">
-->
<script type="text/javascript" src="javascripts/index.js"></script> <script> var app = angular.module('login', []); app.controller('loginCtrl', function ($scope, $http, $timeout) { // 登录时,检查用户输入的账户密码是否与数据库中的一致 $scope.check_pwd = function () { var data = JSON.stringify({ username: $scope.username, password: $scope.password }); $http.post("/users/login", data) .then( function (res) { if(res.data.msg=='ok') { window.location.href='/news.html'; }else{ $scope.msg=res.data.msg; } }, function (err) { $scope.msg = err.data; }); }; //增加注册用户 $scope.doAdd = function () { // 检查用户注册时,输入的两次密码是否一致 if($scope.add_password!==$scope.confirm_password){ // $timeout(function () { // $scope.msg = '两次密码不一致!'; // },100); $scope.msg = '两次密码不一致!'; } else { var data = JSON.stringify({ username: $scope.add_username, password: $scope.add_password }); $http.post("/users/register", data) .then(function (res) { if(res.data.msg=='成功注册!请登录') { $scope.msg=res.data.msg; $timeout(function () { window.location.href='index.html'; },2000); } else { $scope.msg = res.data.msg; } }, function (err) { $scope.msg = err.data; }); } }; }); </script> </head> <body> <div class="container" ng-controller="loginCtrl"> <div class="login-wrapper"> <a href="#" class="active" id="login-form-link">Login</a> <div class="form-wrapper"> <form id="login-form" method="post" role="form" style="display: block;"> <input ng-model="username" tabindex="1" class="form-control" placeholder="Username" value=""> <input type="password" ng-model="password" tabindex="2" class="form-control" placeholder="Password"> <button id="login-submit" tabindex="4" class="form-control btn btn-login" ng-click="check_pwd()">LOG IN</button> </form> <form id="register-form" method="post" role="form" style="display: none;"> <input ng-model="add_username" tabindex="1" class="form-control" placeholder="Username" value=""/> <input type="password" ng-model="add_password" tabindex="2" class="form-control" placeholder="Password"> <input type="password" 
ng-model="confirm_password" tabindex="2" class="form-control" placeholder="Confirm Password"> <button tabindex="4" class="form-control btn btn-register" ng-click="doAdd()">Register Now</button> </form> </div> <div class="alert alert-warning" ng-if="msg && msg!='ok'"> <a href="#" class="close" data-dismiss="alert">×</a> <strong>警告!</strong>{{msg}} </div> <a href="#" id="register-form-link"><p>Dont have an account?</p><p>Register</p></a> </div> </div> </body> </html>
设置其页面交互代码(jQuery,用于切换登录/注册表单),project\public\javascripts\index.js
// Once the DOM is ready, wire the two links that switch between the login form
// and the registration form.
$(function() {
    // Clicking `linkSel` fades `showSel` in, fades `hideSel` out and moves the
    // 'active' class from `otherLinkSel` to the clicked link.
    function bindFormToggle(linkSel, showSel, hideSel, otherLinkSel) {
        $(linkSel).click(function(evt) {
            $(showSel).delay(100).fadeIn(100);
            $(hideSel).fadeOut(100);
            $(otherLinkSel).removeClass('active');
            $(this).addClass('active');
            evt.preventDefault();
        });
    }

    bindFormToggle('#login-form-link', "#login-form", "#register-form", '#register-form-link');
    bindFormToggle('#register-form-link', "#register-form", "#login-form", '#login-form-link');
});
设置路由project\routes\users.js
var express = require('express');
var router = express.Router();
var userDAO = require('../dao/userDAO');
// POST /users/login — check the submitted credentials against the stored account.
// On success the username is stored in the session and in a `username` cookie and
// {msg: 'ok'} is returned; otherwise {msg: <reason>} is returned.
router.post('/login', function(req, res) {
    var username = req.body.username;
    var password = req.body.password;
    userDAO.getByUsername(username, function (rows) {
        if (rows.length == 0) {
            res.json({msg:'用户不存在!请检查后输入'});
            return;
        }
        if (password !== rows[0].password) {
            res.json({msg:'用户名或密码错误!请检查后输入'});
            return;
        }
        req.session['username'] = username;
        res.cookie('username', username);
        res.json({msg: 'ok'});
    });
});
/*
 * POST /users/register — create an account.
 * Responds with {msg: '成功注册!请登录'} on success (the login page compares against
 * exactly this text) and with {msg: <reason>} otherwise.
 */
router.post('/register', function (req, res) {
    var add_user = req.body || {};
    // Reject missing or blank credentials up front: the user table declares both
    // columns NOT NULL, so passing them on would only fail inside the database layer.
    if (typeof add_user.username !== 'string' || add_user.username.trim() === '' ||
        typeof add_user.password !== 'string' || add_user.password === '') {
        res.json({msg: '用户名和密码不能为空!'});
        return;
    }
    // Check whether the user already exists first.
    userDAO.getByUsername(add_user.username, function (user) {
        if (user.length != 0) {
            res.json({msg: '用户已存在!'});
            return;
        }
        userDAO.add(add_user, function (success) {
            if (!success) {
                res.json({msg: '注册失败!请稍后重试'});
                return;
            }
            res.json({msg: '成功注册!请登录'});
        });
    });
});
// GET /users/logout — destroy the session and clear the `username` cookie.
// Author's note: the session-file-store destroy() does not remove the client cookie,
// so after logout the server would still receive the cookie, look for the matching
// session file and fail; the cookie is therefore cleared explicitly here.
router.get('/logout', function(req, res, next){
    function onDestroyed(err) {
        if (err) {
            res.json('退出登录失败');
            return;
        }
        res.clearCookie('username');
        res.json({result:'/index.html'});
    }
    req.session.destroy(onDestroyed);
});
module.exports = router;
二、建立新闻查询页面
在这个页面我们主要实现以下功能
1.对关键词的复合查询
2. 对单词的时间热度分析
project\public\news.html代码如下
<!DOCTYPE html>
<head>
<meta charset="utf-8" />
<title>news</title>
<link rel="stylesheet" href="http://cdn.bootcss.com/bootstrap/3.3.0/css/bootstrap.min.css">
<script src="https://cdn.staticfile.org/jquery/3.2.1/jquery.min.js"></script>
<script src="https://cdn.staticfile.org/popper.js/1.12.5/umd/popper.min.js"></script>
<script src="https://cdn.staticfile.org/twitter-bootstrap/4.1.0/js/bootstrap.min.js"></script>
<!-- <script src="../node_modules/angular/angular.min.js"></script>-->
<script src="https://cdn.staticfile.org/angular.js/1.4.6/angular.min.js"></script>
<script src="echarts.min.js"></script>
<script src='javascripts/dist/echarts-wordcloud.min.js'></script>
<link rel="stylesheet" type="text/css" href="stylesheets/news.css">
</head>
<body ng-app='news' ng-controller="news_Ctrl">
<script src="javascripts/news.js"></script>
<nav class="navbar navbar-inverse navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<a class="navbar-brand" href="#">News</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li ><a ng-click="showSearch()" ng-model="isShow">词条检索</a></li>
<li><a ng-click="showsearchline()">时间热度分析</a></li>
<li>
<a href="#" class="dropdown-toggle" data-toggle="dropdown">账号管理<span class="caret"></span></a>
<ul class="dropdown-menu">
<li class="dropdown-header">账号</li>
<li><a ng-click="logout()">退出登录</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<span ng-show="isShowtext" id="main1" style=" width: 1000px;height:600px;position:fixed; top:70px;left:80px"></span>
<div class="search-container" ng-show="isShow" ng-init="isShow=true" ng-init="isShowtext=false" >
<!--查询页面-->
<div class="search-container-include" ng-include="'search.html'"></div>
</div>
<div ng-show="isShows" id="main2" ng-init="wordcloud()" >
</div>
<div class="search-container2" ng-show="isShowtext" >
<form class="search-container-show" >
关键词: <input type="text" ng-model="searchline">
<button type="submit" ng-click="line()">查询</button>
</form>
</div>
</body>
public\search.html 该文件在news文件内被引用
<form class="form-horizontal" role="form">
<div class="row" style="margin-bottom: 10px;">
<label class="col-lg-2 control-label" >标题关键字</label>
<div class="col-lg-3">
<input type="text" class="form-control" placeholder="标题关键字" ng-model="$parent.title1">
</div>
<div class="col-lg-1">
<select class="form-control" autocomplete="off" ng-model="$parent.selectTitle">
<option selected="selected" >AND</option>
<option >OR</option>
</select>
</div>
<div class="col-lg-3">
<input type="text" class="form-control" placeholder="标题关键字" ng-model="$parent.title2">
</div>
</div>
<div class="row" style="margin-bottom: 10px;">
<label class="col-lg-2 control-label" >内容关键字</label>
<div class="col-lg-3">
<input type="text" class="form-control" placeholder="内容关键字" ng-model="$parent.content1">
</div>
<div class="col-lg-1">
<select class="form-control" autocomplete="off" ng-model="$parent.selectContent">
<option selected="selected" >AND</option>
<option >OR</option>
</select>
</div>
<div class="col-lg-3">
<input type="text" class="form-control" placeholder="内容关键字" ng-model="$parent.content2">
</div>
</div>
<div class="form-group">
<div class="col-md-offset-9">
<button type="submit" class="btn btn-default" ng-click="search()">查询</button>
</div>
</div>
</form>
<!--显示查询结果-->
<div class='formshow' ng-show="isisshowresult">
<table class="table table-striped">
<thead>
<tr>
<td>序号</td>
<td>标题</td>
<td>作者</td>
<!-- <td>内容</td>-->
<td>关键词</td>
<td>链接</td>
<td>发布时间</td>
</tr>
</thead>
<tbody>
<tr ng-repeat="(key, item) in items">
<td>{{index+key}}</td>
<td>{{item.title}}</td>
<td>{{item.author}}</td>
<!-- <td>{{item.content}}</td>-->
<td>{{item.keywords}}</td>
<td>{{item.url}}</td>
<td>{{item.publish_date}}</td>
</tr>
</tbody>
</table>
<div class="row">
<!-- <div class="form-group">-->
<div class="pull-left" style="margin-top: 12px;">
<button type="submit" class="btn btn-primary" ng-click="searchsortASC()" >发布时间升序</button>
<button type="submit" class="btn btn-primary" ng-click="searchsortDESC()">发布时间降序</button>
</div>
<!-- </div>-->
<div class="pull-right">
<nav>
<ul class="pagination">
<li>
<a ng-click="Previous()" role="button"><span role="button">上一页</span></a>
</li>
<li ng-repeat="page in pageList" ng-class="{active:isActivePage(page)}" role="button">
<a ng-click="selectPage(page)" >{{ page }}</a>
</li>
<li>
<a ng-click="Next()" role="button"><span role="button">下一页</span></a>
</li>
</ul>
</nav>
</div>
</div>
</div>
设置angular代码,public\javascripts\news.js
在这里我们使用了外部Echarts库,用来生成折线图、词云等图表
从而完成数据可视化
var app = angular.module('news', []);
app.controller('news_Ctrl', function ($scope, $http, $timeout) {
// Show the search page: reveal the form, hide the result table and the trend view,
// and reset every form field so the form is empty when the user comes back to it.
$scope.showSearch = function () {
    Object.assign($scope, {
        isShow: true,
        isShow2: true,
        isisshowresult: false,
        isShow22: false,
        isShowtext: false,
        title1: undefined,
        title2: undefined,
        selectTitle: 'AND',
        content1: undefined,
        content2: undefined,
        selectContent: 'AND',
        sorttime: undefined
    });
};
// Run a search with the current form values and show the first page of results.
// If the server answers with message 'data' the rows are paginated and displayed;
// any other answer carries a URL in `result` and the browser is sent there.
$scope.search = function () {
    $scope.isShows = false;
    var title1 = $scope.title1;
    var title2 = $scope.title2;
    var content1 = $scope.content1;
    var content2 = $scope.content2;
    // The user may have typed the only search term into the second box
    // ("___ AND/OR term"); treat it as the first term in that case.
    if (typeof title1 == "undefined" && typeof title2 != "undefined" && title2.length > 0) {
        title1 = title2;
    }
    if (typeof content1 == "undefined" && typeof content2 != "undefined" && content2.length > 0) {
        content1 = content2;
    }
    // With no search term at all the server returns every row.
    // Values are URL-encoded so that characters such as & # % or spaces in a
    // keyword cannot break or truncate the query string. Empty fields are still
    // sent as the text "undefined", exactly as before.
    var params = [
        ['t1', title1], ['ts', $scope.selectTitle], ['t2', title2],
        ['c1', content1], ['cs', $scope.selectContent], ['c2', content2],
        ['stime', $scope.sorttime]
    ];
    var myurl = '/news/search?' + params.map(function (pair) {
        return pair[0] + '=' + encodeURIComponent(pair[1]);
    }).join('&');
    $http.get(myurl).then(
        function (res) {
            if (res.data.message == 'data') {
                $scope.isisshowresult = true; // show the result table
                $scope.initPageSort(res.data.result)
            } else {
                window.location.href = res.data.result;
            }
        }, function (err) {
            $scope.msg = err.data;
        });
};
// Pagination: store the result rows, compute the page count and show page 1.
// `item` is the array of result rows returned by the search request.
$scope.initPageSort = function (item) {
    var rowsPerPage = 5; // rows shown per page; can be changed freely
    $scope.pageSize = rowsPerPage;
    $scope.selPage = 1;
    $scope.data = item;
    $scope.pages = Math.ceil($scope.data.length / $scope.pageSize); // number of pages
    $scope.pageList = []; // at most 5 page numbers are listed at a time
    $scope.index = 1;
    var visiblePages = Math.min($scope.pages, 5);
    $scope.pageList = Array.from({length: visiblePages}, function (unused, idx) {
        return idx + 1;
    });
    // Rows of the first page.
    $scope.items = $scope.data.slice(0, $scope.pageSize);
};
// Jump to `page`: refresh the visible page numbers, the running row index and
// the rows shown in the table. Requests outside 1..pages are ignored (the first
// page has no previous page and the last page has no next page).
$scope.selectPage = function (page) {
    if (page < 1 || page > $scope.pages) return;
    // Show a window of up to 5 page numbers, centred on the current page where
    // possible. The window never starts below page 1, so going to page 2 keeps
    // page 1 in the list.
    var firstPage = Math.max(1, page - 2);
    var lastPage = Math.min($scope.pages, firstPage + 4);
    var pageList = [];
    for (var i = firstPage; i <= lastPage; i++) {
        pageList.push(i);
    }
    $scope.index = (page - 1) * $scope.pageSize + 1;
    $scope.pageList = pageList;
    $scope.selPage = page;
    // Rows belonging to the selected page.
    $scope.items = $scope.data.slice(($scope.pageSize * (page - 1)), (page * $scope.pageSize));
    console.log("选择的页:" + page);
};
// True for the currently selected page; drives the 'active' class of the page links.
$scope.isActivePage = function (page) {
    var current = $scope.selPage;
    return current == page;
};
// Go to the previous page.
$scope.Previous = function () {
    var target = $scope.selPage - 1;
    $scope.selectPage(target);
};
// Go to the next page.
$scope.Next = function () {
    var target = $scope.selPage + 1;
    $scope.selectPage(target);
};
// Re-run the search with the sort flag sent to the server as `stime`:
// '1' for the ascending button, '2' for the descending button.
function searchSortedBy(flag) {
    $scope.sorttime = flag;
    $scope.search();
}
$scope.searchsortASC = function () {
    searchSortedBy('1');
};
$scope.searchsortDESC = function () {
    searchSortedBy('2');
};
// Request the per-date counts for the entered keyword and draw them as a line
// chart in #main1. If the server answers with message 'url', the browser is sent
// to the URL in `result` instead.
$scope.line = function () {
    var line_keyword = $scope.searchline;
    // Encode the keyword so characters such as & # % cannot break the query string.
    var myurl = '/news/line?keyword=' + encodeURIComponent(line_keyword);
    $scope.isShow = false;
    $scope.isShow2 = false;
    $scope.isShow22 = false;
    $http.get(myurl).then(
        function (res) {
            if (res.data.message == 'url') {
                window.location.href = res.data.result;
                return;
            }
            var myChart = echarts.init(document.getElementById("main1"));
            // Declared locally; the original assignment created an implicit global.
            var option = {
                title: {
                    text: `“${line_keyword}”该词在新闻中的出现次数随时间变化图`
                },
                xAxis: {
                    type: 'category',
                    data: Object.keys(res.data.result)
                },
                yAxis: {
                    type: 'value'
                },
                series: [{
                    data: Object.values(res.data.result),
                    type: 'line',
                    itemStyle: {normal: {label: {show: true}}}
                }],
            };
            myChart.setOption(option, true);
        }, function (err) {
            // Surface request failures the same way search() does.
            $scope.msg = err.data;
        });
};
// Switch to the keyword-trend view: show its input form, hide the search form
// and the word cloud, and clear the keyword box.
$scope.showsearchline = function () {
    Object.assign($scope, {
        isShowtext: true,
        isShows: false,
        isShow: false,
        isShow2: false,
        isShow22: false,
        searchline: undefined
    });
};
// Load the word frequencies of all articles and render them as a word cloud in
// #main2. If the server answers with message 'url', the browser is sent to the
// URL in `result` instead.
$scope.wordcloud = function () {
    $scope.isisshowresult = false;
    $scope.isShows = true;
    $http.get("/news/wordcloud").then(
        function (res) {
            if (res.data.message == 'url') {
                window.location.href = res.data.result;
                return;
            }
            var mainContainer = document.getElementById('main2');
            var chart = echarts.init(mainContainer);
            // The square root is plotted rather than the raw count.
            var data = Object.keys(res.data.result).map(function (name) {
                return {
                    name: name,
                    value: Math.sqrt(res.data.result[name])
                };
            });
            var maskImage = new Image();
            var option = {
                title: {
                    text: '所有新闻内容的词云展示'
                },
                series: [{
                    type: 'wordCloud',
                    sizeRange: [10, 60],
                    width: 1000,
                    height: 500,
                    rotationRange: [-90, 90],
                    rotationStep: 45,
                    gridSize: 2,
                    shape: 'circle',
                    maskImage: maskImage,
                    drawOutOfBound: false,
                    textStyle: {
                        normal: {
                            fontFamily: 'sans-serif',
                            fontWeight: 'bold',
                            // Random dark-ish colour for every word.
                            color: function () {
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        emphasis: {
                            shadowBlur: 10,
                            shadowColor: '#333'
                        }
                    },
                    data: data
                }]
            };
            function render() {
                chart.clear();
                chart.setOption(option);
            }
            // Draw once the mask image is available.
            maskImage.onload = render;
            // If the mask image cannot be loaded, still draw the cloud (without a
            // mask) instead of leaving the page empty.
            maskImage.onerror = function () {
                delete option.series[0].maskImage;
                render();
            };
            // Start loading only after both handlers are attached.
            maskImage.src = './images/logo2.png';
            window.onresize = function () {
                chart.resize();
            };
        }, function (err) {
            // Surface request failures the same way search() does.
            $scope.msg = err.data;
        });
};
});
设置路由
routes\news.js
var newsDAO = require('../dao/newsDAO');
var express = require('express');
var router = express.Router();
var mywordcutModule = require('./wordcut.js');
var myfreqchangeModule = require('./freqchange.js');
// GET /news/search — run the composite title/content search for a logged-in user.
// Responds with {message:'data', result: rows}, or with {message:'url', result:'/index.html'}
// when nobody is logged in so that the page can redirect to the login form.
router.get('/search', function(request, response) {
    console.log(request.session['username']);
    if (request.session['username'] === undefined) {
        response.json({message:'url', result:'/index.html'});
        return;
    }
    var param = request.query;
    newsDAO.search(param, function (err, result, fields) {
        if (err) {
            // Report database failures instead of answering with an undefined result.
            console.log(err);
            response.status(500).json({message:'error', result:[]});
            return;
        }
        response.json({message:'data', result:result});
    })
});
// Escapes a value for safe inclusion in an SQL string (provided by the mysql package).
var escapeSqlValue = require('mysql').escape;

// GET /news/line?keyword=... — per-date occurrence counts of `keyword` in article
// contents, for a logged-in user.
// Responds with {message:'data', result: {date: count}}, or with
// {message:'url', result:'/index.html'} when nobody is logged in.
router.get('/line', function(request, response) {
    console.log(request.session['username']);
    if (request.session['username'] === undefined) {
        response.json({message:'url', result:'/index.html'});
        return;
    }
    var keyword = request.query.keyword;
    if (typeof keyword !== 'string' || keyword === '') {
        // Nothing to search for (missing, empty or repeated parameter).
        response.json({message:'data', result:{}});
        return;
    }
    // The keyword comes from the client, so it is escaped before being placed in
    // the statement; plain concatenation would allow SQL injection.
    var fetchSql = "select content,publish_date from fetches where content like " +
        escapeSqlValue('%' + keyword + '%') + " order by publish_date;";
    newsDAO.query_noparam(fetchSql, function (err, result, fields) {
        if (err) {
            console.log(err);
            response.status(500).json({message:'error', result:{}});
            return;
        }
        response.writeHead(200, {
            "Content-Type": "application/json",
            "Cache-Control": "no-cache, no-store, must-revalidate",
            "Pragma": "no-cache",
            "Expires": 0
        });
        response.write(JSON.stringify({message:'data', result:myfreqchangeModule.freqchange(result, keyword)}));
        response.end();
    });
});
// GET /news/wordcloud — word frequencies over the content of every stored article,
// for a logged-in user.
// Responds with {message:'data', result: {word: count}}, or with
// {message:'url', result:'/index.html'} when nobody is logged in.
router.get('/wordcloud', function(request, response) {
    console.log(request.session['username']);
    if (request.session['username'] === undefined) {
        response.json({message:'url', result:'/index.html'});
        return;
    }
    var fetchSql = "select content from fetches;";
    newsDAO.query_noparam(fetchSql, function (err, result, fields) {
        if (err) {
            // Report database failures instead of segmenting an undefined result.
            console.log(err);
            response.status(500).json({message:'error', result:{}});
            return;
        }
        response.writeHead(200, {
            "Content-Type": "application/json",
            "Cache-Control": "no-cache, no-store, must-revalidate",
            "Pragma": "no-cache",
            "Expires": 0
        });
        response.write(JSON.stringify({message:'data', result:mywordcutModule.wordcut(result)})); // processed data
        response.end();
    });
});
module.exports = router;
routes\wordcut.js
// 载入模块
// var nodejieba = require('nodejieba');
var Segmenter = require('node-analyzer');
// Characters removed before segmentation: whitespace, ASCII word characters and
// digits, and common ASCII/CJK punctuation.
const regex = /[\t\s\r\n\d\w]|[\+\-\(\),\.。,!?《》@、【】"'::%-\/“”]/g;

/**
 * Count how often each segmented word occurs across all rows.
 *
 * @param {Array<{content: ?string}>} vals - rows with a `content` field; rows whose
 *   content is null/undefined or empty after cleaning are skipped.
 * @returns {Object} map from word to its number of occurrences.
 */
var wordcut = function(vals) {
    var segmenter = new Segmenter();
    var word_freq = {};
    vals.forEach(function (row) {
        var rawText = row['content'];
        if (rawText == null) return;
        var cleaned = rawText.replace(regex, '');
        if (cleaned.length === 0) return;
        // The segmenter returns the words separated by single spaces.
        var tokens = segmenter.analyze(cleaned).split(' ');
        for (var i = 0; i < tokens.length; i++) {
            var key = tokens[i].toString();
            word_freq[key] = (word_freq[key] + 1) || 1;
        }
    });
    return word_freq;
};
exports.wordcut = wordcut;
routes\freqchange.js
// var mysql = require('../mysql.js');
// Keyword-trend helper.
// Characters removed from the content before counting: whitespace, ASCII word
// characters and digits, and common ASCII/CJK punctuation. A keyword made of
// ASCII letters or digits therefore never matches.
const regex_c = /[\t\s\r\n\d\w]|[\+\-\(\),\.。,!?《》@、【】"'::%-\/“”]/g;
// Captures the "<Mon> <day>" part of a date rendered as e.g. "Wed Apr 28 2021 ...".
// Any 4-digit year is accepted. The year is not part of the key, so the same
// calendar day in different years is counted together.
var regex_d = /\w{3}\s(.*?) \d{4}/;

/**
 * Count the occurrences of `keyword` per publication date.
 *
 * @param {Array<{content: ?string, publish_date: *}>} vals - rows to scan; rows with
 *   no content or with a date that cannot be recognised are skipped.
 * @param {string} keyword - literal text to count (it is NOT treated as a regular
 *   expression and is never evaluated as code).
 * @returns {Object} map from "<Mon> <day>" to the number of occurrences on that
 *   date; empty when `keyword` is missing or empty or `vals` is not an array.
 */
var freqchange = function(vals, keyword) {
    var word_freq = {};
    if (!Array.isArray(vals) || typeof keyword !== 'string' || keyword.length === 0) {
        return word_freq;
    }
    // Build the matcher from the escaped keyword. The keyword is user input, so it
    // must neither be passed to eval() nor be interpreted as a pattern.
    var regex_k = new RegExp(keyword.replace(/[.*+?^${}()|[\]\\\/]/g, '\\$&'), 'g');
    vals.forEach(function (data) {
        if (data == null || data["content"] == null) return;
        var dateMatch = regex_d.exec(String(data['publish_date']));
        if (!dateMatch) return;
        var publish_date = dateMatch[1];
        var content = data["content"].replace(regex_c, '');
        var hits = content.match(regex_k); // null when the keyword does not occur
        var freq = hits ? hits.length : 0;
        // Start from 0 for a new date and then add, so the first row is counted too.
        word_freq[publish_date] = (word_freq[publish_date] || 0) + freq;
    });
    return word_freq;
};
exports.freqchange = freqchange;
三、其他文件的完善
project\app.js代码如下
var createError = require('http-errors');
var express = require('express');
var path = require('path');
var cookieParser = require('cookie-parser');
var session = require('express-session');
var logger = require('morgan');
var logDAO = require('./dao/logDAO.js');
// var fs = require('fs');//加了文件操作的模块
// var accessLogStream = fs.createWriteStream(path.join(__dirname, 'access.log'), { flag: 'a' });//创建一个写文件流,并且保存在当前文件夹的access.log文件中
// var indexRouter = require('./routes/users');
var usersRouter = require('./routes/users');
var newsRouter = require('./routes/news');
var app = express();
// Session setup.
app.use(session({
secret: 'sessiontest',// author's note: should match the one used by cookieParser — NOTE(review): cookieParser() is called without a secret in this file
resave: true,
saveUninitialized: false, // whether to save sessions that have not been initialized
cookie : {
maxAge : 1000 * 60 * 60, // session lifetime in milliseconds (one hour)
},
}));
// view engine setup
// app.set('views', path.join(__dirname, 'views'));
// app.set('view engine', 'ejs');
let method = '';
// Custom morgan format function: prints the request details to the console and,
// for logged-in users, stores the request in MySQL through logDAO.userlog().
// It returns nothing, so morgan itself has no log line to write.
app.use(logger(function (tokens, req, res) {
    console.log('打印的日志信息:');
    var request_time = new Date();
    var request_method = tokens.method(req, res);
    var request_url = tokens.url(req, res);
    var status = tokens.status(req, res);
    var remote_addr = tokens['remote-addr'](req, res);
    // Requests without a session, or without a logged-in user, are 'notlogin'.
    var username = (req.session && req.session['username']) || 'notlogin';
    // Store the action of logged-in users directly in mysql.
    if (username != 'notlogin') {
        var logRow = [username, request_time, request_method, request_url, status, remote_addr];
        logDAO.userlog(logRow, function (success) {
            console.log('成功保存!');
        });
    }
    console.log('请求时间 = ', request_time);
    console.log('请求方式 = ', request_method);
    console.log('请求链接 = ', request_url);
    console.log('请求状态 = ', status);
    console.log('请求长度 = ', tokens.res(req, res, 'content-length'));
    console.log('响应时间 = ', tokens['response-time'](req, res) + 'ms');
    console.log('远程地址 = ', remote_addr);
    console.log('远程用户 = ', tokens['remote-user'](req, res));
    console.log('http版本 = ', tokens['http-version'](req, res));
    console.log('浏览器信息 = ', tokens['user-agent'](req, res));
    console.log('用户 = ', username);
    console.log(' ===============', method);
}));
// Parse JSON and URL-encoded request bodies, and cookies.
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(cookieParser());
// Serve the static site (index.html, news.html, scripts, styles) from ./public.
app.use(express.static(path.join(__dirname, 'public')));
// Expose the locally installed angular package under /angular.
app.use('/angular', express.static(path.join(__dirname , '/node_modules/angular')));
// app.use('/', indexRouter);
// API routes: account handling under /users, search and analysis under /news.
app.use('/users', usersRouter);
app.use('/news', newsRouter);
// No route or static file matched: forward a 404 to the error handler.
app.use(function(req, res, next) {
    next(createError(404));
});
// Error handler.
app.use(function(err, req, res, next) {
    // If a response has already started, let Express' default handler finish it.
    if (res.headersSent) {
        next(err);
        return;
    }
    // Set locals, only providing the error object in development.
    res.locals.message = err.message;
    res.locals.error = req.app.get('env') === 'development' ? err : {};
    res.status(err.status || 500);
    // The view engine is disabled in this app, so send a JSON body instead of
    // rendering a page; without sending anything the request would never finish.
    res.json({ message: res.locals.message });
});
module.exports = app;
四、引用js外部文件
在数据可视化这方面,我们需要引用echarts库,所以我们需要下载echarts.min.js文件,保存在public文件夹下。
由于使用了angular,所以也要下载angular.min.js保存在public\angular文件夹下。
词云的实现也依赖于外部库,所以下载echarts-wordcloud.min.js保存在public\javascripts\dist文件夹下(与news.html中引用的路径javascripts/dist/echarts-wordcloud.min.js一致)
五、css美化布局
这部分就看自己的审美了
当前没有样式,增加样式需要在
public/stylesheets/ 下新建index.css文件
然后在public/index.html导入
导入方法
将<!-- <link rel="stylesheet" type="text/css" href="stylesheets/index.css">
-->注释打开即可
六、npm出现的一些问题
在npm运行一些代码时可能会出现 Error: Cannot find module 'node-analyzer' 类似的错误
可以使用 npm install node-analyzer --save 来解决
最终效果
在project中打开命令行,输入node bin/www
然后在浏览器上打开http://localhost:3000/index.html网址,就可以看到最终效果了(当前效果为转载文章效果,该大佬也未提供样式文件,/(ㄒoㄒ)/~~,只能自力更生了)
转自:https://zhuanlan.zhihu.com/p/369031469
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了