北京市政百姓信件分析实战
北京市政百姓信件分析实战
1.爬虫
import json
import requests
import demjson
from bs4 import BeautifulSoup
import jieba
import jieba.analyse as analyse
import csv
kv = {
'Host': 'www.beijing.gov.cn',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Content-Type': 'text/json',
'X-Requested-With': 'XMLHttpRequest',
'Content-Length': '155',
'Origin': 'http://www.beijing.gov.cn',
'Connection': 'keep-alive',
'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'}
kv1 = {'user-agent': 'Mozilla/5.0'}
if __name__ == "__main__":
page=1
query = {
}
datas = json.dumps(query)
while page<175:
print(page)
url = "https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!replyLetterListJson.action?page.pageNo="+str(page)+"&page.pageSize=6&orgtitleLength=26"
r = requests.post(url, data=datas, headers=kv)
rr = demjson.decode(r.text);
for item in rr["result"]:
originalId=item["originalId"]#编号
letterTypeName=item["letterTypeName"]#信件类型
#咨询
url1 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="+originalId
#建议
url2 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="+originalId
if letterTypeName=="咨询":
r1 = requests.get(url1, headers=kv1)
else:
r1 = requests.get(url2, headers=kv1)
demo = r1.text
soup = BeautifulSoup(demo, "html.parser")
title=soup.find("strong").get_text().replace("\n","")
fromPeople=soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})[0].get_text().lstrip('来信人:').lstrip().rstrip()
fromTime=soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})[0].get_text().lstrip('时间:')
problem=soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})[0].get_text().lstrip().rstrip().replace("\r","").replace("\n","")
office=soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})[0].get_text().replace("\n","")
answerTime=soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})[0].get_text().lstrip('答复时间:')
answer=soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})[0].get_text().lstrip().rstrip().replace("\n","").replace("\r","")
#json格式的导进去对不上
# item1={
# 'originalId':originalId,
# 'letterTypeName':letterTypeName,
# 'title':title,
# 'fromPeople':fromPeople,
# 'fromTime':fromTime,
# 'problem':problem,
# 'office':office,
# 'answerTime':answerTime,
# 'answer':answer
# }
# itemm=str(item1)
#全部爬下来的
itemm=originalId+"|"+letterTypeName+"|"+title+"|"+fromPeople+"|"+fromTime+"|"+problem+"|"+office+"|"+answerTime+"|"+answer
fp = open("yijian.txt", 'a', encoding='utf-8')
fp.write(itemm + '\n')
fp.close()
page=page+1
2.数据清洗
爬取下来的txt文件内容如下:
然后需要对txt文件进行清洗,这里由于爬下来的数据比较规范,所以不采用MapReduce进行清洗,选择使用csv导入分隔符由图可知选择使用‘|’,导入后会自动划分数据,如下图所示:
3.导入数据
然后将test.csv上传到Hadoop的指定目录,这里我上传到了/export/data/test.csv
下,然后启动Hadoop、hive、beeline、zookeeper集群等,启动完成后在beeline里创建hive数据库:
create datebase edu3;
然后跳转到要使用的数据库并创建汇总表,用于将所有数据储存:
create table govdata(
leixing string,
biaoti string,
laixinren string,
shijian string,
number string,
problem string,
offic string,
officpt string,
officp string)
row format delimited
fields terminated by ',';
然后将刚才上传的csv文件导入到表里(这里要记得要在终端使用命令,不要在beeline里使用):
load data local inpath '/export/data/test.csv' into table govdata;
可以用一下命令检查一下输出信息对不对,应该是可以变成理想的输出信息:
select substr(shijian,0,4) as dt,
count(1) as num from govdata group by substr(shijian,0,4);
select leixing,
count(1) as num from govdata group by leixing order by num desc;
select govdata.offic,
count(1) as num from govdata group by offic order by num desc;
验证完输出信息后创建第一个表来储存第一张图,并将信息保存到这个表里:
create table lenum(
year string,
num int
)
row format delimited
fields terminated by ','
stored as textfile;
insert into table lenum
select
substr(shijian,0,4) as dt,
count(1) as num
from govdata
group by substr(shijian,0,4)
order by num;
然后是第二张表:
create table typenums(
leixing string,
num int
)
row format delimited
fields terminated by ','
stored as textfile;
insert into typenums
select
leixing,
count(1) as num
from govdata
group by leixing
order by num desc;
第三张表:
create table govnums(
govname string,
num int
)
row format delimited
fields terminated by ','
stored as textfile;
insert into govnums
select
govdata.offic,
count(1) as num
from govdata
group by offic
order by num desc;
最后可以执行数据库查询语句查看信息是否正确保存:
select * from 表名 limit 10;
4.导出到本地mysql
这里的导出使用sqoop来进行导出操作,直接在终端执行命令即可。
这里要记得要提前在本地数据库建好对应的表,表的数据类型要一一对应。
第一张表:
sqoop export \
--connect jdbc:mysql://47.92.246.30:3306/edu4out?characterEncoding=UTF-8 \
--username root \
--password Lht+2023kybs=v. \
--table lenumsql \
--export-dir /user/hive/warehouse/edu3.db/lenum/000000_0 \
--input-fields-terminated-by ',';
第二张表:
sqoop export \
--connect jdbc:mysql://47.92.246.30:3306/edu4out?characterEncoding=UTF-8 \
--username root \
--password Lht+2023kybs=v. \
--table typenum \
--export-dir /user/hive/warehouse/edu3.db/typenums/000000_0 \
--input-fields-terminated-by ',';
第三张表:
sqoop export \
--connect jdbc:mysql://47.92.246.30:3306/edu4out?characterEncoding=UTF-8 \
--username root \
--password Lht+2023kybs=v. \
--table govnum \
--export-dir /user/hive/warehouse/edu3.db/govnums/000000_0 \
--input-fields-terminated-by ',';
汇总表:
sqoop export \
--connect jdbc:mysql://47.92.246.30:3306/edu4out?characterEncoding=UTF-8 \
--username root \
--password Lht+2023kybs=v. \
--table govdata \
--export-dir /user/hive/warehouse/edu3.db/govdata/test.csv \
--input-fields-terminated-by ',';
5.可视化展示
这里采用springboot架构来进行设计,后端代码不再展示,仅展示前端代码,后端只需要查询数据库所有信息并传输给前端即可,记得导入vue.js
、axios.js
、echarts.js
。
<!doctype html>
<html lang="en" xmlns:th="http://www.thymeleaf.org">
<head>
<!-- Required meta tags -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>欢迎来到三刘一陈的网站</title>
<!-- Favicon -->
<link rel="shortcut icon" th:href="@{images/favicon.ico}"/>
<!-- Bootstrap CSS -->
<link rel="stylesheet" th:href="@{css/bootstrap.min.css}">
<!-- Typography CSS -->
<link rel="stylesheet" th:href="@{css/typography.css}">
<!-- Style CSS -->
<link rel="stylesheet" th:href="@{css/style.css}">
<!-- Responsive CSS -->
<link rel="stylesheet" th:href="@{css/responsive.css}">
<script src="https://cdn.staticfile.org/echarts/4.3.0/echarts.min.js"></script>
</head>
<body class="sidebar-main-active right-column-fixed header-top-bgcolor">
<!-- loader Start -->
<div id="loading">
<div id="loading-center">
</div>
</div>
<!-- loader END -->
<!-- Wrapper Start -->
<div class="wrapper" id="app">
<!-- Sidebar -->
<!--<div th:replace="~{commons/commons::sidebar}"></div>-->
<!-- TOP Nav Bar -->
<!-- <div th:replace="~{commons/commons::topnavbar}"></div>-->
<!-- TOP Nav Bar END -->
<!-- Page Content -->
<div >
<div class="container-fluid">
<div class="row">
<div class="col-md-6 col-lg-7">
<div class="iq-card iq-card-block iq-card-stretch iq-card-height overflow-hidden">
<div class="iq-card-body p-0">
<div >
<div id="main1" style="width: 850px;height:700px;"></div>
</div>
</div>
</div>
</div>
<div class="col-md-6 col-lg-5">
<div class="row">
<div class="iq-card iq-card-block iq-card-stretch iq-card-height">
<div class="iq-card-body p-0">
<div >
<div id="main2" style="width: 675px;height:350px;"></div>
</div>
</div>
</div>
<div class="iq-card iq-card-block iq-card-stretch iq-card-height">
<div class="iq-card-body p-0">
<div >
<div id="main3" style="width: 675px;height:350px;"></div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Wrapper END -->
<!-- Footer -->
<!--<div th:replace="~{commons/commons::footer}"></div>-->
<!-- Footer END -->
<!-- Optional JavaScript -->
<!-- jQuery first, then Popper.js, then Bootstrap JS -->
<div th:replace="~{commons/commons::js}"></div>
<script th:src="@{js/vue.js}"></script>
<script th:src="@{js/axios.js}"></script>
<script>
new Vue({
el: "#app",
mounted() {
var myChart1 = echarts.init(document.getElementById('main1'));
var myChart2 = echarts.init(document.getElementById('main2'));
var myChart3 = echarts.init(document.getElementById('main3'));
// 绘制图表1
var option1=({
title: {text: '柱状图'}, //{text: 'ECharts入门示例'},
xAxis: {
type: 'category',
data: ''
},
yAxis: {
type: 'value'
},
series:[
{
data: '',
type: 'bar',
showBackground: true,
backgroundStyle: {
color: 'rgb(2,145,248)'
}
}
]
});
// 绘制图表2
var option2=({
title: {text: '折线图'},
xAxis: {
type: 'category',
data: ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
},
yAxis: {
type: 'value'
},
series: [
{
data: [820, 932, 901, 934, 1290, 1330, 1320],
type: 'line',
smooth: true
}
]
});
// 绘制图表3
var option3=({
title: {text: '饼图'},
tooltip: {
trigger: 'item'
},
legend: {
top: '5%',
left: 'center'
},
series: [
{
data: [
{ value: 1048, name: 'Search Engine' },
{ value: 735, name: 'Direct' },
{ value: 580, name: 'Email' },
{ value: 484, name: 'Union Ads' },
{ value: 300, name: 'Video Ads' }
],
name: 'Access From',
type: 'pie',
radius: ['40%', '70%'],
avoidLabelOverlap: false,
itemStyle: {
borderRadius: 10,
borderColor: '#fff',
borderWidth: 2
},
label: {
show: false,
position: 'center'
},
emphasis: {
label: {
show: true,
fontSize: 40,
fontWeight: 'bold'
}
},
labelLine: {
show: false
},
}
]
});
axios.get('http://localhost:8080/gov/queryGov').then(resp => {
var govname=[];
for(var i=0;i<resp.data.length;i++){
govname.push(resp.data[i].govname);
}
console.log(govname);
option1.xAxis.data=govname;
var num=[];
for(var i=0;i<resp.data.length;i++){
num.push(resp.data[i].num);
}
console.log(num);
option1.series[0].data=num;
myChart1.setOption(option1);
});
axios.get('http://localhost:8080/le/queryLe').then(resp => {
var time=[];
for(var i=0;i<resp.data.length;i++){
time.push(resp.data[i].time);
}
console.log(time);
option2.xAxis.data=time;
var number=[];
for(var i=0;i<resp.data.length;i++){
number.push(resp.data[i].number);
}
console.log(number);
option2.series[0].data=number;
myChart2.setOption(option2);
});
axios.get('http://localhost:8080/type/queryType').then(resp => {
var leixing=[];
// for(var i=0;i<resp.data.length;i++){
// leixing.push(resp.data[i].leixing);
// }
// option3.series[0].data[1].name=leixing;
//
// var num=[];
// for(var i=0;i<resp.data.length;i++){
// num.push(resp.data[i].num);
// }
// console.log(num);
// option3.series[0].data[0].value=num;
//{ value: 1048, name: 'Search Engine' }
option3.series[0].data=[];
for(var i=0;i<resp.data.length;i++){
let str='{ value:'+resp.data[i].num+',name:"'+ resp.data[i].leixing +'"}';
// let str='{ value:333,name:"aaa" }';
option3.series[0].data.push(eval('('+str+')'));
console.log(eval('('+str+')'));
}
myChart3.setOption(option3);
});
},
})
</script>
</body>
</html>
最终界面展示如下: