企业发票异常分析
一、 数据说明:
1、数据组成
(1)增值税发票数据,文件名zzsfp
(2)发票对应货物明细数据,文件名zzsfp_hwmx
(3)企业信息,文件名nsrxx
2、数据字段说明
(1)zzsfp表字典
字段名称 |
字段含义 |
数据类型 |
备注 |
fp_nid |
发票id |
String |
发票唯一标识 |
xf_id |
销方识别号 |
String |
企业唯一身份标识 |
gf_id |
购方识别号 |
String |
企业唯一身份标识 |
je |
金额 |
Double |
|
se |
税额 |
Double |
|
jshj |
价税合计 |
Double |
|
kpyf |
开票月份 |
String |
|
kprq |
开票日期 |
String |
|
zfbz |
作废标志 |
String |
‘Y’代表作废 |
zzsfp表内容($ less zzsfp)
(2)zzsfp_hwmx表
字段名称 |
字段含义 |
数据类型 |
备注 |
fp_nid |
发票id |
String |
发票唯一标识 |
date_key |
开票月份 |
String |
|
hwmc |
货物名称 |
String |
|
ggxh |
规格型号 |
String |
|
dw |
单位 |
String |
|
sl |
数量 |
Double |
|
dj |
单价 |
Double |
|
je |
金额 |
Double |
|
se |
税额 |
Double |
|
spbm |
商品编码 |
String |
|
zzsfp_hwmx表内容($ less zzsfp_hwmx)
(3)nsrxx表
字段名称 |
字段含义 |
数据类型 |
备注 |
hydm |
行业代码 |
String |
|
nsr_id |
纳税人id |
String |
企业唯一身份标识 |
djzclx_dm |
登记注册类型代码 |
String |
网上可查阅相关代码含义 |
kydjrq |
开业登记日期 |
String |
|
xgrq |
修改日期 |
String |
给企业打标签的时间 |
label |
标签 |
String |
‘0’代表正常企业 ‘1’代表问题企业 |
nsrxx表内容($ less nsrxx)
3、关联数据的必要说明
(1)zzsfp表可通过fp_nid进行关联
(2)zzsfp表可通过xf_id或者gf_id与nsrxx中的nsr_id进行关联,分离出销项发票表和进项发票表
二、 测试要求:
1、数据导入:
---nsrxx
--建表
CREATE TABLE `nsrxx` ( `hydm` string ,
`nsr_id` string,
`djzclx_dm` string,
`kydjrq` string,
`xgrq` string ,
`label` string
) row format
delimited fields terminated by ',' STORED AS TEXTFILE;
load data
local inpath '/kkb/data/import-hive/nsrxx.csv' into table nsrxx;
---nsrxx
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/nsrxx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(hydm,2)as hydm,nsr_id,djzclx_dm,kydjrq,xgrq,substr(label,1,1)as label from nsrxx;
--建表
CREATE TABLE `nsrxx_01` ( `hydm` string ,
`nsr_id` string,
`djzclx_dm` string,
`kydjrq` string,
`xgrq` string ,
`label` string
) row format
delimited fields terminated by ',' STORED AS TEXTFILE;
load data
local inpath '/tmp/hive-result/nsrxx_01' into table nsrxx_01;
---zzsfp
--建表
CREATE TABLE `zzsfp` (
`fp_nid` string,
`xf_id` string,
`gf_id` string ,
`je` double ,
`se` double,
`jshj` double ,
`kpyf` string,
`kprq` string,
`zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/kkb/data/import-hive/zzsfp.csv' into table zzsfp;
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,xf_id,gf_id,je,se,jshj,kpyf,kprq,substr(zfbz,1,1)as zfbz from zzsfp;
--建表
CREATE TABLE `zzsfp_01` (
`fp_nid` string,
`xf_id` string,
`gf_id` string ,
`je` double ,
`se` double,
`jshj` double ,
`kpyf` string,
`kprq` string,
`zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/tmp/hive-result/zzsfp_01' into table zzsfp_01;
---zzsfp_hwmx
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_hwmx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,date_key,hwmc,ggxh,dw,sl,dj,je,se,substr(spbm,1,1)as spbm from zzsfp_hwmx;
--建表 zzsfp_hwmx
create table zzsfp_hwmx_01(
fp_nid String,
date_key String,
hwmc String,
ggxh String,
dw String,
sl String,
dj String ,
je String,
se String,
spbm String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/tmp/hive-result/zzsfp_hwmx_01' into table zzsfp_hwmx_01
2、数据分析:
企业异常的判断标准参考:
(1)、企业增值税发票进项与出项严重不符即出现只出不进或者只进不出的企业;
(2)企业发票数据与详细流水信息不符;
(3)个人上网查阅企业异常信息数据标准;
//销方的数据
create table xf_data
(xf_id String, xf_count String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/xf_data"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select xf_id , count(xf_id) from zzsfp group by xf_id;
load data local inpath '/tmp/hive-result/xf_data' into table xf_data;
//购方的数据
create table gf_data
(gf_id String, gf_count String )
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/gf_data"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select gf_id , count(gf_id) from zzsfp_01 group by gf_id;
load data local inpath '/tmp/hive-result/gf_data' into table gf_data;
//按照企业id,把购方销方总次数联合起来
drop table xf_gf_data;
create table xf_gf_data
(xf_gf_id String, xf_count String, gf_count String )
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
Insert into xf_gf_data(xf_gf_id,xf_count,gf_count)
Select xf_id as xf_gf_id,xf_data.xf_count as xf_count,gf_data.gf_count as gf_count from xf_data join gf_data on xf_id=gf_id;
create table fp_abs
(xf_gf_id String, abs_xg String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT INTO fp_abs (xf_gf_id, abs_xg)
SELECT
xf_gf_id,
ABS(xf_count - gf_count)
FROM
xf_gf_data;
看差值:
select count(xf_gf_id)
from fp_abs
where abs_xg>900;
查找企业发票作废次数存入zfbz_data:
create table zfbz_data
(xf_id String , gf_id String , zfcs String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
Insert into table zfbz_data(xf_id,gf_id,zfcs)
(select xf_id, gf_id ,count(zfbz) from zzsfp_01 where zfbz = 'Y' group by xf_id,gf_id);
Insert into table wenti(xf_id)
SELECT
xx.xf_gf_id
FROM
(SELECT
a.xf_gf_id
FROM
(SELECT
xf_gf_id
FROM
fp_abs
WHERE abs_xg > 400) AS a
JOIN
(SELECT
xf_id
FROM
zfbz_data
WHERE zfcs > 15) AS b
ON a.xf_gf_id = b.xf_id GROUP BY a.xf_gf_id) AS xx
JOIN
(SELECT
aa.xf_gf_id
FROM
(SELECT
xf_gf_id
FROM
fp_abs
WHERE abs_xg > 400) AS aa
JOIN
(SELECT
gf_id
FROM
zfbz_data
WHERE zfcs > 15) AS bb
ON aa.xf_gf_id = bb.gf_id GROUP BY aa.xf_gf_id) AS yy
ON xx.xf_gf_id = yy.xf_gf_id ;
3、处理结果入库:
将上述异常标准的结果分别汇总统计,并将结果数据导出到mySQL数据库中。
最终结果参考提示:
最终给出的数据情况
企业总数:33,829
非正常企业总数:318
4、数据可视化展示:
利用Echarts将上述统计结果以图形化展示的方式展现出来:饼图、柱状图、地图、折线图等。
三、 测试报告:
1、按照测试题目顺序,将实验步骤说明和结果截图存储到答题纸上。
石家庄铁道大学2021年秋季
2019 级课堂测试试卷-企业发票异常分析
课程名称:大型数据库应用技术 任课教师:王建民 测试时间:150分钟
一、 数据说明:
1、数据组成
(1)增值税发票数据,文件名zzsfp
(2)发票对应货物明细数据,文件名zzsfp_hwmx
(3)企业信息,文件名nsrxx
2、数据字段说明
(1)zzsfp表字典
字段名称 |
字段含义 |
数据类型 |
备注 |
fp_nid |
发票id |
String |
发票唯一标识 |
xf_id |
销方识别号 |
String |
企业唯一身份标识 |
gf_id |
购方识别号 |
String |
企业唯一身份标识 |
je |
金额 |
Double |
|
se |
税额 |
Double |
|
jshj |
价税合计 |
Double |
|
kpyf |
开票月份 |
String |
|
kprq |
开票日期 |
String |
|
zfbz |
作废标志 |
String |
‘Y’代表作废 |
zzsfp表内容($ less zzsfp)
(2)zzsfp_hwmx表
字段名称 |
字段含义 |
数据类型 |
备注 |
fp_nid |
发票id |
String |
发票唯一标识 |
date_key |
开票月份 |
String |
|
hwmc |
货物名称 |
String |
|
ggxh |
规格型号 |
String |
|
dw |
单位 |
String |
|
sl |
数量 |
Double |
|
dj |
单价 |
Double |
|
je |
金额 |
Double |
|
se |
税额 |
Double |
|
spbm |
商品编码 |
String |
|
zzsfp_hwmx表内容($ less zzsfp_hwmx)
(3)nsrxx表
字段名称 |
字段含义 |
数据类型 |
备注 |
hydm |
行业代码 |
String |
|
nsr_id |
纳税人id |
String |
企业唯一身份标识 |
djzclx_dm |
登记注册类型代码 |
String |
网上可查阅相关代码含义 |
kydjrq |
开业登记日期 |
String |
|
xgrq |
修改日期 |
String |
给企业打标签的时间 |
label |
标签 |
String |
‘0’代表正常企业 ‘1’代表问题企业 |
nsrxx表内容($ less nsrxx)
3、关联数据的必要说明
(1)zzsfp表可通过fp_nid进行关联
(2)zzsfp表可通过xf_id或者gf_id与nsrxx中的nsr_id进行关联,分离出销项发票表和进项发票表
二、 测试要求:
1、数据导入:
---nsrxx
--建表
CREATE TABLE `nsrxx` ( `hydm` string ,
`nsr_id` string,
`djzclx_dm` string,
`kydjrq` string,
`xgrq` string ,
`label` string
) row format
delimited fields terminated by ',' STORED AS TEXTFILE;
load data
local inpath '/kkb/data/import-hive/nsrxx.csv' into table nsrxx;
---nsrxx
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/nsrxx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(hydm,2)as hydm,nsr_id,djzclx_dm,kydjrq,xgrq,substr(label,1,1)as label from nsrxx;
--建表
CREATE TABLE `nsrxx_01` ( `hydm` string ,
`nsr_id` string,
`djzclx_dm` string,
`kydjrq` string,
`xgrq` string ,
`label` string
) row format
delimited fields terminated by ',' STORED AS TEXTFILE;
load data
local inpath '/tmp/hive-result/nsrxx_01' into table nsrxx_01;
---zzsfp
--建表
CREATE TABLE `zzsfp` (
`fp_nid` string,
`xf_id` string,
`gf_id` string ,
`je` double ,
`se` double,
`jshj` double ,
`kpyf` string,
`kprq` string,
`zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/kkb/data/import-hive/zzsfp.csv' into table zzsfp;
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,xf_id,gf_id,je,se,jshj,kpyf,kprq,substr(zfbz,1,1)as zfbz from zzsfp;
--建表
CREATE TABLE `zzsfp_01` (
`fp_nid` string,
`xf_id` string,
`gf_id` string ,
`je` double ,
`se` double,
`jshj` double ,
`kpyf` string,
`kprq` string,
`zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/tmp/hive-result/zzsfp_01' into table zzsfp_01;
---zzsfp_hwmx
---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_hwmx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,date_key,hwmc,ggxh,dw,sl,dj,je,se,substr(spbm,1,1)as spbm from zzsfp_hwmx;
--建表 zzsfp_hwmx
create table zzsfp_hwmx_01(
fp_nid String,
date_key String,
hwmc String,
ggxh String,
dw String,
sl String,
dj String ,
je String,
se String,
spbm String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
load data local inpath '/tmp/hive-result/zzsfp_hwmx_01' into table zzsfp_hwmx_01
2、数据分析:
企业异常的判断标准参考:
(1)、企业增值税发票进项与出项严重不符即出现只出不进或者只进不出的企业;
(2)企业发票数据与详细流水信息不符;
(3)个人上网查阅企业异常信息数据标准;
//销方的数据
create table xf_data
(xf_id String, xf_count String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/xf_data"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select xf_id , count(xf_id) from zzsfp group by xf_id;
load data local inpath '/tmp/hive-result/xf_data' into table xf_data;
//购方的数据
create table gf_data
(gf_id String, gf_count String )
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/gf_data"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select gf_id , count(gf_id) from zzsfp_01 group by gf_id;
load data local inpath '/tmp/hive-result/gf_data' into table gf_data;
//按照企业id,把购方销方总次数联合起来
drop table xf_gf_data;
create table xf_gf_data
(xf_gf_id String, xf_count String, gf_count String )
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
Insert into xf_gf_data(xf_gf_id,xf_count,gf_count)
Select xf_id as xf_gf_id,xf_data.xf_count as xf_count,gf_data.gf_count as gf_count from xf_data join gf_data on xf_id=gf_id;
create table fp_abs
(xf_gf_id String, abs_xg String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
INSERT INTO fp_abs (xf_gf_id, abs_xg)
SELECT
xf_gf_id,
ABS(xf_count - gf_count)
FROM
xf_gf_data;
看差值:
select count(xf_gf_id)
from fp_abs
where abs_xg>900;
查找企业发票作废次数存入zfbz_data:
create table zfbz_data
(xf_id String , gf_id String , zfcs String)
ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
Insert into table zfbz_data(xf_id,gf_id,zfcs)
(select xf_id, gf_id ,count(zfbz) from zzsfp_01 where zfbz = 'Y' group by xf_id,gf_id);
Insert into table wenti(xf_id)
SELECT
xx.xf_gf_id
FROM
(SELECT
a.xf_gf_id
FROM
(SELECT
xf_gf_id
FROM
fp_abs
WHERE abs_xg > 400) AS a
JOIN
(SELECT
xf_id
FROM
zfbz_data
WHERE zfcs > 15) AS b
ON a.xf_gf_id = b.xf_id GROUP BY a.xf_gf_id) AS xx
JOIN
(SELECT
aa.xf_gf_id
FROM
(SELECT
xf_gf_id
FROM
fp_abs
WHERE abs_xg > 400) AS aa
JOIN
(SELECT
gf_id
FROM
zfbz_data
WHERE zfcs > 15) AS bb
ON aa.xf_gf_id = bb.gf_id GROUP BY aa.xf_gf_id) AS yy
ON xx.xf_gf_id = yy.xf_gf_id ;
3、处理结果入库:
将上述异常标准的结果分别汇总统计,并将结果数据导出到mySQL数据库中。
最终结果参考提示:
最终给出的数据情况
企业总数:33,829
非正常企业总数:318
4、数据可视化展示:
利用Echarts将上述统计结果以图形化展示的方式展现出来:饼图、柱状图、地图、折线图等。