企业发票异常分析

 

一、          数据说明:

1、数据组成

(1)增值税发票数据,文件名zzsfp

(2)发票对应货物明细数据,文件名zzsfp_hwmx

(3)企业信息,文件名nsrxx

2、数据字段说明

1zzsfp表字典

字段名称

字段含义

数据类型

备注

fp_nid

发票id

String

发票唯一标识

xf_id

销方识别号

String

企业唯一身份标识

gf_id

购方识别号

String

企业唯一身份标识

je

金额

Double

 

se

税额

Double

 

jshj

价税合计

Double

 

kpyf

开票月份

String

 

kprq

开票日期

String

 

zfbz

作废标志

String

‘Y’代表作废

 

 

zzsfp表内容($ less zzsfp

 

2zzsfp_hwmx

字段名称

字段含义

数据类型

备注

fp_nid

发票id

String

发票唯一标识

date_key

开票月份

String

 

hwmc

货物名称

String

 

ggxh

规格型号

String

 

dw

单位

String

 

sl

数量

Double

 

dj

单价

Double

 

je

金额

Double

 

se

税额

Double

 

spbm

商品编码

String

 

zzsfp_hwmx表内容($ less zzsfp_hwmx)

 

3nsrxx

字段名称

字段含义

数据类型

备注

hydm

行业代码

String

 

nsr_id

纳税人id

String

企业唯一身份标识

djzclx_dm

登记注册类型代码

String

网上可查阅相关代码含义

kydjrq

开业登记日期

String

 

xgrq

修改日期

String

给企业打标签的时间

label

标签

String

‘0’代表正常企业

‘1’代表问题企业

nsrxx表内容($ less nsrxx)

 

 

3、关联数据的必要说明

(1)zzsfp表可通过fp_nid进行关联

(2)zzsfp表可通过xf_id或者gf_id与nsrxx中的nsr_id进行关联,分离出销项发票表和进项发票表

二、     测试要求:

1、数据导入:

       ---nsrxx
--
建表
CREATE TABLE `nsrxx` ( `hydm` string ,
                          `nsr_id` string,
                          `djzclx_dm` string,
                          `kydjrq` string,
                          `xgrq` string ,
                          `label` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/kkb/data/import-hive/nsrxx.csv' into table nsrxx;

---nsrxx
---
去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/nsrxx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(hydm,2)as hydm,nsr_id,djzclx_dm,kydjrq,xgrq,substr(label,1,1)as label from nsrxx;

--建表
CREATE TABLE `nsrxx_01` ( `hydm` string ,
                       `nsr_id` string,
                       `djzclx_dm` string,
                       `kydjrq` string,
                       `xgrq` string ,
                       `label` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;

load data  local inpath  '/tmp/hive-result/nsrxx_01' into table nsrxx_01;

 

 

 

---zzsfp

--
建表
CREATE TABLE `zzsfp` (
                            `fp_nid` string,
                            `xf_id` string,
                            `gf_id` string ,
                            `je` double ,
                            `se` double,
                            `jshj` double ,
                            `kpyf` string,
                            `kprq` string,
                            `zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/kkb/data/import-hive/zzsfp.csv' into table zzsfp;

---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_01"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,xf_id,gf_id,je,se,jshj,kpyf,kprq,substr(zfbz,1,1)as zfbz from zzsfp;

--建表
CREATE TABLE `zzsfp_01` (
                            `fp_nid` string,
                            `xf_id` string,
                            `gf_id` string ,
                            `je` double ,
                            `se` double,
                            `jshj` double ,
                            `kpyf` string,
                            `kprq` string,
                            `zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;

load data  local inpath  '/tmp/hive-result/zzsfp_01' into table zzsfp_01;

 

 

 

---zzsfp_hwmx
---
去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_hwmx_01"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,date_key,hwmc,ggxh,dw,sl,dj,je,se,substr(spbm,1,1)as spbm from zzsfp_hwmx;
--建表 zzsfp_hwmx
create table zzsfp_hwmx_01(
                           fp_nid  String,
                           date_key String,
                           hwmc  String,
                           ggxh String,
                           dw String,
                           sl String,
                           dj String ,
                           je   String,
                           se  String,
                           spbm  String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/tmp/hive-result/zzsfp_hwmx_01' into table zzsfp_hwmx_01

 

2、数据分析:

   企业异常的判断标准参考:

    (1)、企业增值税发票进项与出项严重不符即出现只出不进或者只进不出的企业;

    (2)企业发票数据与详细流水信息不符;

    (3)个人上网查阅企业异常信息数据标准;

//销方的数据
create table xf_data
(xf_id  String,  xf_count String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/xf_data"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select xf_id , count(xf_id) from zzsfp group by xf_id;

load data  local inpath  '/tmp/hive-result/xf_data' into table xf_data;

 

 

//购方的数据
create table gf_data
(gf_id  String,  gf_count String )
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/gf_data"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select gf_id , count(gf_id) from zzsfp_01 group by gf_id;

load data  local inpath  '/tmp/hive-result/gf_data' into table gf_data;

 

 

//按照企业id,把购方销方总次数联合起来
drop table xf_gf_data;
create table xf_gf_data
(xf_gf_id  String,  xf_count String,  gf_count String )
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

Insert into xf_gf_data(xf_gf_id,xf_count,gf_count)
Select xf_id as xf_gf_id,xf_data.xf_count as xf_count,gf_data.gf_count as gf_count from xf_data  join gf_data  on xf_id=gf_id;

 

create table fp_abs
(xf_gf_id  String,  abs_xg String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT INTO fp_abs (xf_gf_id, abs_xg)
SELECT
    xf_gf_id,
    ABS(xf_count - gf_count)
FROM
    xf_gf_data;

 

看差值:

select count(xf_gf_id)
from fp_abs
where abs_xg>900;

 

 

 

 

查找企业发票作废次数存入zfbz_data

create table zfbz_data
(xf_id String , gf_id String , zfcs String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

Insert into table zfbz_data(xf_id,gf_id,zfcs)
(select xf_id, gf_id ,count(zfbz) from zzsfp_01 where zfbz = 'Y'  group by xf_id,gf_id);

 

 

Insert into table wenti(xf_id)
SELECT
    xx.xf_gf_id
FROM
    (SELECT
         a.xf_gf_id
     FROM
         (SELECT
              xf_gf_id
          FROM
              fp_abs
          WHERE abs_xg > 400) AS a
             JOIN
         (SELECT
              xf_id
          FROM
              zfbz_data
          WHERE zfcs > 15) AS b
         ON a.xf_gf_id = b.xf_id GROUP BY a.xf_gf_id) AS xx
        JOIN
    (SELECT
         aa.xf_gf_id
     FROM
         (SELECT
              xf_gf_id
          FROM
              fp_abs
          WHERE abs_xg > 400) AS aa
             JOIN
         (SELECT
              gf_id
          FROM
              zfbz_data
          WHERE zfcs > 15) AS bb
         ON aa.xf_gf_id = bb.gf_id GROUP BY aa.xf_gf_id) AS yy
    ON xx.xf_gf_id = yy.xf_gf_id ;

3、处理结果入库:

   将上述异常标准的结果分别汇总统计,并将结果数据导出到mySQL数据库中。

   最终结果参考提示:

          最终给出的数据情况

企业总数:33,829

非正常企业总数:318

 

4、数据可视化展示:

   利用Echarts将上述统计结果以图形化展示的方式展现出来:饼图、柱状图、地图、折线图等。

三、     测试报告:

1、按照测试题目顺序,将实验步骤说明和结果截图存储到答题纸上。

            

石家庄铁道大学2021年秋季

  2019 级课堂测试试卷-企业发票异常分析

课程名称:大型数据库应用技术  任课教师王建民     测试时间:150分钟

一、          数据说明:

1、数据组成

(1)增值税发票数据,文件名zzsfp

(2)发票对应货物明细数据,文件名zzsfp_hwmx

(3)企业信息,文件名nsrxx

2、数据字段说明

1zzsfp表字典

字段名称

字段含义

数据类型

备注

fp_nid

发票id

String

发票唯一标识

xf_id

销方识别号

String

企业唯一身份标识

gf_id

购方识别号

String

企业唯一身份标识

je

金额

Double

 

se

税额

Double

 

jshj

价税合计

Double

 

kpyf

开票月份

String

 

kprq

开票日期

String

 

zfbz

作废标志

String

‘Y’代表作废

 

 

zzsfp表内容($ less zzsfp

 

2zzsfp_hwmx

字段名称

字段含义

数据类型

备注

fp_nid

发票id

String

发票唯一标识

date_key

开票月份

String

 

hwmc

货物名称

String

 

ggxh

规格型号

String

 

dw

单位

String

 

sl

数量

Double

 

dj

单价

Double

 

je

金额

Double

 

se

税额

Double

 

spbm

商品编码

String

 

zzsfp_hwmx表内容($ less zzsfp_hwmx)

 

3nsrxx

字段名称

字段含义

数据类型

备注

hydm

行业代码

String

 

nsr_id

纳税人id

String

企业唯一身份标识

djzclx_dm

登记注册类型代码

String

网上可查阅相关代码含义

kydjrq

开业登记日期

String

 

xgrq

修改日期

String

给企业打标签的时间

label

标签

String

‘0’代表正常企业

‘1’代表问题企业

nsrxx表内容($ less nsrxx)

 

 

3、关联数据的必要说明

(1)zzsfp表可通过fp_nid进行关联

(2)zzsfp表可通过xf_id或者gf_id与nsrxx中的nsr_id进行关联,分离出销项发票表和进项发票表

二、     测试要求:

1、数据导入:

       ---nsrxx
--
建表
CREATE TABLE `nsrxx` ( `hydm` string ,
                          `nsr_id` string,
                          `djzclx_dm` string,
                          `kydjrq` string,
                          `xgrq` string ,
                          `label` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/kkb/data/import-hive/nsrxx.csv' into table nsrxx;

---nsrxx
---
去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/nsrxx_01"
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(hydm,2)as hydm,nsr_id,djzclx_dm,kydjrq,xgrq,substr(label,1,1)as label from nsrxx;

--建表
CREATE TABLE `nsrxx_01` ( `hydm` string ,
                       `nsr_id` string,
                       `djzclx_dm` string,
                       `kydjrq` string,
                       `xgrq` string ,
                       `label` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;

load data  local inpath  '/tmp/hive-result/nsrxx_01' into table nsrxx_01;

 

 

 

---zzsfp

--
建表
CREATE TABLE `zzsfp` (
                            `fp_nid` string,
                            `xf_id` string,
                            `gf_id` string ,
                            `je` double ,
                            `se` double,
                            `jshj` double ,
                            `kpyf` string,
                            `kprq` string,
                            `zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/kkb/data/import-hive/zzsfp.csv' into table zzsfp;

---去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_01"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,xf_id,gf_id,je,se,jshj,kpyf,kprq,substr(zfbz,1,1)as zfbz from zzsfp;

--建表
CREATE TABLE `zzsfp_01` (
                            `fp_nid` string,
                            `xf_id` string,
                            `gf_id` string ,
                            `je` double ,
                            `se` double,
                            `jshj` double ,
                            `kpyf` string,
                            `kprq` string,
                            `zfbz` string
) row format delimited fields terminated by ',' STORED AS TEXTFILE;

load data  local inpath  '/tmp/hive-result/zzsfp_01' into table zzsfp_01;

 

 

 

---zzsfp_hwmx
---
去括号
INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/zzsfp_hwmx_01"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
select substr(fp_nid,2)as fp_nid,date_key,hwmc,ggxh,dw,sl,dj,je,se,substr(spbm,1,1)as spbm from zzsfp_hwmx;
--建表 zzsfp_hwmx
create table zzsfp_hwmx_01(
                           fp_nid  String,
                           date_key String,
                           hwmc  String,
                           ggxh String,
                           dw String,
                           sl String,
                           dj String ,
                           je   String,
                           se  String,
                           spbm  String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;
load data  local inpath  '/tmp/hive-result/zzsfp_hwmx_01' into table zzsfp_hwmx_01

 

2、数据分析:

   企业异常的判断标准参考:

    (1)、企业增值税发票进项与出项严重不符即出现只出不进或者只进不出的企业;

    (2)企业发票数据与详细流水信息不符;

    (3)个人上网查阅企业异常信息数据标准;

//销方的数据
create table xf_data
(xf_id  String,  xf_count String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/xf_data"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select xf_id , count(xf_id) from zzsfp group by xf_id;

load data  local inpath  '/tmp/hive-result/xf_data' into table xf_data;

 

 

//购方的数据
create table gf_data
(gf_id  String,  gf_count String )
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT OVERWRITE LOCAL DIRECTORY "/tmp/hive-result/gf_data"
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
Select gf_id , count(gf_id) from zzsfp_01 group by gf_id;

load data  local inpath  '/tmp/hive-result/gf_data' into table gf_data;

 

 

//按照企业id,把购方销方总次数联合起来
drop table xf_gf_data;
create table xf_gf_data
(xf_gf_id  String,  xf_count String,  gf_count String )
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

Insert into xf_gf_data(xf_gf_id,xf_count,gf_count)
Select xf_id as xf_gf_id,xf_data.xf_count as xf_count,gf_data.gf_count as gf_count from xf_data  join gf_data  on xf_id=gf_id;

 

create table fp_abs
(xf_gf_id  String,  abs_xg String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

INSERT INTO fp_abs (xf_gf_id, abs_xg)
SELECT
    xf_gf_id,
    ABS(xf_count - gf_count)
FROM
    xf_gf_data;

 

看差值:

select count(xf_gf_id)
from fp_abs
where abs_xg>900;

 

 

 

 

查找企业发票作废次数存入zfbz_data

create table zfbz_data
(xf_id String , gf_id String , zfcs String)
    ROW format delimited fields terminated by ',' STORED AS TEXTFILE;

Insert into table zfbz_data(xf_id,gf_id,zfcs)
(select xf_id, gf_id ,count(zfbz) from zzsfp_01 where zfbz = 'Y'  group by xf_id,gf_id);

 

 

Insert into table wenti(xf_id)
SELECT
    xx.xf_gf_id
FROM
    (SELECT
         a.xf_gf_id
     FROM
         (SELECT
              xf_gf_id
          FROM
              fp_abs
          WHERE abs_xg > 400) AS a
             JOIN
         (SELECT
              xf_id
          FROM
              zfbz_data
          WHERE zfcs > 15) AS b
         ON a.xf_gf_id = b.xf_id GROUP BY a.xf_gf_id) AS xx
        JOIN
    (SELECT
         aa.xf_gf_id
     FROM
         (SELECT
              xf_gf_id
          FROM
              fp_abs
          WHERE abs_xg > 400) AS aa
             JOIN
         (SELECT
              gf_id
          FROM
              zfbz_data
          WHERE zfcs > 15) AS bb
         ON aa.xf_gf_id = bb.gf_id GROUP BY aa.xf_gf_id) AS yy
    ON xx.xf_gf_id = yy.xf_gf_id ;

3、处理结果入库:

   将上述异常标准的结果分别汇总统计,并将结果数据导出到mySQL数据库中。

   最终结果参考提示:

          最终给出的数据情况

企业总数:33,829

非正常企业总数:318

 

4、数据可视化展示:

   利用Echarts将上述统计结果以图形化展示的方式展现出来:饼图、柱状图、地图、折线图等。

 

            

posted @ 2021-11-16 19:46  yasai  阅读(141)  评论(0编辑  收藏  举报