开学测试——电子商务大数据分析

一、测试要求:

1、 数据采集(要求至少爬取三千条记录,时间跨度超过一星期):(10分)

要求Python 编写程序爬取京东手机的评论数据,生成Json形式的数据文件。

京东商城部分数据格式如下图所示:

1.    "productCommentSummary": {  
2.    "goodRateShow": 95, //好评率  
3.    "poorRateShow": 3, //差评率  
4.    "poorCountStr": "1900+",  
5.    "averageScore": 5, //平均分  
6.    "generalCountStr": "1600+",  
7.    "oneYear": 0,  
8.    "showCount": 21000,  
9.    "showCountStr": "2.1万+",  
10.    "goodCount": 64000, //好评数  
11.    "generalRate": 0.024,  
12.    "generalCount": 1600, //中评数  
13.    "skuId": 4432058,  
14.    "goodCountStr": "6.4万+",  
15.    "poorRate": 0.028,  
16.    "afterCount": 2400,  
17.    "goodRateStyle": 142,  
18.    "poorCount": 1900, //差评数  
19.    "skuIds": null,  
20.    "poorRateStyle": 4,  
21.    "generalRateStyle": 4,  
22.    "commentCountStr": "6.8万+",  
23.    "commentCount": 68000,  
24.    "productId": 4432058, //商品id  
25.    "afterCountStr": "2400+",  
26.    "goodRate": 0.948,  
27.    "generalRateShow": 2 //中评率  
28.    },  
comments中一条评论的结构:

1.    "id": 10432588299,  
2.    "guid": "6c1d83b1-ac45-4189-a041-774eaff87df9",  
3.    "content": "割手,相当的割手,无语了", //评论内容 √  
4.    "creationTime": "2017-05-22 23:37:24", //写评论的时间 √  
5.    "isTop": false,                        //是否置顶  
6.    "referenceTime": "2017-05-20 18:35:11", //收货时间 √  
7.    "firstCategory": 9987,                 //第一分类 √  
8.    "secondCategory": 653,                 //第二分类 √  
9.    "thirdCategory": 655,                  //第三分类 √  
10.    "replyCount": 0,  
11.    "score": 3,                            //打分 √  
12.    "nickname": "j***柜",                  //昵称  √  
13.    "userClient": 2,  
14.    "productColor": "碳黑色",  
15.    "productSize": "32GB",  
16.    "userLevelName": "金牌会员",           //会员级别 √  
17.    "plusAvailable": 0,  
18.    "productSales": [  
19.       {  
20.           "dim": 3,  
21.           "saleName": "选择套装",  
22.           "saleValue": "官方标配"  
23.       }  
24.    ],  
25.    "userClientShow": "来自京东iPhone客户端",//评论设备  
26.    "isMobile": true,                       //是否移动端  
27.    "days": 2,                              //评论时间距【收货/下单】时间多长时间  
28.    "afterDays": 0  

使用python进行数据爬取,在爬取数据的同时进行数据清洗

import requests
import json
import csv
from lxml import etree
from bs4 import BeautifulSoup
import time

# Endpoint for JD product review pages; get_comment supplies the query
# parameters (product id, score filter, page, ...) on each request.
comment_url = 'https://club.jd.com/comment/productPageComments.action?callback'

# Fetch review pages for one product
def get_comment(productid, name):
    """Crawl the review pages of one JD product and pass each page to ``load``.

    Args:
        productid: JD product id (SKU). It is used both for the ``Referer``
            header and for the ``productId`` query parameter.
        name: Base name of the output file; forwarded unchanged to ``load``.

    Pages whose response body is empty or is not valid JSON are reported on
    stdout and skipped instead of aborting the whole crawl.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        "Referer": "https://item.jd.com/%s.html" % (productid)
    }
    for page in range(100):  # number of review pages to crawl
        params = {
            # Use the caller's product id; it used to be hard-coded here, so
            # every call crawled the same product regardless of the argument.
            "productId": productid,
            # Original author's note: 0 = all reviews, 3 = good, 2 = neutral,
            # 1 = bad, 4 = reviews with pictures, 5 = follow-up reviews.
            'score': 0,
            'sortType': 5,
            'page': page,
            'pageSize': 10,
        }
        # A timeout keeps one stalled connection from hanging the crawl.
        comment_resp = requests.get(url=comment_url, params=params,
                                    headers=headers, timeout=10)
        comment_str = comment_resp.text
        if comment_str == '':
            print("获取内容为空")
            continue  # nothing to parse; json.loads('') would raise
        try:
            comment_dict = json.loads(comment_str)
        except json.JSONDecodeError:
            print("第%d页返回内容不是合法JSON,已跳过" % page)
            continue
        comments = comment_dict.get('comments') or []
        load(comments, name, productid)


# Persist comments to CSV
def load(comments, name, productid, output_dir='C:/Users/jzz/Desktop/'):
    """Append one CSV row per comment to ``<output_dir>/<name>.csv``.

    Each row has ten columns, in this order: guid, nickname, content, score,
    creationTime, top flag, PLUS flag, referenceTime, days, client name.
    Every value is written as its own CSV field, so a comma-delimited
    consumer sees one value per column (previously the whole record was
    written as a single quoted field).

    Args:
        comments: Iterable of comment dicts as returned by the JD review API.
        name: Base name (without extension) of the CSV file.
        productid: Accepted for call compatibility; not used.
        output_dir: Directory that receives the CSV file. Defaults to the
            location the script originally hard-coded.

    Raises:
        KeyError: If a comment lacks one of the fields read below.
    """
    import os  # local import: not part of the file's top-level imports

    client_names = {2: "IOS客户端", 0: "PC端", 4: "Android客户端"}

    rows = []
    for comment in comments:
        # Strip commas and line breaks so the text cannot break a
        # comma-delimited, one-record-per-line file.
        content = (comment['content']
                   .replace(',', '')
                   .replace('\n', '')
                   .replace('\r', ''))
        client = comment['userClient']
        rows.append([
            comment['guid'],
            comment['nickname'],
            content,
            str(comment['score']),
            comment['creationTime'],
            "置顶" if comment['isTop'] else "非置顶",
            "PLUS会员" if comment['plusAvailable'] == 201 else "非会员",
            comment['referenceTime'],
            str(comment['days']),
            # Unknown client codes are kept as their numeric value.
            str(client_names.get(client, client)),
        ])

    if not rows:
        return  # nothing to write; do not create an empty file

    # Open the file once per batch instead of once per comment, and pin the
    # encoding so the output does not depend on the platform default.
    path = os.path.join(output_dir, '%s.csv' % (name))
    with open(path, 'a', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(rows)


# Collect the SKU number of every product on the search-result page
def get_number(name):
    """Return the SKU ids listed on the JD search-result page for ``name``.

    Args:
        name: Search keyword. It is passed as a query parameter so that
            spaces, ``&`` and non-ASCII characters are URL-encoded.

    Returns:
        A list of ints, one per ``li.gl-item`` element whose ``data-sku``
        attribute holds an integer. Items without a usable ``data-sku`` are
        skipped.
    """
    headers = {
        # NOTE(review): this is a hard-coded personal session cookie. It is
        # kept unchanged to preserve behavior, but it should be moved out of
        # source control (e.g. configuration or environment).
         'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    # Let requests build the query string instead of interpolating the raw
    # keyword into the URL; the timeout avoids hanging on a stalled request.
    response = requests.get(
        "https://search.jd.com/Search",
        params={"keyword": name, "enc": "utf-8"},
        headers=headers,
        timeout=10,
    )
    html = BeautifulSoup(response.text, 'lxml')
    numbers = []
    for item in html.find_all("li", class_='gl-item'):
        try:
            numbers.append(int(item.get("data-sku")))
        except (TypeError, ValueError):
            # Attribute missing (None) or not an integer: skip this item.
            continue
    return numbers


def main():
    """Crawl the reviews of product 100031406046 into the file named "华为"."""
    get_comment(100031406046, "华为")
    print("爬取完毕")


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

 

2、数据预处理:要求使用MapReduce或者kettle实现源数据的预处理,对大量的Json文件,进行清洗,以得到结构化的文本文件。(10分)

(1)去除用户评论表的重复记录;

(2)按照清洗后的数据格式要求提取相应的数据字段。

 

 

 

此处的数据清洗已经在第一步完成

还可以使用kettle进行清洗(开源可视化ETL工具)

3、 数据统计:生成Hive用户评论数据:(15分)

(1)在Hive创建一张表,用于存放清洗后的数据,表名为pinglun,字段名、字符类型、字段解释如下:

1.    productid        string  产品ID  
2.    commentcount     int     评论数  
3.    goodcount        int     好评数  
4.    generalcount     int     中评数  
5.    poorcount        int     差评数  
6.    goodrateshow     float   好评率  
7.    generalrateshow  float   中评率  
8.    poorrateshow     float   差评率  
9.    guid             string  随机生成ID  
10.    content          string  评论内容  
11.    creationtime     string  写评论的时间  
12.    score            int     打分  
13.    nickname         string  昵称  
14.    userlevelname    string  会员级别  
15.    userclientshow   string  评论设备  
16.    ismobile         string  是否移动端  
17.    days             int     评论时间距【收货/下单】时间多长时间  
-- Comma-delimited text table holding the cleaned review records.
CREATE TABLE pinglun (
    guid          STRING,
    nickname      STRING,
    content       STRING,
    score         INT,
    creationTime  STRING,
    isTop         STRING,
    plus          STRING,
    referenceTime STRING,
    days          INT,
    ismobile      STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Load the local CSV file into the table.
LOAD DATA LOCAL INPATH '/data/jd/jd.csv' INTO TABLE pinglun;

需求1:分析用户使用移动端购买还是PC端购买,及移动端和PC端的用户比例,生成ismobilehive表,存储统计结果;

-- Result table: number of reviews per client type.
CREATE TABLE ismobilehive (
    clientname STRING,
    num        INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

INSERT INTO TABLE ismobilehive
SELECT ismobile, COUNT(*)
FROM pinglun
GROUP BY ismobile;

-- Inspect the result.
SELECT * FROM ismobilehive;

 

需求2:分析用户评论周期(收到货后,一般多久进行评论),生成dayssql表,存储统计结果;

-- Result table: number of reviews per value of `days`.
CREATE TABLE dayssql (
    day STRING,
    num INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

INSERT INTO TABLE dayssql
SELECT days, COUNT(*) AS num
FROM pinglun
GROUP BY days
ORDER BY num DESC;

-- Inspect the result.
SELECT * FROM dayssql;

 

 

 

需求3:分析会员级别(判断购买此商品的用户级别),生成userlevelname_out表,存储统计结果;

-- Result table: number of reviews per value of the `plus` column.
CREATE TABLE userlevelname_out (
    userlevel STRING,
    num       INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

INSERT INTO TABLE userlevelname_out
SELECT plus, COUNT(*) AS num
FROM pinglun
GROUP BY plus;

-- Inspect the result.
SELECT * FROM userlevelname_out;

 

需求4:分析每天评论量,生成creationtime_out表,存储统计结果;

-- Result table: number of reviews per calendar day.
-- Fixes: the ROW FORMAT clause was missing the FIELDS keyword, and the
-- INSERT targeted the misspelled table name "creatiotime_out".
CREATE TABLE creationtime_out (
    time STRING,
    num  INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- creationTime looks like 'yyyy-MM-dd HH:mm:ss'; the first 10 characters
-- are the date. substr positions are 1-based.
INSERT INTO TABLE creationtime_out
SELECT substr(creationTime, 1, 10) AS time, COUNT(*) AS num
FROM pinglun
GROUP BY substr(creationTime, 1, 10);

-- Inspect the result.
SELECT * FROM creationtime_out;

 

 

 

需求5:日期格式标准化

功能为:去掉评论时间的时分秒,只保留年月日

-- Copy of pinglun with both timestamps truncated to 'yyyy-MM-dd'.
CREATE TABLE pinglun01 (
    guid          STRING,
    nickname      STRING,
    content       STRING,
    score         INT,
    creationTime  STRING,
    isTop         STRING,
    plus          STRING,
    referenceTime STRING,
    days          INT,
    ismobile      STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Fix: "ismobilefrom pinglun" was missing the space before FROM, which left
-- the statement without a FROM clause. substr positions are 1-based.
INSERT INTO TABLE pinglun01
SELECT guid,
       nickname,
       content,
       score,
       substr(creationTime, 1, 10),
       isTop,
       plus,
       substr(referenceTime, 1, 10),
       days,
       ismobile
FROM pinglun;

 

 

 

 

4、 利用Sqoop进行数据迁移至Mysql数据库:(5分)

将上述五个表导入到相对应的mysql数据表中。

可以使用Sqoop(需要安装),或者导出文件再次进行数据导入

# Export the Hive result tables to MySQL.
# -P prompts for the database password instead of putting it in plain text
# on the command line (and in shell history). All five exports use the same
# delimiter, NULL handling and mapper count, matching the comma-delimited
# Hive tables.

sqoop export --connect jdbc:mysql://192.168.88.1:3306/test01 --username root -P \
  --table ismobilehive --export-dir /user/hive/warehouse/mi12pro.db/ismobilehive \
  --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' -m 1

sqoop export --connect jdbc:mysql://192.168.88.1:3306/test01 --username root -P \
  --table dayssql --export-dir /user/hive/warehouse/mi12pro.db/dayssql \
  --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' -m 1

sqoop export --connect jdbc:mysql://192.168.88.1:3306/test01 --username root -P \
  --table userlevelname_out --export-dir /user/hive/warehouse/mi12pro.db/userlevelname_out \
  --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' -m 1

sqoop export --connect jdbc:mysql://192.168.88.1:3306/test01 --username root -P \
  --table creationtime_out --export-dir /user/hive/warehouse/mi12pro.db/creationtime_out \
  --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' -m 1

sqoop export --connect jdbc:mysql://192.168.88.1:3306/test01 --username root -P \
  --table mi12pro --export-dir /user/hive/warehouse/mi12pro.db/comment02 \
  --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' -m 1

 

5、 数据可视化:利用JavaWeb+Echarts完成数据图表展示过程(20分)

(实现前五步,获得60分)

需求1:把用户对京东进行评论时使用的是PC端还是移动端的统计数据用饼图进行展示,让观者能从中了解某个项目与整个数据组之间所存在的比例关系。

 

 

需求2把用户在收到货后,一般多久进行评论,即用户评论周期用柱状图展示,可以达到展现数据并将数据进行比较的目的。

 

 

需求3将购买某商品的用户级别进行统计的结果数据用饼状图展示,从而可以展现用户级别的比例构成关系,让观者能从中熟悉某个级别的用户数量与所有购买用户所存在的比例关系。

 

 

需求4将某件商品的每天的评论量的统计数据用折线图进行展现,可以展现出这个商品每天的评论量的变化趋势。

数据可视化比较简单,直接可视化即可

 

posted on 2023-02-17 10:12  跨越&尘世  阅读(217)  评论(0)  编辑  收藏  举报