开学测试——电子商务大数据分析
一、测试要求:
1、 数据采集(要求至少爬取三千条记录,时间跨度超过一星期):(10分)
要求Python 编写程序爬取京东手机的评论数据,生成Json形式的数据文件。
京东商城部分数据格式如下图所示:
"productCommentSummary": {
    "goodRateShow": 95,            //好评率
    "poorRateShow": 3,             //差评率
    "poorCountStr": "1900+",
    "averageScore": 5,             //平均分
    "generalCountStr": "1600+",
    "oneYear": 0,
    "showCount": 21000,
    "showCountStr": "2.1万+",
    "goodCount": 64000,            //好评数
    "generalRate": 0.024,
    "generalCount": 1600,          //中评数
    "skuId": 4432058,
    "goodCountStr": "6.4万+",
    "poorRate": 0.028,
    "afterCount": 2400,
    "goodRateStyle": 142,
    "poorCount": 1900,             //差评数
    "skuIds": null,
    "poorRateStyle": 4,
    "generalRateStyle": 4,
    "commentCountStr": "6.8万+",
    "commentCount": 68000,
    "productId": 4432058,          //商品id
    "afterCountStr": "2400+",
    "goodRate": 0.948,
    "generalRateShow": 2           //中评率
},
comments中一条评论的结构(标 √ 的字段为需要提取的字段):
    "id": 10432588299,
    "guid": "6c1d83b1-ac45-4189-a041-774eaff87df9",
    "content": "割手,相当的割手,无语了",          //评论内容 √
    "creationTime": "2017-05-22 23:37:24",          //写评论的时间 √
    "isTop": false,                                 //是否置顶
    "referenceTime": "2017-05-20 18:35:11",         //收货时间 √
    "firstCategory": 9987,                          //第一分类 √
    "secondCategory": 653,                          //第二分类 √
    "thirdCategory": 655,                           //第三分类 √
    "replyCount": 0,
    "score": 3,                                     //打分 √
    "nickname": "j***柜",                           //昵称 √
    "userClient": 2,
    "productColor": "碳黑色",
    "productSize": "32GB",
    "userLevelName": "金牌会员",                    //会员级别 √
    "plusAvailable": 0,
    "productSales": [
        {
            "dim": 3,
            "saleName": "选择套装",
            "saleValue": "官方标配"
        }
    ],
    "userClientShow": "来自京东iPhone客户端",       //评论设备
    "isMobile": true,                               //是否移动端
    "days": 2,                                      //评论时间距【收货/下单】时间多长时间
    "afterDays": 0
使用python进行数据爬取,在爬取数据的同时进行数据清洗
"""Crawl JD.com product reviews and append them to a comma-delimited text file.

Each review becomes one line of ten comma-separated fields, in the order
guid, nickname, content, score, creationTime, isTop, plus, referenceTime,
days, client.  Fields are written without quoting so that the file can be
split on ',' by a plain delimited reader.
"""
# NOTE: ``csv`` and ``etree`` are not used by the functions below; they are
# retained from the original import list.
import csv
import json
import os
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree

comment_url = 'https://club.jd.com/comment/productPageComments.action?callback'

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
)

# Labels written to the output file for the numeric ``userClient`` codes.
# Codes that are not listed here are written as the number itself.
_CLIENT_LABELS = {2: "IOS客户端", 0: "PC端", 4: "Android客户端"}


def get_comment(productid, name, pages=100, delay=0.0):
    """Download review pages for one product and append them to ``<name>.csv``.

    Args:
        productid: JD product id whose reviews are requested.
        name: Base name of the output file (passed on to :func:`load`).
        pages: Number of result pages to request (10 reviews per page).
        delay: Seconds to sleep between two page requests; 0 disables it.

    Raises:
        requests.RequestException: on network failure or timeout.
        json.JSONDecodeError: if a non-empty response body is not JSON.
    """
    # No cookie is sent to the comments endpoint (it was already disabled
    # in the original request headers).
    headers = {
        "User-Agent": USER_AGENT,
        "Referer": "https://item.jd.com/%s.html" % (productid),
    }
    for page in range(pages):
        if page and delay:
            time.sleep(delay)
        params = {
            # Use the caller's product id; it used to be hard-coded here,
            # which silently ignored the ``productid`` argument.
            "productId": productid,
            # 0 = all reviews, 3 = good, 2 = neutral, 1 = poor,
            # 4 = reviews with photos, 5 = follow-up reviews.
            'score': 0,
            'sortType': 5,
            'page': page,
            'pageSize': 10,
        }
        comment_resp = requests.get(
            url=comment_url, params=params, headers=headers, timeout=10
        )
        comment_str = comment_resp.text
        if comment_str == '':
            # An empty body cannot be parsed; report it and skip this page
            # instead of crashing in json.loads.
            print("获取内容为空")
            continue
        comment_dict = json.loads(comment_str)
        comments = comment_dict.get('comments') or []
        load(comments, name, productid)


def _clean(value):
    """Return ``value`` as text that is safe inside a ','-delimited line.

    ASCII commas are replaced by the full-width comma (U+FF0C) and line
    breaks are removed, so a field can never split into extra columns or
    extra rows.
    """
    return (
        str(value)
        .replace(',', '\uff0c')
        .replace('\n', '')
        .replace('\r', '')
    )


def _comment_fields(comment):
    """Build the ten output fields (all ``str``) for one review dict."""
    client = comment['userClient']
    return [
        _clean(comment['guid']),
        _clean(comment['nickname']),
        _clean(comment['content']),
        str(comment['score']),
        _clean(comment['creationTime']),
        "置顶" if comment['isTop'] else "非置顶",
        "PLUS会员" if comment['plusAvailable'] == 201 else "非会员",
        _clean(comment['referenceTime']),
        str(comment['days']),
        _CLIENT_LABELS.get(client, str(client)),
    ]


def load(comments, name, productid, output_dir='C:/Users/jzz/Desktop/'):
    """Append ``comments`` to ``<output_dir>/<name>.csv``, one line each.

    Args:
        comments: Iterable of review dicts as returned by the comments API.
        name: Base name of the output file (without extension).
        productid: Unused; kept so existing callers keep working.
        output_dir: Directory of the output file.  Defaults to the path the
            script has always written to.

    The file is written as UTF-8 with '\\n' line endings and without any
    quoting.  Previously the whole joined record was handed to
    ``csv.writer`` as a single cell, which wrapped every line in double
    quotes and ended it with '\\r\\n'.
    """
    if not comments:
        return
    path = os.path.join(output_dir, '%s.csv' % (name))
    # newline='' stops Python from translating '\n' to '\r\n' on Windows.
    with open(path, 'a', newline='', encoding='utf-8') as csv_file:
        for comment in comments:
            csv_file.write(','.join(_comment_fields(comment)) + '\n')


def get_number(name):
    """Return the SKU numbers listed on the JD search page for ``name``.

    A session cookie is sent only if the ``JD_COOKIE`` environment variable
    is set; the credential is no longer stored in the source.

    Args:
        name: Search keyword.

    Returns:
        list[int]: ``data-sku`` values of the ``li.gl-item`` elements, in
        page order.  Items without the attribute are skipped.
    """
    headers = {"User-Agent": USER_AGENT}
    cookie = os.environ.get("JD_COOKIE")
    if cookie:
        headers['cookie'] = cookie
    # ``params`` URL-encodes the keyword (e.g. Chinese text or spaces).
    response = requests.get(
        "https://search.jd.com/Search",
        params={"keyword": name, "enc": "utf-8"},
        headers=headers,
        timeout=10,
    )
    html = BeautifulSoup(response.text, 'lxml')
    items = html.find_all("li", class_='gl-item')
    return [int(item.get("data-sku")) for item in items if item.get("data-sku")]


def main():
    """Crawl the reviews of the configured product."""
    get_comment(100031406046, "华为")
    print("爬取完毕")


if __name__ == "__main__":
    main()
2、数据预处理:要求使用MapReduce或者kettle实现源数据的预处理,对大量的Json文件,进行清洗,以得到结构化的文本文件。(10分)
(1)去除用户评论表的重复记录;
(2)按照清洗后的数据格式要求提取相应的数据字段。
此处的数据清洗已经在第一步完成
还可以使用kettle进行清洗(开源可视化ETL工具)
3、 数据统计:生成Hive用户评论数据:(15分)
(1)在Hive创建一张表,用于存放清洗后的数据,表名为pinglun,字段名、字符类型、字段解释如下:
1. productid string 产品ID
2. commentcount int 评论数
3. goodcount int 好评数
4. generalcount int 中评数
5. poorcount int 差评数
6. goodrateshow float 好评率
7. generalrateshow float 中评率
8. poorrateshow float 差评率
9. guid string 随机生成ID
10. content string 评论内容
11. creationtime string 写评论的时间
12. score int 打分
13. nickname string 昵称
14. userlevelname string 会员级别
15. userclientshow string 评论设备
16. ismobile string 是否移动端
17. days int 评论时间距【收货/下单】时间多长时间
-- Review table; one row per comment, stored as comma-delimited text.
CREATE TABLE pinglun (
    guid          STRING,
    nickname      STRING,
    content       STRING,
    score         INT,
    creationTime  STRING,
    isTop         STRING,
    plus          STRING,
    referenceTime STRING,
    days          INT,
    ismobile      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- Load the cleaned file from the local file system into the table.
LOAD DATA LOCAL INPATH '/data/jd/jd.csv' INTO TABLE pinglun;
需求1:分析用户使用移动端购买还是PC端购买,及移动端和PC端的用户比例,生成ismobilehive表,存储统计结果;
-- Number of comments per client type (value of pinglun.ismobile).
CREATE TABLE ismobilehive (
    clientname STRING,
    num        INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

INSERT INTO TABLE ismobilehive
SELECT ismobile, COUNT(*)
FROM pinglun
GROUP BY ismobile;

-- Inspect the result
SELECT * FROM ismobilehive;
需求2:分析用户评论周期(收到货后,一般多久进行评论),生成dayssql表,存储统计结果;
-- Number of comments per value of pinglun.days.
CREATE TABLE dayssql (
    day STRING,
    num INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

INSERT INTO TABLE dayssql
SELECT days, COUNT(*) AS num
FROM pinglun
GROUP BY days
ORDER BY num DESC;

-- Result
SELECT * FROM dayssql;
需求3:分析会员级别(判断购买此商品的用户级别),生成userlevelname_out表,存储统计结果;
-- Number of comments per membership label (value of pinglun.plus).
CREATE TABLE userlevelname_out (
    userlevel STRING,
    num       INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

INSERT INTO TABLE userlevelname_out
SELECT plus, COUNT(*) AS num
FROM pinglun
GROUP BY plus;

SELECT * FROM userlevelname_out;
需求4:分析每天评论量,生成creationtime_out表,存储统计结果;
-- Number of comments per calendar day (first 10 characters of creationTime,
-- i.e. the 'yyyy-MM-dd' part).
-- `time` is back-quoted: NOTE(review) newer Hive releases list TIME as a
-- reserved keyword; confirm against the target Hive version.
create table creationtime_out(
`time` string,
num int
)row format delimited fields terminated by ',';
-- Fixes: the clause above was missing FIELDS, and the INSERT targeted the
-- misspelled table name "creatiotime_out". substr positions are 1-based.
insert into table creationtime_out select substr(creationTime,1,10) as `time`,count(*) as num from pinglun group by substr(creationTime,1,10);
select * from creationtime_out;
需求5:日期格式标准化
功能为:去掉评论时间的时分秒,只保留年月日
-- Copy of pinglun in which creationTime and referenceTime keep only their
-- first 10 characters (the 'yyyy-MM-dd' part).
create table pinglun01(
guid string,
nickname string,
content string,
score int,
creationTime string,
isTop string,
plus string,
referenceTime string,
days int,
ismobile string
)
row format delimited fields terminated by ',';
-- Fix: "ismobilefrom pinglun" lacked the space before FROM, which made the
-- statement invalid. substr positions are 1-based.
insert into table pinglun01 select guid,nickname,content,score,substr(creationTime,1,10),isTop,plus,substr(referenceTime,1,10),days,ismobile from pinglun;
4、 利用Sqoop进行数据迁移至Mysql数据库:(5分)
将上述五个表导入到相对应的mysql数据表中。
可以使用Sqoop(需要安装),或者导出文件再次进行数据导入
# Export the five Hive result tables to MySQL database test01, one command
# per table.
# -P prompts for the MySQL password instead of putting it on the command
# line (the plaintext passwords were removed, and they did not agree with
# each other).
# Every command now passes the same delimiter / NULL / mapper options;
# the third and fourth previously omitted them.

sqoop export \
  --connect jdbc:mysql://192.168.88.1:3306/test01 \
  --username root -P \
  --table ismobilehive \
  --export-dir /user/hive/warehouse/mi12pro.db/ismobilehive \
  --input-fields-terminated-by ',' \
  --input-null-string '\\N' --input-null-non-string '\\N' \
  -m 1

sqoop export \
  --connect jdbc:mysql://192.168.88.1:3306/test01 \
  --username root -P \
  --table dayssql \
  --export-dir /user/hive/warehouse/mi12pro.db/dayssql \
  --input-fields-terminated-by ',' \
  --input-null-string '\\N' --input-null-non-string '\\N' \
  -m 1

sqoop export \
  --connect jdbc:mysql://192.168.88.1:3306/test01 \
  --username root -P \
  --table userlevelname_out \
  --export-dir /user/hive/warehouse/mi12pro.db/userlevelname_out \
  --input-fields-terminated-by ',' \
  --input-null-string '\\N' --input-null-non-string '\\N' \
  -m 1

sqoop export \
  --connect jdbc:mysql://192.168.88.1:3306/test01 \
  --username root -P \
  --table creationtime_out \
  --export-dir /user/hive/warehouse/mi12pro.db/creationtime_out \
  --input-fields-terminated-by ',' \
  --input-null-string '\\N' --input-null-non-string '\\N' \
  -m 1

# NOTE(review): this command reads the Hive directory "comment02" and writes
# the MySQL table "mi12pro"; the Hive table created earlier in this document
# is "pinglun01". Confirm both names before running.
sqoop export \
  --connect jdbc:mysql://192.168.88.1:3306/test01 \
  --username root -P \
  --table mi12pro \
  --export-dir /user/hive/warehouse/mi12pro.db/comment02 \
  --input-fields-terminated-by ',' \
  --input-null-string '\\N' --input-null-non-string '\\N' \
  -m 1
5、 数据可视化:利用JavaWeb+Echarts完成数据图表展示过程(20分)
(实现前五步,获得60分)
需求1:把用户对京东进行评论时使用的是PC端还是移动端的统计数据,用饼图进行数据展示,从而达到让观者能从中熟悉某个项目与整个数据组间所存在的比例关系的目的。
需求2:把用户在收到货后,一般多久进行评论,即用户评论周期用柱状图展示,可以达到展现数据并将数据进行比较的目的。
需求3:将购买某商品的用户级别进行统计的结果数据用饼状图展示,从而可以展现用户级别的比例构成关系,让观者能从中熟悉某个级别的用户数量与所有购买用户所存在的比例关系。
需求4:将某件商品的每天的评论量的统计数据用折线图进行展现,可以展现出这个商品每天的评论量的变化趋势。
数据可视化比较简单,直接可视化即可