电子商务大数据分析

一、测试要求:

1、 数据采集(要求至少爬取三千条记录,时间跨度超过一星期):(10分)

要求Python 编写程序爬取京东手机的评论数据,生成Json形式的数据文件。

京东商城部分数据格式如下图所示:

1.    "productCommentSummary": {  
2.    "goodRateShow": 95, //好评率  
3.    "poorRateShow": 3, //差评率  
4.    "poorCountStr": "1900+",  
5.    "averageScore": 5, //平均分  
6.    "generalCountStr": "1600+",  
7.    "oneYear": 0,  
8.    "showCount": 21000,  
9.    "showCountStr": "2.1万+",  
10.    "goodCount": 64000, //好评数  
11.    "generalRate": 0.024,  
12.    "generalCount": 1600, //中评数  
13.    "skuId": 4432058,  
14.    "goodCountStr": "6.4万+",  
15.    "poorRate": 0.028,  
16.    "afterCount": 2400,  
17.    "goodRateStyle": 142,  
18.    "poorCount": 1900, //差评数  
19.    "skuIds": null,  
20.    "poorRateStyle": 4,  
21.    "generalRateStyle": 4,  
22.    "commentCountStr": "6.8万+",  
23.    "commentCount": 68000,  
24.    "productId": 4432058, //商品id  
25.    "afterCountStr": "2400+",  
26.    "goodRate": 0.948,  
27.    "generalRateShow": 2 //中评率  
28.    },  
comments中一条评论的结构:

1.    “id": 10432588299,  
2.    “guid": "6c1d83b1-ac45-4189-a041-774eaff87df9",  
3.    “content": "割手,相当的割手,无语了", //评论内容 √  
4.    “creationTime": "2017-05-22 23:37:24", //写评论的时间 √  
5.    “isTop": false,                        //是否置顶  
6.    “referenceTime": "2017-05-20 18:35:11", //收货时间 √  
7.    “firstCategory": 9987,                 //第一分类 √  
8.    “secondCategory": 653,                 //第二分类 √  
9.    “thirdCategory": 655,                  //第三分类 √  
10.    “replyCount": 0,  
11.    “score": 3,                            //打分 √  
12.    “nickname": "j***柜",                  //昵称  √  
13.    “userClient": 2,  
14.    “productColor": "碳黑色",  
15.    “productSize": "32GB",  
16.    “userLevelName": "金牌会员",           //会员级别 √  
17.    “plusAvailable": 0,  
18.    “productSales": [  
19.       {  
20.           "dim": 3,  
21.           "saleName": "选择套装",  
22.           "saleValue": "官方标配"  
23.       }  
24.    ,  
25.    “userClientShow": "来自京东iPhone客户端",//评论设备  
26.    “isMobile": true,                       //是否移动端  
27.    “days": 2,                              //评论时间距【收货/下单】时间多长时间  
28.    “afterDays": 0  

python代码

import requests
import json
import csv
from lxml import etree
from bs4 import BeautifulSoup
import time

'''
#如果ip被封,可以使用这个进行ip设置
proxy ='114.239.148.103'
proxies={
    'http':'http://'+proxy,
    'https':'https://'+proxy,
    }
'''
comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback'


# 获取评论
def get_comment(productid, name):
    headers = {
        # 'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        "Referer": "https://item.jd.com/%s.html" % (productid)
    }
    for i in range(100):  # 此处设置爬取几页的评论
        page = i
        params = {
            "productId": 100031406046,  # 商品id
            'score': 0,  # 如果想要爬取全部评论设置为0,好评为3,中评为2,差评为1,晒图评价为4,追评为5
            'sortType': 5,
            'page': page,
            'pageSize': 10,
        }
        comment_resp = requests.get(url=comment_url, params=params, headers=headers)
        comment_str = comment_resp.text
        if comment_str == '':
            print("获取内容为空")
        comment_dict = json.loads(comment_str)
        comments = comment_dict['comments']
        load(comments, name, productid)


# 数据存储
def load(comments, name, productid):
    for comment in comments:
        nickname = comment['nickname']
        content = comment['content'].replace(',','').replace('\n', '').replace('\r', '')
        g_uid = comment['guid']
        creationTime = comment['creationTime']
        is_Top = comment['isTop']
        plus = comment['plusAvailable']
        referenceTime = comment['referenceTime']
        score = comment['score']
        days = comment['days']
        is_Mobile = comment['userClient']

        if(is_Top == False):
            is_Top = "非置顶"
        else:
            is_Top = "置顶"

        if(is_Mobile == 2):
            is_Mobile = "IOS客户端"
        elif (is_Mobile == 0):
            is_Mobile = "PC端"
        elif(is_Mobile == 4):
            is_Mobile = "Android客户端"

        if(plus == 201):
            plus = "PLUS会员"
        else:
            plus = "非会员"


        test = g_uid \
               + ',' + nickname \
               + ',' + content \
               + ',' + str(score) \
               +',' + creationTime \
               + ',' + str(is_Top) \
               + ',' + plus \
               + ',' + referenceTime \
               + ',' + str(days) \
               + ',' + str(is_Mobile)

        # print(infor)
        with open('C:/Users/wuhao/Desktop/JD评论/' + '%s' % (name) + '.csv', 'a',
                  newline='')as csv_file:
            text = []
            text.append(test)
            writer = csv.writer(csv_file)
            writer.writerow(text)


# 获取搜索商品名界面的每个商品的序号
def get_number(name):
    headers = {
         'cookie': 'shshshfpa=4e6c0f90-587c-a46f-5880-a7debd7d4393-1544616560; __jdu=1126324296; PCSYCityID=412; user-key=44089d07-befa-4522-87fc-bcc039ec7045; pinId=qopcdCj6kcR3U84v0KTTbrV9-x-f3wj7; pin=jd_769791719e9e9; unick=jd_769791719e9e9; _tp=nc%2FbpB%2BkeSbk3jZ6p2H0FlWrdUa1gbgi16QiQ7NBXKY%3D; _pst=jd_769791719e9e9; cn=9; ipLoc-djd=1-72-2799-0; mt_xid=V2_52007VwMSUVpaUV8cQR5sUWMDEgUIUVBGGEofWhliABNUQQtQWkpVHVVXb1ZGB1lYW11LeRpdBW4fElFBW1VLH0ESXgJsAhpiX2hSahxLGFsFZwcRUG1bWlo%3D; shshshfpb=bRnqa4s886i2OeHTTR9Nq6g%3D%3D; unpl=V2_ZzNtbUZTSxJ3DURTLk0LAmJXFVlKAkdAIQ1PUXseCVIzU0UKclRCFXwURldnGlUUZwcZXERcQRdFCHZXchBYAWcCGllyBBNNIEwHDCRSBUE3XHxcFVUWF3RaTwEoSVoAYwtBDkZUFBYhW0IAKElVVTUFR21yVEMldQl2VHsaWwdkBhFVRWdzEkU4dl17HVwDYDMTbUNnAUEpAUJRfRpcSGcDEVpAVEYWfQ92VUsa; __jda=122270672.1126324296.1544405080.1545968922.1545980857.16; __jdc=122270672; ceshi3.com=000; TrackID=11EpDXYHaqwJE15W6paeMk_GMm05o3NUUeze9XyIcFs33GGxX8knxMpxWTeID75qSiUlj31s8CtKJs4hJUV-7CvKuiOEyDd8bvOCH7zzigeI; __jdv=122270672|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_55963436def64e659d5de48416dfeaff|1545980984854; 3AB9D23F7A4B3C9B=OA3G4SO3KYLQB6H3AIX36QQAW34BF376WJN66IUPEQAG6FUA2NWGM6R6MBDL32HLDG62WL2FICMYIVMOU6ISUWHKPE; shshshfp=1ed96ad08a7585648cd5017583df22bd; _gcl_au=1.1.162218981.1545981094; JSESSIONID=305879A97D4EA21F4D5C4207BB81423F.s1; shshshsID=c8c51ee0c5b1ddada7c5544abc3eea8a_5_1545981289039; __jdb=122270672.11.1126324296|16.1545980857; thor=3A30EBABA844934A836AC9AA37D0F4B85306071BD7FC64831E361A626E76F6977EC7794D06F2A922AEABF7D3D7DC22FBE2EB6B240F81A13F5A609368D4185BA0081D7C34A93760063D2F058F5B916835B4960EC8A9122008745971D812BA9E4AE48542CCC5A42E5CD786CC93770E520E36F950614C06A7EB05C8E1DD93EEA844B2EBA9B0136063FCFB6B7C83AECA828774041A9FED7BD98496689496122822FF',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    response = requests.get("https://search.jd.com/Search?keyword=%s&enc=utf-8" % (name), headers=headers)
    html = BeautifulSoup(response.text, 'lxml')
    list = html.find_all("li", class_='gl-item')
    numbers = []
    for number in list:
        numbers.append(int(number.get("data-sku")))
    return numbers


def main():
    get_comment(100031406046, "差评-小米12Pro")
    #time.sleep(0.5)
    print("爬取完毕")


main()

 

遇到的问题:京东每种商品只能显示100页评论,一页十条,也就是最多只能爬到1000条评论,另外一个问题是网站能爬取到的内容与题目给出的不完全相符,应该是因为网站的更新。

2、数据预处理:要求使用MapReduce或者kettle实现源数据的预处理,对大量的Json文件,进行清洗,以得到结构化的文本文件。(10分)

1)去除用户评论表的重复记录;

2)按照清洗后的数据格式要求提取相应的数据字

 

 

 清洗后的标准文本格式:

 

1.    "id": 10432588299,  
2.    "guid": "6c1d83b1-ac45-4189-a041-774eaff87df9",  
3.    "content": "割手,相当的割手,无语了",   //评论内容 √  
4.    "creationTime": "2017-05-22 23:37:24",  //写评论的时间 √  
5.    "isTop": false,                         //是否置顶  
6.    "referenceTime": "2017-05-20 18:35:11", //收货时间 √  
7.    "score": 3,                             //打分 √  
8.    "nickname": "j***柜",                   //昵称  √  
9.    "userLevelName": "金牌会员",            //会员级别 √  
10.    "userClientShow": "来自京东iPhone客户端",//评论设备  
11.    "isMobile": true,                       //是否移动端  
12.    "days": 2,                              //评论时间距【收货/下单】时间多长时间  

爬取到的已经是处理好的,这步跳过

3、数据统计:生成Hive用户评论数据:(15分)

1)在Hive创建一张表,用于存放清洗后的数据,表名为pinglun,字段名、字符类型、字段解释如下:

1.    productid        string  产品ID  
2.    commentcount     int     评论数  
3.    goodcount        int     好评数  
4.    generalcount     int     中评数  
5.    poorcount        int     差评数  
6.    goodrateshow     float   好评率  
7.    generalrateshow  float   中评率  
8.    poorrateshow     float   差评率  
9.    guid             string  随机生成ID  
10.    content          string  评论内容  
11.    creationtime     string  写评论的时间  
12.    score            int     打分  
13.    nickname         string  昵称  
14.    userlevelname    string  会员级别  
15.    userclientshow   string  评论设备  
16.    ismobile         string  是否移动端  
17.    days             int     评论时间距【收货/下单】时间多长时间  
create table comment(
guid string,
nickname string,
content string,
score int,
creationTime string,
is_Top string,
plus string,
referenceTime string,
days int,
is_Mobile string
)
row format delimited fields terminated by ',';

load data local inpath '/data/jd/jd.csv' into table comment;

需求1:分析用户使用移动端购买还是PC端购买,及移动端和PC端的用户比例,生成ismobilehive表,存储统计结果;

建表,ismobilehive

insert into table ismobilehive select 'Android',count(*) from comment where is_mobile = 'Android客户端';
insert into table ismobilehive select 'IOS',count(*) from comment where is_mobile = 'IOS客户端';
insert into table ismobilehive select 'PC',count(*) from comment where is_mobile = 'PC端';

需求2:分析用户评论周期(收到货后,一般多久进行评论),生成dayssql表,存储统计结果;

create table dayssql(
day int,
num string
)
row format delimited fields terminated by ',';

**************************************************************************************************************
insert into table dayssql select '0',count(*) from comment01 where days = '0';
insert into table dayssql select '1',count(*) from comment01 where days = '1';
insert into table dayssql select '2',count(*) from comment01 where days = '2';
insert into table dayssql select '3',count(*) from comment01 where days = '3';
insert into table dayssql select '4',count(*) from comment01 where days = '4';
insert into table dayssql select '5',count(*) from comment01 where days = '5';
insert into table dayssql select '6',count(*) from comment01 where days = '6';
insert into table dayssql select '7',count(*) from comment01 where days = '7';
insert into table dayssql select '8',count(*) from comment01 where days = '8';
insert into table dayssql select '9',count(*) from comment01 where days = '9';
insert into table dayssql select '10',count(*) from comment01 where days = '10';
insert into table dayssql select '11',count(*) from comment01 where days = '11';
insert into table dayssql select '12',count(*) from comment01 where days = '12';
insert into table dayssql select '13',count(*) from comment01 where days = '13';
insert into table dayssql select '14',count(*) from comment01 where days = '14';
insert into table dayssql select '大于14天小于21天',count(*) from comment01 where days > '14' and days <= '21';
insert into table dayssql select '大于21天小于30天',count(*) from comment01 where days > '21' and days <= '30';
insert into table dayssql select '大于30天',count(*) from comment01 where days > '30';
**************************************************************************************************************

需求3:分析会员级别(判断购买此商品的用户级别),生成userlevelname_out表,存储统计结果;

create table userlevelname_out(
level string,
num int
)
row format delimited fields terminated by ',';

insert into table userlevelname_out select 'PLUS会员',count(*) from comment01 where plus = 'PLUS会员';
insert into table userlevelname_out select '非会员',count(*) from comment01 where plus = '非会员';

需求4:分析每天评论量,生成creationtime_out表,存储统计结果;

create table creationtime_out(
day string,
num int
)
row format delimited fields terminated by ',';

insert into table creationtime_out select '2021年12月',count(*) from comment01 where creationtime between '2021-12-01 00:00:00' and '2021-12-31 24:00:00';
insert into table creationtime_out select '2022年01月',count(*) from comment01 where creationtime between '2022-01-01 00:00:00' and '2022-01-31 24:00:00';
insert into table creationtime_out select '2022年02月',count(*) from comment01 where creationtime between '2022-02-01 00:00:00' and '2022-02-28 24:00:00';
insert into table creationtime_out select '2022年03月',count(*) from comment01 where creationtime between '2022-03-01 00:00:00' and '2022-03-31 24:00:00';

select * from creationtime_out;

需求5:日期格式标准化需求5:日期格式标准化

功能为:去掉评论时间的时分秒,只保留年月日

create table comment02(
guid string,
nickname string,
content string,
score int,
creationTime string,
is_Top string,
plus string,
referenceTime string,
days int,
is_Mobile string
)
row format delimited fields terminated by ',';

insert into table comment02 select guid,nickname,content,score,substr(creationTime,0,10),is_Top,plus,substr(referenceTime,0,10),days,is_Mobile from comment;

4、 利用Sqoop进行数据迁移至Mysql数据库:(5分)

将处理好的数据导入到mysql数据表中

sqoop export --connect jdbc:mysql://192.168.51.100:3306/test01 --username root --password whyjlbcdy2001 --table ismobilehive --export-dir /user/hive/warehouse/mi12pro.db/ismobilehive --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' --m 1

sqoop export --connect jdbc:mysql://192.168.51.100:3306/test01 --username root --password why --table dayssql --export-dir /user/hive/warehouse/mi12pro.db/dayssql --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' --m 1

sqoop export --connect jdbc:mysql://192.168.51.100:3306/test01 --username root --password why --table userlevelname_out --export-dir /user/hive/warehouse/mi12pro.db/userlevelname_out

sqoop export --connect jdbc:mysql://192.168.51.100:3306/test01 --username root --password why --table creationtime_out --export-dir /user/hive/warehouse/mi12pro.db/creationtime_out

sqoop export --connect jdbc:mysql://192.168.51.100:3306/test01 --username root --password why --table mi12pro --export-dir /user/hive/warehouse/mi12pro.db/comment02 --input-fields-terminated-by ',' --input-null-string '\\N' --input-null-non-string '\\N' --m 1

遇到的问题:

有些表需要设置一些特殊参数才能正常导入(--input-null-string '\\N';--input-null-string ',')等

5、 数据可视化:利用JavaWeb+Echarts完成数据图表展示过程(20分)

很简单,利用导出的mysql数据画图就行了

需求1:可视化展示截图

把用户对京东进行评论时使用的是客户端还是移动端的统计数据,用饼图进行数据展示,从而达到让观者能从中熟悉某个项目与整个数据组间所存在的比例关系的目的。

 

需求2:可视化展示截图

 

把用户在收到货后,一般多久进行评论,即用户评论周期用柱状图展示,可以达到展现数据并将数据进行比较的目的。

 

 

 需求3:可视化展示截图

将购买某商品的用户级别进行统计的结果数据用饼状图展示,从而可以展现用户级别的比例构成关系,让观者能从中熟悉某个级别的用户数量与所有购买用户所存在的比例关系。

 

 

 需求4:可视化展示截图

将某件商品的每天的评论量的统计数据用折线图进行展现,可以展现出这个商品每天的评论量的变化趋势。

 

 

 6、 中文分词实现用户评价分析。(20分)

(1)本节通过对商品评论表中的差评数据,进行分析,筛选用户差评点,以知己知彼。(筛选差评数据集截图)

(2)利用 python 结巴分词实现用户评价信息中的中文分词及词频统计;(分词后截图)

(3)在 hive 中新建词频统计表并加载分词数据;

要求实现:

①实现用户评价信息中的中文分词;

②实现中文分词后的词频统计;

③在 hive 中新建词频统计表加载分词数据;

④柱状图可视化展示用户差评的统计前十类。

 

⑤用词云图可视化展示用户差评分词。

 

 

步骤:首先使用之前的爬虫,设置参数只爬取差评,爬完后用jieba做词频统计,最后使用词云图进行展示

jieba分词

import jieba
import sys

# log
class Logger(object):
    def __init__(self, fileN="Default.log"):
        self.terminal = sys.stdout
        self.log = open(fileN, "a")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        pass


txt = open("C:\\Users\wuhao\Desktop\差评集.csv", "r", encoding='utf-8').read()
words = jieba.lcut(txt , cut_all=True)  # 使用精确模式对文本进行分词
counts = {}  # 通过键值对的形式存储词语及其出现的次数

for word in words:
    if len(word) == 1:  # 单个词语不计算在内
        continue
    else:
        counts[word] = counts.get(word, 0) + 1  # 遍历所有词语,每出现一次其对应的值加 1

items = list(counts.items())  # 将键值对转换成列表
items.sort(key=lambda x: x[1], reverse=True)  # 根据词语出现的次数进行从大到小排序

sys.stdout = Logger("C:\\Users\wuhao\Desktop\差评词频.csv") #这里我将Log输出到D盘
#下面所有的方法,只要控制台输出,都将写入"D:\\1.txt"

for i in range(100):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word,count))

 

 

 

 

 7、 利用Spark进行实时数据分析。(20分)

本实验以京东商品评论为目标网站,架构采用爬虫+Flume+Kafka+Spark Streaming+Mysql,实现数据动态实时的采集、分析、展示数据。

 

 

 

具体工作流程如下图:

 

 

操作流程:

首先使用python脚本,爬取到当天的商品评论数量,将其作为文件发送到liunx服务器上。

之后使用flume与kafka连接,flume读取文件信息发送给kafka的消费者

最后通过sparkstreaming将kafka中的数据存入mysql,使用javaweb+echart进行图标展示

所需环境:

jdk8

zookeeper3.6.2

spark3.2.1

flume1.9.0

kafka2.4.1(scala2.11)

操作步骤:

0.安装、配置好运行环境

1.运行python脚本爬取京东商品评论数量并发送到服务器上

# -*- coding: utf-8 -*-
import gzip
import urllib.request
import json
import time
import random
import demjson as dj
import requests
import itertools

headers = {
    "Cookie": "__jdu=1507876332; shshshfpa=2ea021ee-52dd-c54e-1be1-f5aa9e333af2-1640075639; areaId=5; PCSYCityID=CN_0_0_0; shshshfpb=n7UymiTWOsGPvQfCup%2B3J1g%3D%3D; ipLoc-djd=5-142-42547-54561; jwotest_product=99; pinId=S4TjgVP4kjjnul02leqp07V9-x-f3wj7; pin=jd_60a1ab2940be3; unick=jd_60a1ab2940be3; ceshi3.com=000; _tp=672TNfWmOtaDFuqCPQqYycXMpi6F%2BRiwrhIuumNmoJ4%3D; _pst=jd_60a1ab2940be3; __jdc=122270672; shshshfp=4e8d45f57897e469586da47a0016f20e; ip_cityCode=142; CCC_SE=ADC_rzqTR2%2bUDTtHDYjJdX25PEGvHsBpPY%2bC9pRDVdNK7pU%2fwikRihpN3XEXZ1cn4Jd4w5OWdpJuduhBFwUvdeB6X1VFb7eIZkqL0OJvBn9RB6AJYo4An%2fGTiU%2b8rvqQwYxBI4QCM8a9w9kYQczygSjPxPjn1pbQLtBgo%2fzKBhwfKhAWs563NfBjmnRlkGHPX6E7jy6%2fEdfEhtkNSTCQod238cEpUFpKiQ%2bWV%2bW8MiaL3ti7d7ozdlNbZ03ylqRbI1XrXylDiqzW%2b2uALhF5H1eHuk3yH3t4ojXZmRbDy3k2OoZFk%2bcmrXD0eWhcIaD5RnhHbToYLuX%2byx7otaPuemTVAG4Z7CSyEfmUBAj7QuGmHg647a7KuoaR3hoCvxj%2f3woXdd2H9b40oqmJ5PO958Z1g%2fr7Jbk8a5w2CU547IaXRzakehLhW9xzG57Ak0Jhv85Jlt9A5N6hl%2ft4DSAwh%2bGhwg%3d%3d; unpl=JF8EAJJnNSttDBxWAxxSEkUVQg4EW1QKTx9TazcCAV8KSFICE1FIF0N7XlVdXhRKFR9vYhRUW1NPVA4ZBysSEXteVV1YCE0TAGlnNWRtW0tkBCsCHxMWQltTXF8LeycDZ2M1VFxZSlYGHQEbEBBCbWRbXQlKFQBpYQVQbVl7VTVZbEJTDBkCBxNdDEoRCmlgB1ZeaEpkBg; JSESSIONID=347F847A6818E35675648739BD4BA9FF.s1; __jda=122270672.1507876332.1640075637.1647251498.1647261295.13; thor=8D225D1673AA75681B9D3811417B0D325568BB2DD7F2729798D3AECF0428F59F7C70EA7504347F8E059F895AEE7D6E2662F565665845F0D94F2D7D56739CF3BC2B15F5F6E2ADDB891DDA80A9E9F88B7BA0BA95147512F78D28D8095E52379AB78550E451558DB6595C2270A1D5CFA2E211FF20F22ADA1987C6AE9E864DA6A7364D5BFD3EE08DA597D2EF2B37444CFD7A47134EFFD71B3A70B0C8BD55D51F274F; token=397b2c7c58f4021bbe9a9bbe9eeda694,3,915145; __tk=46fbcc7e51f75824dcdc2e8820904365,3,915145; shshshsID=5c5095f0b5728a839c0397308d625da5_1_1647261360535; __jdb=122270672.2.1507876332|13.1647261295; __jdv=122270672|jd.idey.cn|t_2024175271_|tuiguang|ef376a8f48ba48359a5a6d3c2769bb4b|1647261360584; 3AB9D23F7A4B3C9B=24HI5ARAA3SK7RJERTWUDZKA2NYJIXX3ING24VG466VC3ANKUALJLLD7VBYLQ3QPRYUSO3R6QBJYLVTVXBDIGJLGBA",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39"
}
headers2 = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "cookie": "__jdu=1507876332; shshshfpa=2ea021ee-52dd-c54e-1be1-f5aa9e333af2-1640075639; areaId=5; PCSYCityID=CN_0_0_0; ipLoc-djd=5-142-42547-54561; pinId=S4TjgVP4kjjnul02leqp07V9-x-f3wj7; pin=jd_60a1ab2940be3; unick=jd_60a1ab2940be3; _tp=672TNfWmOtaDFuqCPQqYycXMpi6F%2BRiwrhIuumNmoJ4%3D; _pst=jd_60a1ab2940be3; user-key=a2aaf011-2c1e-4dea-bf76-3392d16b1fb1; __jdc=122270672; wlfstk_smdl=jlwwba2gmccq62touff9evvbp3fk8gbr; ceshi3.com=000; shshshfp=4e8d45f57897e469586da47a0016f20e; ip_cityCode=142; shshshfpb=n7UymiTWOsGPvQfCup%2B3J1g%3D%3D; joyya=1647305570.1647305909.27.0kop377; __jda=122270672.1507876332.1640075637.1647314039.1647318046.22; token=d5899471c4530886f6a9658cbea3ca94,3,915176; __tk=1570759a7dd1a720b0db2dec5df8d044,3,915176; CCC_SE=ADC_Wj0UWzuXioxsiUvbIxw9PbW9q011vNMASHkfjXFO%2fZlkeGDtZUHe5qgaEpWv8RDEkCruGSGmCItsvHjIZ3aHbh9heUjNIZh6WZl9ZDfDokk66kRX6I%2by%2bDsdf4JtPOQUuULSsWOA%2fcDyP7Bb91YuHOwNnciLtS97UIKO7XA5sAd34Rf4XDKijy6Fw1DFTx%2b7izzme6YALuLp9Y%2bByC6aUTDzU9te7g1BZXPXtfGGwqu52ZVkdVId2jpxPnhX24fFD9WI9aX1qgswZ1PPZSGYKswUkqXhIf2S9aLFkjXW2n61LVzw2ZeqJRQI8QIcmi%2fF7WHOHLbWScnKwG594WIk0SRiCa0n2aEJAhVlXmzEE%2f5%2f%2bXWsKhlneTLduVs52ST5m96zdx%2bLnNGgDERqznFNu3AT5zvLcN0PyVq08n4keSv2ngLLTZK4QQJslS4he9MT3XJoEUfe9L8beZNh1239eLHYF6w4KWMCWWTfwxdCUOY%3d; unpl=JF8EAJZnNSttDEhSAkwDE0dEGAoEWw8LSh9TbjRVXV5QHFIDGwMfGhd7XlVdXhRKFR9vYxRUXlNIUw4ZBysSEXteVV1YCE0TAGlnNWRtW0tkBCsCHxMWQltTXF8LeycDZ2M1VFxZSlYGGwcTEhhObWRbXQlKFQBpYQVQbVl7VTVNbBsTEUpcVVteDENaA2tmA11bX0lWBisDKxE; __jdv=122270672|jd.idey.cn|t_2024175271_|tuiguang|e276f09debfa4c209a0ba829f7710596|1647318395561; thor=8D225D1673AA75681B9D3811417B0D325568BB2DD7F2729798D3AECF0428F59F4C39726C44E930AA2DD868FC4BCA33EA0D52228F39A68FC9F5C1157433CAACF1110B20B6975502864453B70E6B21C0ED165B733359002643CD05BDBA37E4A673AF38CC827B6013BCB5961ADA022E57DB6811E99E10E9C4E6410D844CD129071F7646EC7CE120A0B3D2F768020B044A010452D9F8ABD67A59D41880DD1991935C; 3AB9D23F7A4B3C9B=24HI5ARAA3SK7RJERTWUDZKA2NYJIXX3ING24VG466VC3ANKUALJLLD7VBYLQ3QPRYUSO3R6QBJYLVTVXBDIGJLGBA; __jdb=122270672.5.1507876332|22.1647318046; shshshsID=d7a96097b296c895558adfd840546a72_5_1647318650562",
    "referer": "https://search.jd.com/"
}
def crawlProductComment(url):
    # 读取原始数据(注意选择gbk编码方式)
    try:
        req = requests.get(url=url, headers=headers2).text
        reqs = req.replace("fetchJSON_comment98(", "").strip(');')
        print(reqs)
        jsondata = json.loads(reqs)
        # 遍历商品评论列表
        comments = jsondata['comments']
        return comments
    except IOError:
        print("Error: gbk不合适")
    # 从原始数据中提取出JSON格式数据(分别以'{'和'}'作为开始和结束标志)

def getProduct(url):
    ids = []
    req = requests.get(url=url, headers=headers2).text
    reqs = req.replace("jQuery1544821(", "").strip(')')
    jsondata = json.loads(reqs)['291']
    for i in range(0, len(jsondata)):
        ids.append(jsondata[i]['sku_id'])
    print(ids)
    return ids



import paramiko

#服务器信息,主机名(IP地址)、端口号、用户名及密码
hostname = "192.168.51.100"
port = 22
username = "root"
password = "whyjlbcdy2001"


client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname, port, username, password, compress=True)
sftp_client = client.open_sftp()
remote_file = sftp_client.open("/opt/a.log",'a+')  # 文件路径
ids = []
for i in range(2,3):
    product_id = getProduct(
        "https://search-x.jd.com/Search?callback=jQuery1544821&area=5&enc=utf-8&keyword=%E7%94%B7%E5%A3%AB%E8%BF%90%E5%8A%A8%E9%9E%8B&adType=7&page="+str(i)+"&ad_ids=291%3A33&xtest=new_search&_=1647325621019")
    time.sleep(random.randint(1, 3))
    ids.append(product_id)

data = []
count = 0
for k in list(set(itertools.chain.from_iterable(ids))):
    for i in range(0, 100):
        url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=' + str(
            k) + '&score=0&sortType=5&page=' \
              + str(i) + '&pageSize=10&isShadowSku=0&fold=1'
        comments = crawlProductComment(url)
        if len(comments) <= 0:
            break
        print(comments)
        remote_file.writelines(str(len(comments))+"\n")
        data.extend(comments)
        # 设置休眠时间
        time.sleep(random.randint(1, 5))
        print('-------', i)

    print("这是第{}类商品".format(count))
    count += 1

2.编写flume启动所需的配置文件,启动flume(配置文件在启动命令中作为参数引用)

配置文件

pro.sources = s1
pro.channels = c1
pro.sinks = k1
 
pro.sources.s1.type = exec
pro.sources.s1.command = tail -F /opt/a.log
 
pro.channels.c1.type = memory
pro.channels.c1.capacity = 1000
pro.channels.c1.transactionCapacity = 100
 
pro.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
pro.sinks.k1.kafka.topic = ct
pro.sinks.k1.kafka.bootstrap.servers = node01:9092,node02:9092,node03:9092
pro.sinks.k1.kafka.flumeBatchSize = 20
pro.sinks.k1.kafka.producer.acks = 1
pro.sinks.k1.kafka.producer.linger.ms = 1
pro.sinks.k1.kafka.producer.compression.type = snappy
 
pro.sources.s1.channels = c1
pro.sinks.k1.channel = c1

这里要与上传到服务器的数据文件路径一致

  pro.sources.s1.command = tail -F /opt/a.log  

这个是集群的ip和运行端口

  pro.sinks.k1.kafka.bootstrap.servers = node01:9092,node02:9092,node03:9092

这个是topic的名称,与自己创建的名称一致就行

  pro.sinks.k1.kafka.topic = ct

遇到的坑:最好不要在里面使用'#'来添加中文注释,否则可能会报错

启动flume

/home/apache-flume-1.9.0-bin/bin/flume-ng agent --conf-file /home/apache-flume-1.9.0-bin/conf/flume-kafka.conf --name pro -Dflume.root.logger=INFO,LOGFILE

3.启动kafka,创建一个topic并启动

为了验证kafka的正常启动,我们打开两个终端窗口,一个当消费者一个当生产者,来验证是否能正常通信

#创建topic
/home/kafka_2.11-2.4.1/bin/kafka-topics.sh --create --zookeeper node01:2181 --replication-factor 1 --partitions 3 --topic ct
#启动topic(生产者)
/home/kafka_2.11-2.4.1/bin/kafka-console-producer.sh --broker-list 192.168.51.100:9092 --topic ct
#模拟消费者
/home/kafka_2.11-2.4.1/bin/kafka-console-consumer.sh --bootstrap-server node01:9092 --topic ct --from-beginning

如果正常的话,生产者窗口发送消息,消费者窗口可以正常收到消息。

同时,如果已经运行了python脚本,上传了数据到虚拟机,且打开了flume,没有报错的话现在已经开始读取数据了,类似下面的图

 

 4.运行sparkstreaming程序(Scala语言),将kafka中的数据写入mysql

package spark

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import java.sql.{Connection, DriverManager}
import java.text.SimpleDateFormat
import java.util.Date


object Streaming {
  def main(args: Array[String]): Unit = {
    //TODO 0.准备环境
    val conf: SparkConf = new SparkConf().setAppName("spark").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")
    //the time interval at which streaming data will be divided into batches
    val ssc: StreamingContext = new StreamingContext(sc,Seconds(5))//每隔5s划分一个批次
    ssc.checkpoint("./ckp")

    //TODO 1.加载数据-从Kafka
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.51.100:9092",//kafka集群地址
      "key.deserializer" -> classOf[StringDeserializer],//key的反序列化规则
      "value.deserializer" -> classOf[StringDeserializer],//value的反序列化规则
      "group.id" -> "ct",//消费者组名称
      //earliest:表示如果有offset记录从offset记录开始消费,如果没有从最早的消息开始消费
      //latest:表示如果有offset记录从offset记录开始消费,如果没有从最后/最新的消息开始消费
      //none:表示如果有offset记录从offset记录开始消费,如果没有就报错
      "auto.offset.reset" -> "latest",
      "auto.commit.interval.ms"->"5000",//自动提交的时间间隔
      "enable.auto.commit" -> (true: java.lang.Boolean)//是否自动提交
    )
    val topics = Array("ct")//要订阅的主题
    //使用工具类从Kafka中消费消息
    val kafkaDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, //位置策略,使用源码中推荐的
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) //消费策略,使用源码中推荐的
    )

    //TODO 2.处理消息
    val infoDS: DStream[Int] = kafkaDS.map(record => {
      val nowDate = new Date()
      val strDate: String = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(nowDate)
      val value: String = record.value()
      Class.forName("com.mysql.jdbc.Driver")
      //获取mysql连接
      val conn: Connection = DriverManager.getConnection("jdbc:mysql://192.168.51.100:3306/test01?useUnicode=true&characterEncoding=utf-8", "root", "whyjlbcdy2001")
      //把数据写入mysql
      try {
        val sql: String = "insert into spider(spider_time,number)values('" + strDate + "','" + value.toInt + "')"
        conn.prepareStatement(sql).executeUpdate()
      } finally {
        conn.close()
      }
      value.toInt
    })
    //TODO 3.输出结果
    infoDS.print()
    //TODO 4.启动并等待结束
    ssc.start()
    ssc.awaitTermination()//注意:流式应用程序启动之后需要一直运行等待手动停止/等待数据到来

    //TODO 5.关闭资源
    ssc.stop(stopSparkContext = true, stopGracefully = true)//优雅关闭

  }
}

正常应该是这样的

 

 打开数据库,可以看到已经有数据了,并且随着python的爬取,数据还在不断增加

 

5.连接数据库做可视化展示

SELECT LEFT(spider_time,10),SUM(number) FROM `spider` GROUP BY LEFT(spider_time,10);

结束

 

 

 

 

参考同学的博客

2019级Spark项目应用-电子商务大数据分析总结 - 睡觉不困 - 博客园 (cnblogs.com)

 

 

 

 

 

 

 

 

 

 

 

1.  "productCommentSummary": {  

2.  "goodRateShow": 95, //好评率  

3.  "poorRateShow": 3, //差评率  

4.  "poorCountStr""1900+",  

5.  "averageScore": 5, //平均分  

6.  "generalCountStr""1600+",  

7.  "oneYear": 0,  

8.  "showCount": 21000,  

9.  "showCountStr""2.1+",  

10.  "goodCount": 64000, //好评数  

11.  "generalRate": 0.024,  

12.  "generalCount": 1600, //中评数  

13.  "skuId": 4432058,  

14.  "goodCountStr""6.4+",  

15.  "poorRate": 0.028,  

16.  "afterCount": 2400,  

17.  "goodRateStyle": 142,  

18.  "poorCount": 1900, //差评数  

19.  "skuIds"null,  

20.  "poorRateStyle": 4,  

21.  "generalRateStyle": 4,  

22.  "commentCountStr""6.8+",  

23.  "commentCount": 68000,  

24.  "productId": 4432058, //商品id  

25.  "afterCountStr""2400+",  

26.  "goodRate": 0.948,  

27.  "generalRateShow": 2 //中评率  

28.  },  

comments中一条评论的结构:

 

1.  “id": 10432588299,  

2.  “guid": "6c1d83b1-ac45-4189-a041-774eaff87df9",  

3.  “content": "割手,相当的割手,无语了", //评论内容 √  

4.  “creationTime": "2017-05-22 23:37:24", //写评论的时间 √  

5.  “isTop": false,                        //是否置顶  

6.  “referenceTime": "2017-05-20 18:35:11", //收货时间 √  

7.  “firstCategory": 9987,                 //第一分类 √  

8.  “secondCategory": 653,                 //第二分类 √  

9.  “thirdCategory": 655,                  //第三分类 √  

10.  “replyCount": 0,  

11.  “score": 3,                            //打分 √  

12.  “nickname": "j***",                  //昵称  √  

13.  “userClient": 2,  

14.  “productColor": "碳黑色",  

15.  “productSize": "32GB",  

16.  “userLevelName": "金牌会员",           //会员级别 √  

17.  “plusAvailable": 0,  

18.  “productSales": [  

19.     {  

20.         "dim": 3,  

21.         "saleName""选择套装",  

22.         "saleValue""官方标配"  

23.     }  

24.  ,  

25.  “userClientShow": "来自京东iPhone客户端",//评论设备  

26.  “isMobile": true,                       //是否移动端  

27.  “days": 2,                              //评论时间距【收货/下单】时间多长时间  

28.  “afterDays": 0  

 

posted @ 2022-03-17 22:16  Arisf  阅读(542)  评论(0编辑  收藏  举报