新浪微博大数据分析

**1、组织数据 (需要处理每条数据开头和结尾的中括号)
(1)创建Hive表weibo_json(json string),表只有一个字段,导入所有数据,并验证查询前5条数据 **

    create table weibo_json(json string);
    load data local inpath '/root/weibo.txt' into table weibo_json;
    select * from weibo_json limit 5;

(2)解析完weibo_json当中的json格式数据到拥有19个字段的weibo表中,写出必要的SQL语句

    create table weibo as 
    select json_tuple(json,
    'beCommentWeiboId'
    ,'beForwardWeiboId'
    ,'catchTime'
    ,'commentCount'
    ,'content'
    ,'createTime'
    ,'info1'
    ,'info2'
    ,'info3'
    ,'mlevel'
    ,'musicurl'
    ,'pic_list'
    ,'praiseCount'
    ,'reportCount'
    ,'source'
    ,'userId'
    ,'videourl'
    ,'weiboId'
    ,'weiboUrl'
    ) as 
    (beCommentWeiboId
    ,beForwardWeiboId
    ,catchTime
    ,commentCount
    ,content
    ,createTime
    ,info1
    ,info2
    ,info3
    ,mlevel
    ,musicurl
    ,pic_list
    ,praiseCount
    ,reportCount
    ,source
    ,userId
    ,videourl
    ,weiboId
    ,weiboUrl
    )
    from weibo_json;

2、统计微博总量 和 独立用户数

    select count(*) from weibo;
    +----------+--+
    |   _c0    |
    +----------+--+
    | 1451868  |
    +----------+--+
    select count(*) as Independent from
    (select count(*) from weibo group by userId) tmp;
    +--------------+--+
    | independent  |
    +--------------+--+
    | 78540        |
    +--------------+--+

3、统计用户所有微博被转发的次数之和,输出top5用户,并给出次数

    #select source from weibo limit 10;
    reportCount
    select userId,sum(reportCount) as total from weibo group by userId order by total desc limit 5;
    +-------------+--------------+--+
    |   userid    |    total     |
    +-------------+--------------+--+
    | 1793285524  | 7.6454805E7  |
    | 1629810574  | 7.3656898E7  |
    | 2803301701  | 6.8176008E7  |
    | 1266286555  | 5.5111054E7  |
    | 1191258123  | 5.4808042E7  |
    +-------------+--------------+--+

4、统计带图片的微博数

    select count(*) as total from weibo where pic_list='[]';
    +---------+--+
    |  total  |
    +---------+--+
    | 701356  |
    +---------+--+

5、统计使用iphone发微博的独立用户数

    select count(*) as total from
    (select count(*) as total from weibo where source='iPhone客户端' group by userId) tmp;
    +------+--+
    | total  |
    +------+--+
    | 921  |
    +------+--+

6、将用户所有微博的点赞人数和转发人数相加求和,并将相加之和降序排列,取前10条记录,输出userid和总次数

    select userId,(sum(praiseCount)+sum(reportCount)) as total from weibo group by userId order by total desc limit 10;
    
    +-------------+---------------+--+
    |   userid    |     total     |
    +-------------+---------------+--+
    | 1793285524  | 1.14941096E8  |
    | 1629810574  | 9.761207E7    |
    | 1266286555  | 8.3789422E7   |
    | 2803301701  | 7.4208822E7   |
    | 1195242865  | 6.9292231E7   |
    | 1191258123  | 6.1985742E7   |
    | 1197161814  | 5.9093308E7   |
    | 2656274875  | 5.2380775E7   |
    | 2202387347  | 5.1623117E7   |
    | 1195230310  | 4.8321083E7   |
    +-------------+---------------+--+

7、统计微博中评论次数小于1000的用户ID与数据来源信息,将其放入视图,然后统计视图中数据来源是”ipad客户端”的用户数目

    #创建表接收微博中评论次数小于1000的用户ID与数据来源信息
    create table small1000 as
    select userId,source from weibo where commentCount<1000 group by userId,source;
    #查询数据来源是”ipad客户端”的用户数目
    select count(*) as total from small1000 where source='iPad客户端';
    +--------+--+
    | total  |
    +--------+--+
    | 537    |
    +--------+--+

**8、统计微博内容中出现”iphone”次数最多的用户,
最终结果输出用户id和次数(注意:该次数是”iphone”的出现次数,
不是出现”iphone”的微博数目) **

    select userId,count(userid) from weibo where content like '%iphone%' group by userId;

9、求每天发微博次数最多的那个家伙的ID和发微博的条数

    #时间函数,原表存储的数据的时间是字符串的时间戳,所以需要将其转成长整型bigint
    from_unixtime(cast(createTime as bigint)
    #首先获取数据的userId,以及时间并转换为"yyyy-MM-dd"格式
    select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo;
    #然后按照时间,用户进行分组查询,获取每日用户发布微博数量,这里定义该查询结果为a
    select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time;
    #从a中按照获取的时间再进行分组查询,获取当日发布微博最大值,这里将查询结果定义为b
    select tmp1.Release_time as Release_time,max(tmp1.total) as big from
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) tmp1
    group by tmp1.Release_time;
    #最后b与a进行内连接,以获取导userId
    select a.Release_time as Release_time,b.userId as userId,a.big as total from
    (select tmp1.Release_time as Release_time,max(tmp1.total) as big from
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) tmp1
    group by tmp1.Release_time) a
    join 
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) b
    on a.Release_time=b.Release_time and a.big=b.total;

在这里插入图片描述

posted @ 2021-07-09 17:17  老酱  阅读(857)  评论(0编辑  收藏  举报