13. Umeng project -- data cleaning -- upgraded dump with pySpark

pySpark requires no compilation: the script is submitted as-is, so there is no jar to rebuild between changes.

1. Generate the partitioned dump SQL statements with a Java program
The Java program:
import java.beans.BeanInfo;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.lang.reflect.Method;

import org.junit.Test;

@Test
public void genSQL() throws IntrospectionException {
   Class[] classes = {
         AppStartupLog.class,
         AppEventLog.class,
         AppErrorLog.class,
         AppPageLog.class,
         AppUsageLog.class
   };
   for (Class clz : classes) {
      System.out.println(doGenSQL(clz));
   }
}

public static String doGenSQL(Class clazz) throws IntrospectionException {
   String RN = "\r\n";
   String simpleClassName = clazz.getSimpleName().toLowerCase();
   /**
    * Speed: StringBuilder > StringBuffer > String.
    * Thread safety: StringBuilder is not thread-safe, StringBuffer is.
    * String:        fine for a small number of string operations
    * StringBuilder: heavy buffer work on a single thread
    * StringBuffer:  heavy buffer work shared across threads
    */
   StringBuilder builder = new StringBuilder();
   builder.append(String.format("%s", RN));
   builder.append(String.format("%s", RN));
   builder.append(String.format("-- %s %s", simpleClassName, RN));
   builder.append(String.format("insert into %ss partition(ym , day) %s", simpleClassName, RN));
   builder.append(String.format("select%s", RN));
   // obtain the bean's member properties via introspection (reflection)
   BeanInfo bi = Introspector.getBeanInfo(clazz);
   PropertyDescriptor[] pps = bi.getPropertyDescriptors();
   for (PropertyDescriptor pp : pps) {
      String name = pp.getName();
      Class type = pp.getPropertyType();
      // keep only String/int/Integer/long/Long properties; this also skips
      // the synthetic "class" property contributed by Object.getClass()
      if (type != String.class
            && type != int.class
            && type != Integer.class
            && type != Long.class
            && type != long.class) {
         continue;
      }
      // require both getter and setter so only real bean fields become columns
      Method get = pp.getReadMethod();
      Method set = pp.getWriteMethod();
      if (get != null && set != null) {
         builder.append(String.format("  t.%s ,%s", name, RN));
      }
   }
   // derive the ym/day partition values from the record's creation timestamp
   builder.append(String.format("date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,%s", RN));
   builder.append(String.format("date_format(cast(t.createdatms as timestamp) , 'dd')%s", RN));
   builder.append(String.format("from%s", RN));
   builder.append(String.format(" (%s", RN));
   // e.g. "appstartuplog".substring(3) -> "startuplog" -> UDTF forkstartuplogs
   builder.append(String.format("    select fork%ss(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10%s", simpleClassName.substring(3), RN));
   builder.append(String.format("  )t;%s", RN));
   return builder.toString();
}
The Java program above generates the dump SQL statements.
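The AppStartupLog and sibling bean classes come from earlier posts in this series and are not reproduced here. As a minimal sketch of what doGenSQL expects -- plain bean properties with both getters and setters -- AppStartupLog presumably looks like the following. Only three of the fields from the generated column list are spelled out, and the long type of createdAtMs is an assumption (the generated SQL casts it to a timestamp):

// Hypothetical sketch of AppStartupLog; the remaining columns in the
// generated SQL (appChannel, brand, carrier, ...) follow the same pattern.
public class AppStartupLog {
   private String appId;
   private String deviceId;
   private long createdAtMs;   // assumed long, cast to timestamp in the SQL

   public String getAppId() { return appId; }
   public void setAppId(String appId) { this.appId = appId; }

   public String getDeviceId() { return deviceId; }
   public void setDeviceId(String deviceId) { this.deviceId = deviceId; }

   public long getCreatedAtMs() { return createdAtMs; }
   public void setCreatedAtMs(long createdAtMs) { this.createdAtMs = createdAtMs; }
}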

 

2. Results

Save the five generated statements into five separate .sql files and upload them to the CentOS host.

 
-- appstartuplog
use big12_umeng ;
insert into appstartuplogs partition(ym , day)
select
  t.appChannel ,
  t.appId ,
  t.appPlatform ,
  t.appVersion ,
  t.brand ,
  t.carrier ,
  t.country ,
  t.createdAtMs ,
  t.deviceId ,
  t.deviceStyle ,
  t.ipAddress ,
  t.network ,
  t.osType ,
  t.province ,
  t.screenSize ,
  t.tenantId ,
date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
date_format(cast(t.createdatms as timestamp) , 'dd')
from
(
    select forkstartuplogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10
  )t;



-- appeventlog
use big12_umeng ;
insert into appeventlogs partition(ym , day)
select
  t.appChannel ,
  t.appId ,
  t.appPlatform ,
  t.appVersion ,
  t.brand ,
  t.createdAtMs ,
  t.deviceId ,
  t.deviceStyle ,
  t.eventDurationSecs ,
  t.eventId ,
  t.osType ,
  t.tenantId ,
date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
date_format(cast(t.createdatms as timestamp) , 'dd')
from
(
    select forkeventlogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10
  )t;



-- apperrorlog
use big12_umeng ;
insert into apperrorlogs partition(ym , day)
select
  t.appChannel ,
  t.appId ,
  t.appPlatform ,
  t.appVersion ,
  t.brand ,
  t.createdAtMs ,
  t.deviceId ,
  t.deviceStyle ,
  t.errorBrief ,
  t.errorDetail ,
  t.osType ,
  t.tenantId ,
date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
date_format(cast(t.createdatms as timestamp) , 'dd')
from
(
    select forkerrorlogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10
  )t;



-- apppagelog
use big12_umeng ;
insert into apppagelogs partition(ym , day)
select
  t.appChannel ,
  t.appId ,
  t.appPlatform ,
  t.appVersion ,
  t.brand ,
  t.createdAtMs ,
  t.deviceId ,
  t.deviceStyle ,
  t.nextPage ,
  t.osType ,
  t.pageId ,
  t.pageViewCntInSession ,
  t.stayDurationSecs ,
  t.tenantId ,
  t.visitIndex ,
date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
date_format(cast(t.createdatms as timestamp) , 'dd')
from
(
    select forkpagelogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10
  )t;



-- appusagelog
use big12_umeng ;
insert into appusagelogs partition(ym , day)
select
  t.appChannel ,
  t.appId ,
  t.appPlatform ,
  t.appVersion ,
  t.brand ,
  t.createdAtMs ,
  t.deviceId ,
  t.deviceStyle ,
  t.osType ,
  t.singleDownloadTraffic ,
  t.singleUploadTraffic ,
  t.singleUseDurationSecs ,
  t.tenantId ,
date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
date_format(cast(t.createdatms as timestamp) , 'dd')
from
(
    select forkusagelogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10
  )t;

3. pySpark: a Python script that executes the SQL scripts dynamically, implementing the log dump

/home/centos
nano data_clean.py
#-*- coding:utf8 -*-
# Python 2 script: read a .sql file, split it into statements and run each
# statement through a Hive-enabled SparkSession.
from pyspark.sql import SparkSession
import sys
reload(sys)
sys.setdefaultencoding('utf8')

if __name__ == '__main__':
    # create a local, Hive-enabled session
    sess = SparkSession.Builder().appName("dataClean").master("local[4]").enableHiveSupport().getOrCreate()
    # the SQL file to execute is passed as the first command-line argument
    f = open(sys.argv[1])
    # read the whole file as a single string
    content = f.read()
    f.close()

    # split the script into individual statements on ";"
    arr = content.split(";")
    for sql in arr:
        sql2 = sql.strip(" \r\n")
        if sql2 != "":
            print sql2
            sess.sql(sql2).show()
 
4. Submit the job with spark-submit, shipping the UDTF jar; repeat once for each of the five SQL files:

spark-submit --jars /soft/hive/umeng_hive.jar data_clean.py data_clean_event.sql
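/soft/hive/umeng_hive.jar is what provides the fork*logs functions called in the SQL; their actual code is not shown in this post. Purely as a hedged sketch of the general shape such a Hive UDTF takes -- assuming the function was registered in Hive beforehand (e.g. with create function), and with placeholder column names and parsing -- forkstartuplogs might look like this:

// Hypothetical shape of a forkstartuplogs-style UDTF; the real implementation
// lives in umeng_hive.jar. Column names and parsing here are placeholders.
import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ForkStartupLogs extends GenericUDTF {
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // declare the output columns; the real UDTF declares one per bean field
        return ObjectInspectorFactory.getStandardStructObjectInspector(
                Arrays.asList("appid", "deviceid", "createdatms"),
                Arrays.<ObjectInspector>asList(
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                        PrimitiveObjectInspectorFactory.javaLongObjectInspector));
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // args: servertimestr, clienttimems, clientip, json
        // parse the json payload and emit one row per startup log (placeholder)
        forward(new Object[]{"someAppId", "someDeviceId", Long.valueOf(0L)});
    }

    @Override
    public void close() throws HiveException {
        // nothing to clean up
    }
}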
 
 