13.友盟项目--数据清洗-转储升级版-pySpark
pySpark不用编译
1.利用java程序生成分区转储SQL语句
java程序

@Test public void genSQL() throws IntrospectionException { Class[] classes = { AppStartupLog.class , AppEventLog.class , AppErrorLog.class , AppPageLog.class , AppUsageLog.class } ; for(Class clz : classes){ System.out.println(doGenSQL(clz)); } } public static String doGenSQL(Class clazz) throws IntrospectionException { String RN = "\r\n" ; String simpleClassName = clazz.getSimpleName().toLowerCase() ; /** * 运行速度快慢为:StringBuilder > StringBuffer > String * 在线程安全上,StringBuilder是线程不安全的,而StringBuffer是线程安全的 * String:适用于少量的字符串操作的情况 * StringBuilder:适用于单线程下在字符缓冲区进行大量操作的情况 * StringBuffer:适用多线程下在字符缓冲区进行大量操作的情况 */ StringBuilder builder = new StringBuilder() ; builder.append(String.format("%s" , RN)) ; builder.append(String.format("%s" , RN)) ; builder.append(String.format("-- %s %s" , simpleClassName,RN)) ; builder.append(String.format("insert into %ss partition(ym , day) %s" , simpleClassName,RN)) ; builder.append(String.format("select%s",RN)) ; //通过反省得到成员变量 BeanInfo bi = Introspector.getBeanInfo(clazz) ; PropertyDescriptor[] pps = bi.getPropertyDescriptors() ; for(PropertyDescriptor pp : pps){ String name = pp.getName() ; Class type = pp.getPropertyType(); if(type != String.class && type != int.class && type != Integer.class && type != Long.class && type != long.class ){ continue; } Method get = pp.getReadMethod() ; Method set = pp.getWriteMethod() ; if(get != null && set != null){ builder.append(String.format(" t.%s ,%s",name, RN)); } } builder.append(String.format("date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,%s", RN)); builder.append(String.format("date_format(cast(t.createdatms as timestamp) , 'dd')%s", RN)); builder.append(String.format("from%s", RN)); builder.append(String.format(" (%s", RN)); builder.append(String.format(" select fork%ss(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10%s",simpleClassName.substring(3), RN)); builder.append(String.format(" )t;%s" , RN)); return builder.toString() ; }
2.结果
分别保存在五个sql文件中 传到centos下

-- appstartuplog use big12_umeng ; insert into appstartuplogs partition(ym , day) select t.appChannel , t.appId , t.appPlatform , t.appVersion , t.brand , t.carrier , t.country , t.createdAtMs , t.deviceId , t.deviceStyle , t.ipAddress , t.network , t.osType , t.province , t.screenSize , t.tenantId , date_format(cast(t.createdatms as timestamp) , 'yyyyMM') , date_format(cast(t.createdatms as timestamp) , 'dd') from ( select forkstartuplogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10 )t; -- appeventlog use big12_umeng ; insert into appeventlogs partition(ym , day) select t.appChannel , t.appId , t.appPlatform , t.appVersion , t.brand , t.createdAtMs , t.deviceId , t.deviceStyle , t.eventDurationSecs , t.eventId , t.osType , t.tenantId , date_format(cast(t.createdatms as timestamp) , 'yyyyMM') , date_format(cast(t.createdatms as timestamp) , 'dd') from ( select forkeventlogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10 )t; -- apperrorlog use big12_umeng ; insert into apperrorlogs partition(ym , day) select t.appChannel , t.appId , t.appPlatform , t.appVersion , t.brand , t.createdAtMs , t.deviceId , t.deviceStyle , t.errorBrief , t.errorDetail , t.osType , t.tenantId , date_format(cast(t.createdatms as timestamp) , 'yyyyMM') , date_format(cast(t.createdatms as timestamp) , 'dd') from ( select forkerrorlogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10 )t; -- apppagelog use big12_umeng ; insert into apppagelogs partition(ym , day) select t.appChannel , t.appId , t.appPlatform , t.appVersion , t.brand , t.createdAtMs , t.deviceId , t.deviceStyle , t.nextPage , t.osType , t.pageId , t.pageViewCntInSession , t.stayDurationSecs , t.tenantId , t.visitIndex , date_format(cast(t.createdatms as timestamp) , 'yyyyMM') , date_format(cast(t.createdatms as timestamp) , 'dd') from ( select forkpagelogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10 )t; -- appusagelog use big12_umeng ; insert 
into appusagelogs partition(ym , day) select t.appChannel , t.appId , t.appPlatform , t.appVersion , t.brand , t.createdAtMs , t.deviceId , t.deviceStyle , t.osType , t.singleDownloadTraffic , t.singleUploadTraffic , t.singleUseDurationSecs , t.tenantId , date_format(cast(t.createdatms as timestamp) , 'yyyyMM') , date_format(cast(t.createdatms as timestamp) , 'dd') from ( select forkusagelogs(servertimestr ,clienttimems ,clientip ,json) from raw_logs limit 10 )t;
3.pySpark python脚本动态执行sql脚本---实现日志转储
/home/centos
nano data_clean.py
#-*- coding:utf8 -*-
"""Data-cleaning driver: execute every SQL statement from a file via Spark.

Usage: spark-submit --jars <udf-jar> data_clean.py <sql-file>

Reads the whole file named by argv[1], splits it on ';' into individual
statements, and runs each non-empty one through a Hive-enabled SparkSession.
"""
from pyspark.sql import SparkSession
import sys

# Python-2-only hack so non-ASCII (Chinese) SQL text doesn't raise
# UnicodeDecodeError; NOTE(review): remove when migrating to Python 3,
# where reload()/setdefaultencoding no longer exist.
reload(sys)
sys.setdefaultencoding('utf8')

if __name__ == '__main__':
    # Local 4-core session with Hive support so `insert into ... partition`
    # statements hit the warehouse tables.
    sess = SparkSession.Builder() \
        .appName("dataClean") \
        .master("local[4]") \
        .enableHiveSupport() \
        .getOrCreate()

    # Read the whole script as one string; `with` guarantees the file
    # handle is closed even if Spark raises below. (Was: open() with no
    # close(), and the content bound to `str`, shadowing the builtin.)
    with open(sys.argv[1]) as f:
        script = f.read()

    # Statements are separated by ';'; strip surrounding whitespace and
    # skip the empty fragments the split leaves behind.
    for stmt in script.split(";"):
        stmt = stmt.strip(" \r\n")
        if stmt:
            # print() with a single argument works on both Python 2 and 3.
            print(stmt)
            sess.sql(stmt).show()
4.spark-submit --jars /soft/hive/umeng_hive.jar data_clean.py data_clean_event.sql
分类:
大数据项目
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· 一文读懂知识蒸馏
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下