开发系列:01、使用Java和Maven开发Spark应用
1、POM.xml
1 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 2 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 <modelVersion>4.0.0</modelVersion> 4 5 <groupId>org.hansight.spark</groupId> 6 <artifactId>examples</artifactId> 7 <version>0.0.1-SNAPSHOT</version> 8 <packaging>jar</packaging> 9 10 <name>examples</name> 11 <url>http://maven.apache.org</url> 12 13 <properties> 14 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 <elasticsearch.version>1.2.1</elasticsearch.version> 16 <jdk.version>1.7</jdk.version> 17 <logback.version>1.1.2</logback.version> 18 <slf4j.version>1.7.7</slf4j.version> 19 <junit.version>4.11</junit.version> 20 <jcl.over.slf4j.version>1.7.7</jcl.over.slf4j.version> 21 <metrics.version>3.0.2</metrics.version> 22 <avro.version>1.7.7</avro.version> 23 <jna.version>4.1.0</jna.version> 24 <spark.version>1.0.2</spark.version> 25 </properties> 26 <dependencies> 27 <dependency> 28 <groupId>junit</groupId> 29 <artifactId>junit</artifactId> 30 <version>3.8.1</version> 31 <scope>test</scope> 32 </dependency> 33 <dependency> 34 <groupId>com.fasterxml.jackson.core</groupId> 35 <artifactId>jackson-core</artifactId> 36 <version>2.4.2</version> 37 </dependency> 38 <dependency> 39 <groupId>com.google.guava</groupId> 40 <artifactId>guava</artifactId> 41 <version>14.0.1</version> 42 <scope>provided</scope> 43 </dependency> 44 <dependency> 45 <groupId>org.apache.spark</groupId> 46 <artifactId>spark-streaming-kafka_2.10</artifactId> 47 <version>${spark.version}</version> 48 <exclusions> 49 <exclusion> 50 <groupId>javax.servlet</groupId> 51 <artifactId>servlet-api</artifactId> 52 </exclusion> 53 <exclusion> 54 <groupId>org.apache.hadoop</groupId> 55 <artifactId>hadoop-client</artifactId> 56 </exclusion> 57 </exclusions> 58 </dependency> 59 <dependency> 60 <groupId>org.elasticsearch</groupId> 61 <artifactId>elasticsearch</artifactId> 62 <version>1.2.1</version> 63 </dependency> 64 <dependency> 65 <groupId>org.apache.hadoop</groupId> 66 <artifactId>hadoop-hdfs</artifactId> 67 <version>2.4.0.2.1.4.0-632</version> 68 </dependency> 69 <dependency> 70 <groupId>org.apache.hadoop</groupId> 71 <artifactId>hadoop-common</artifactId> 72 <version>2.4.0.2.1.4.0-632</version> 73 <exclusions> 74 <exclusion> 75 <groupId>jdk.tools</groupId> 76 <artifactId>jdk.tools</artifactId> 77 </exclusion> 78 </exclusions> 79 </dependency> 80 <dependency> 81 <groupId>org.apache.hadoop</groupId> 82 <artifactId>hadoop-mapreduce-client-common</artifactId> 83 <version>2.4.0.2.1.4.0-632</version> 84 </dependency> 85 <dependency> 86 <groupId>org.elasticsearch</groupId> 87 <artifactId>elasticsearch-hadoop</artifactId> 88 <version>2.0.1</version> 89 </dependency> 90 </dependencies> 91 92 <build> 93 <plugins> 94 <plugin> 95 <groupId>org.apache.maven.plugins</groupId> 96 <artifactId>maven-compiler-plugin</artifactId> 97 <version>3.1</version> 98 <configuration> 99 <source>${jdk.version}</source> 100 <target>${jdk.version}</target> 101 </configuration> 102 </plugin> 103 <plugin> 104 <groupId>org.apache.maven.plugins</groupId> 105 <artifactId>maven-assembly-plugin</artifactId> 106 <version>2.4</version> 107 <configuration> 108 <descriptorRefs> 109 <descriptorRef>jar-with-dependencies</descriptorRef> 110 </descriptorRefs> 111 </configuration> 112 </plugin> 113 </plugins> 114 </build> 115 </project>
2、样例代码
1 package com.hansight.spark.utils; 2 3 import org.apache.spark.SparkConf; 4 import org.apache.spark.api.java.JavaSparkContext; 5 6 public class SparkUtils { 7 8 public static JavaSparkContext get(String name) { 9 SparkConf conf = new SparkConf(); 10 // conf.setMaster("local[1]"); 11 // conf.setMaster("spark://hdp125:7077"); 12 conf.setAppName(name); 13 return new JavaSparkContext(conf); 14 } 15 }
1 package com.hansight.spark.streaming; 2 3 import java.util.Iterator; 4 5 import org.apache.spark.api.java.JavaRDD; 6 import org.apache.spark.api.java.JavaSparkContext; 7 import org.apache.spark.api.java.function.Function; 8 import org.apache.spark.api.java.function.VoidFunction; 9 10 import com.hansight.spark.utils.SparkUtils; 11 12 public class HttpParser { 13 @SuppressWarnings({ "unchecked", "serial" }) 14 public static void main(String[] args) { 15 if (args.length == 0) { 16 System.out.println("Usage: <file path>"); 17 return; 18 } 19 System.setProperty("hadoop.home.dir", "E:/tools/hadoop-2.4.1"); 20 JavaSparkContext sc = SparkUtils.get("HttpLog"); 21 String path = args[0]; 22 JavaRDD<String> rdd = sc 23 .textFile(path); 24 JavaRDD<HttpLog> parsed = rdd.map(new Function<String, HttpLog>() { 25 public HttpLog call(String line) throws Exception { 26 return HttpLog.parser(line); 27 } 28 }); 29 System.out.println(parsed.count()); 30 parsed.foreachPartition(new VoidFunction<Iterator<HttpLog>>() { 31 @Override 32 public void call(Iterator<HttpLog> t) throws Exception { 33 HttpLog.save(t); 34 } 35 }); 36 } 37 }
1 package com.hansight.spark.streaming; 2 3 import java.lang.reflect.Field; 4 import java.util.HashMap; 5 import java.util.Iterator; 6 import java.util.Map; 7 8 import org.elasticsearch.action.bulk.BulkRequestBuilder; 9 import org.elasticsearch.action.bulk.BulkResponse; 10 import org.elasticsearch.client.Client; 11 12 import com.hansight.spark.utils.EsUtils; 13 14 public class HttpLog { 15 private String rawlog; 16 // VARCHAR2(8 BYTE) 记录类型,表示此记录为HTTP浏览业务记录(取值为’HTTP’) 17 private String RECORD_TYPE; 18 // TIMESTAMP 开始时间,格式为:YYYY-MM-DD HH24:MI:SS.xxxxxxxxx 19 private String CAPTURETIME; 20 // VARCHAR2(16 BYTE) 手机号码(从创建PDP上下文消息中获取) 21 private String MSISDN; 22 // IMSI VARCHAR2(18 BYTE) 国际移动用户识别码(从创建PDP上下文消息中获取) 23 private String IMSI; 24 // IMEI(SV) VARCHAR2(20 BYTE) IMEI(SV)号(从创建PDP上下文消息中获取) 25 private String IMEI; 26 // VARCHAR2(32 BYTE) APN 27 private String APN; 28 // UEIP VARCHAR2(20 BYTE) 终端的IP 29 private String UEIP; 30 // SPIP VARCHAR2(20 BYTE) SP的IP 31 private String SPIP; 32 // UEPORT NUMBER(12) 终端端口 33 private int UEPORT; 34 // SPPORT NUMBER(12) SP端端口 35 private int SPPORT; 36 // USERAGENT VARCHAR2(64 BYTE) User Agent信息 37 private String USERAGENT; 38 // URL VARCHAR2(256 BYTE) URL,该字段的错误率应不超过万分之一 39 private String URL; 40 // HOST VARCHAR2(64 BYTE) HOST信息 41 private String HOST; 42 // CONTENTLEN NUMBER(12) 内容大小 43 private String CONTENTLEN; 44 // CONTENTTYPE VARCHAR2(64 BYTE) 内容类型 45 private String CONTENTTYPE; 46 // BSKIP NUMBER(12) 是否是链接访问,0=否,1=是 47 private boolean BSKIP; 48 // REFERER VARCHAR2(256 BYTE) 链接源信息 49 private String REFERER; 50 // HTTPSTATUS NUMBER(12) 状态码,请参照附录HTTPSTATUS表 51 private long HTTPSTATUS; 52 // RESPDELAY NUMBER(12) 响应时延,单位毫秒 53 private long RESPDELAY; 54 // HTTPACTION NUMBER(12) HTTP操作类型(5: Post, 6:Get) 55 private String HTTPACTION; 56 // DURATION NUMBER(12) 持续时长 57 private long DURATION; 58 // FLOW NUMBER(12) 总流量 59 private long FLOW; 60 // UPFLOW NUMBER(12) 上行流量 61 private long UPFLOW; 62 // DOWNFLOW NUMBER(12) 下行流量 63 private long DOWNFLOW; 64 // SGSNIP VARCHAR2(20 BYTE) SGSN 用户面 IP 65 private String SGSNIP; 66 // GGSNIP VARCHAR2(20 BYTE) GGSN 用户面 IP 67 private String GGSNIP; 68 // LAC NUMBER(12) LAC信息 69 private long LAC; 70 // CI NUMBER(12) CI/SAC信息 71 private String CI; 72 // RATTYPE NUMBER(12) RAT Type,1=2G,2=3G 73 private String RATTYPE; 74 // STOPTIME TIMESTAMP 结束时间,格式为:YYYY-MM-DD HH24:MI:SS.xxxxxxxxx 75 private String STOPTIME; 76 // PBIP VARCHAR2(20 BYTE) 采集解析设备IP地址 77 private String PBIP; 78 // PBID NUMBER(12) 采集解析设备编号 79 private long PBID; 80 81 public String getRawlog() { 82 return rawlog; 83 } 84 85 public void setRawlog(String rawlog) { 86 this.rawlog = rawlog; 87 } 88 89 public String getRECORD_TYPE() { 90 return RECORD_TYPE; 91 } 92 93 public void setRECORD_TYPE(String rECORD_TYPE) { 94 RECORD_TYPE = rECORD_TYPE; 95 } 96 97 public String getCAPTURETIME() { 98 return CAPTURETIME; 99 } 100 101 public void setCAPTURETIME(String cAPTURETIME) { 102 CAPTURETIME = cAPTURETIME; 103 } 104 105 public String getMSISDN() { 106 return MSISDN; 107 } 108 109 public void setMSISDN(String mSISDN) { 110 MSISDN = mSISDN; 111 } 112 113 public String getIMSI() { 114 return IMSI; 115 } 116 117 public void setIMSI(String iMSI) { 118 IMSI = iMSI; 119 } 120 121 public String getIMEI() { 122 return IMEI; 123 } 124 125 public void setIMEI(String iMEI) { 126 IMEI = iMEI; 127 } 128 129 public String getAPN() { 130 return APN; 131 } 132 133 public void setAPN(String aPN) { 134 APN = aPN; 135 } 136 137 public String getUEIP() { 138 return UEIP; 139 } 140 141 public void setUEIP(String uEIP) { 142 UEIP = uEIP; 143 } 144 145 public String getSPIP() { 146 return SPIP; 147 } 148 149 public void setSPIP(String sPIP) { 150 SPIP = sPIP; 151 } 152 153 public int getUEPORT() { 154 return UEPORT; 155 } 156 157 public void setUEPORT(int uEPORT) { 158 UEPORT = uEPORT; 159 } 160 161 public int getSPPORT() { 162 return SPPORT; 163 } 164 165 public void setSPPORT(int sPPORT) { 166 SPPORT = sPPORT; 167 } 168 169 public String getUSERAGENT() { 170 return USERAGENT; 171 } 172 173 public void setUSERAGENT(String uSERAGENT) { 174 USERAGENT = uSERAGENT; 175 } 176 177 public String getURL() { 178 return URL; 179 } 180 181 public void setURL(String uRL) { 182 URL = uRL; 183 } 184 185 public String getHOST() { 186 return HOST; 187 } 188 189 public void setHOST(String hOST) { 190 HOST = hOST; 191 } 192 193 public String getCONTENTLEN() { 194 return CONTENTLEN; 195 } 196 197 public void setCONTENTLEN(String cONTENTLEN) { 198 CONTENTLEN = cONTENTLEN; 199 } 200 201 public String getCONTENTTYPE() { 202 return CONTENTTYPE; 203 } 204 205 public void setCONTENTTYPE(String cONTENTTYPE) { 206 CONTENTTYPE = cONTENTTYPE; 207 } 208 209 public boolean isBSKIP() { 210 return BSKIP; 211 } 212 213 public void setBSKIP(boolean bSKIP) { 214 BSKIP = bSKIP; 215 } 216 217 public String getREFERER() { 218 return REFERER; 219 } 220 221 public void setREFERER(String rEFERER) { 222 REFERER = rEFERER; 223 } 224 225 public long getHTTPSTATUS() { 226 return HTTPSTATUS; 227 } 228 229 public void setHTTPSTATUS(long hTTPSTATUS) { 230 HTTPSTATUS = hTTPSTATUS; 231 } 232 233 public long getRESPDELAY() { 234 return RESPDELAY; 235 } 236 237 public void setRESPDELAY(long rESPDELAY) { 238 RESPDELAY = rESPDELAY; 239 } 240 241 public String getHTTPACTION() { 242 return HTTPACTION; 243 } 244 245 public void setHTTPACTION(String hTTPACTION) { 246 HTTPACTION = hTTPACTION; 247 } 248 249 public long getDURATION() { 250 return DURATION; 251 } 252 253 public void setDURATION(long dURATION) { 254 DURATION = dURATION; 255 } 256 257 public long getFLOW() { 258 return FLOW; 259 } 260 261 public void setFLOW(long fLOW) { 262 FLOW = fLOW; 263 } 264 265 public long getUPFLOW() { 266 return UPFLOW; 267 } 268 269 public void setUPFLOW(long uPFLOW) { 270 UPFLOW = uPFLOW; 271 } 272 273 public long getDOWNFLOW() { 274 return DOWNFLOW; 275 } 276 277 public void setDOWNFLOW(long dOWNFLOW) { 278 DOWNFLOW = dOWNFLOW; 279 } 280 281 public String getSGSNIP() { 282 return SGSNIP; 283 } 284 285 public void setSGSNIP(String sGSNIP) { 286 SGSNIP = sGSNIP; 287 } 288 289 public String getGGSNIP() { 290 return GGSNIP; 291 } 292 293 public void setGGSNIP(String gGSNIP) { 294 GGSNIP = gGSNIP; 295 } 296 297 public long getLAC() { 298 return LAC; 299 } 300 301 public void setLAC(long lAC) { 302 LAC = lAC; 303 } 304 305 public String getCI() { 306 return CI; 307 } 308 309 public void setCI(String cI) { 310 CI = cI; 311 } 312 313 public String getRATTYPE() { 314 return RATTYPE; 315 } 316 317 public void setRATTYPE(String rATTYPE) { 318 RATTYPE = rATTYPE; 319 } 320 321 public String getSTOPTIME() { 322 return STOPTIME; 323 } 324 325 public void setSTOPTIME(String sTOPTIME) { 326 STOPTIME = sTOPTIME; 327 } 328 329 public String getPBIP() { 330 return PBIP; 331 } 332 333 public void setPBIP(String pBIP) { 334 PBIP = pBIP; 335 } 336 337 public long getPBID() { 338 return PBID; 339 } 340 341 public void setPBID(long pBID) { 342 PBID = pBID; 343 } 344 345 public static HttpLog parser(String line) { 346 if (line == null) { 347 return null; 348 } 349 String[] arr = line.split("','"); 350 HttpLog log = new HttpLog(); 351 log.setRawlog(line); 352 if (arr.length != 31) { 353 return log; 354 } 355 String start = arr[0]; 356 if (start != null) { 357 start = arr[0].substring(1); 358 } 359 log.setRECORD_TYPE(start); 360 log.setCAPTURETIME(arr[1]); 361 log.setMSISDN(arr[2]); 362 log.setIMSI(arr[3]); 363 log.setIMEI(arr[4]); 364 log.setAPN(arr[5]); 365 log.setUEIP(arr[6]); 366 log.setSPIP(arr[7]); 367 log.setUEPORT(Integer.parseInt(arr[8])); 368 log.setSPPORT(Integer.parseInt(arr[9])); 369 log.setUSERAGENT(arr[10]); 370 log.setURL(arr[11]); 371 log.setHOST(arr[12]); 372 log.setCONTENTLEN(arr[13]); 373 log.setCONTENTTYPE(arr[14]); 374 log.setBSKIP("1".equals(arr[15])); 375 log.setREFERER(arr[16]); 376 log.setHTTPSTATUS(Long.parseLong(arr[17])); 377 log.setRESPDELAY(Long.parseLong(arr[18])); 378 String action = arr[19]; 379 if ("5".equals(action)) { 380 action = "POST"; 381 } else if ("6".equals(action)) { 382 action = "GET"; 383 } 384 log.setHTTPACTION(action); 385 386 log.setDURATION(Long.parseLong(arr[20])); 387 log.setFLOW(Long.parseLong(arr[21])); 388 log.setUPFLOW(Long.parseLong(arr[22])); 389 log.setDOWNFLOW(Long.parseLong(arr[23])); 390 log.setSGSNIP(arr[24]); 391 log.setGGSNIP(arr[25]); 392 log.setLAC(Long.parseLong(arr[26])); 393 log.setCI(arr[27]); 394 String ratType = arr[28]; 395 if ("1".equals(ratType)) { 396 ratType = "2G"; 397 } else if ("2".equals(ratType)) { 398 ratType = "3G"; 399 } 400 log.setRATTYPE(ratType); 401 log.setSTOPTIME(arr[29]); 402 log.setPBIP(arr[30]); 403 String stop = arr[31]; 404 if (stop != null) { 405 stop = stop.substring(0, stop.length() - 1); 406 } 407 log.setPBID(Long.parseLong(stop)); 408 return log; 409 } 410 411 public static void save(Iterator<HttpLog> t) { 412 try { 413 Client client = EsUtils.getEsClient(); 414 BulkRequestBuilder bulk = client.prepareBulk(); 415 int index = 0; 416 while (t.hasNext()) { 417 HttpLog log = t.next(); 418 index++; 419 bulk.add(client.prepareIndex("logs_nuoxi", "http").setSource( 420 log.toJSON())); 421 if (index >= 500) { 422 BulkResponse bulkResponse = bulk.execute().actionGet(); 423 if (bulkResponse.hasFailures()) { 424 // 处理错误 425 System.out.println(bulkResponse.buildFailureMessage()); 426 } 427 index = 0; 428 } 429 } 430 if (index != 0) { 431 BulkResponse bulkResponse = bulk.execute().actionGet(); 432 if (bulkResponse.hasFailures()) { 433 // 处理错误 434 System.out.println(bulkResponse.buildFailureMessage()); 435 } 436 } 437 } catch (Exception e) { 438 e.printStackTrace(); 439 throw e; 440 } 441 // client.close(); 442 } 443 444 private Map<String, Object> toJSON() { 445 Field[] fields = this.getClass().getDeclaredFields(); 446 Map<String, Object> map = new HashMap<>(); 447 for (Field field : fields) { 448 field.setAccessible(true); 449 try { 450 map.put(field.getName().toLowerCase(), field.get(this)); 451 } catch (IllegalArgumentException | IllegalAccessException e) { 452 e.printStackTrace(); 453 } 454 } 455 return map; 456 } 457 }