Spark Streaming: receiving data over the network and performing WordCount
package iie.udps.example.operator.spark;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;

import com.google.common.base.Optional;
import com.google.common.io.Files;

/**
 * To run this on your local machine, first start a Netcat server:
 *
 *   $ nc -lk 9999
 *
 * then submit the example:
 *
 *   spark-submit --class iie.udps.example.operator.spark.JavaNetworkWordCount
 *     --master local /home/xdf/test2.jar localhost 9999 /user/test/checkpoint/
 *     /home/xdf/outputFile /home/xdf/totalOutputFile
 *
 * This example receives data produced by the Netcat server, performs a
 * WordCount on it, and appends both the current (per-batch) counts and the
 * accumulated (historical) counts to local files.
 */
public final class JavaNetworkWordCount {

    @SuppressWarnings("serial")
    public static void main(String[] args) {
        if (args.length != 5) {
            System.err.println("Your arguments were " + Arrays.asList(args));
            System.err.println(
                "Usage: JavaNetworkWordCount <hostname> <port> <checkpoint-directory>\n"
                + "  <output-file> <total-output-file>. <hostname> and <port> describe the TCP server that Spark\n"
                + "  Streaming would connect to receive data. <checkpoint-directory> is a directory on an\n"
                + "  HDFS-compatible file system to which checkpoint data is written. <output-file> is the file\n"
                + "  to which the word counts will be appended.\n"
                + "  <total-output-file> is the file to which the total word counts will be appended.\n"
                + "\n"
                + "In local mode, <master> should be 'local[n]' with n > 1\n"
                + "Both <checkpoint-directory> and <output-file> and <total-output-file> must be absolute paths");
            System.exit(1);
        }

        final String checkpointDirectory = args[2]; // checkpoint directory
        final String curOutputPath = args[3];       // output path for the current (per-batch) WordCount results
        final String totalOutputPath = args[4];     // output path for the accumulated (total) WordCount results
        System.out.println("Creating new context");

        final File curOutputFile = new File(curOutputPath);
        if (curOutputFile.exists()) {
            curOutputFile.delete();
        }
        final File totalOutputFile = new File(totalOutputPath);
        if (totalOutputFile.exists()) {
            totalOutputFile.delete();
        }

        // Create a StreamingContext with a 1-second batch interval
        SparkConf conf = new SparkConf().setAppName("NetworkWordCount");
        final JavaStreamingContext jssc = new JavaStreamingContext(conf,
                new Duration(1000));
        // updateStateByKey below requires checkpointing to be enabled
        jssc.checkpoint(checkpointDirectory);

        // Create a DStream that will connect to hostname:port, like localhost:9999
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream(args[0],
                Integer.parseInt(args[1]));

        // Split each line into words
        JavaDStream<String> words = lines
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String x) {
                        return Arrays.asList(x.split(" "));
                    }
                });

        // Count each word in each batch
        JavaPairDStream<String, Integer> pairs = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) throws Exception {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                });
        JavaPairDStream<String, Integer> runningCounts = pairs
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) throws Exception {
                        return i1 + i2;
                    }
                });

        // Append the per-batch counts to the current-output file
        runningCounts
                .foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
                    @Override
                    public Void call(JavaPairRDD<String, Integer> rdd, Time time)
                            throws IOException {
                        String counts = "Counts at time " + time + " " + rdd.collect();
                        System.out.println(counts);
                        System.out.println("Appending to " + curOutputFile.getAbsolutePath());
                        Files.append(counts + "\n", curOutputFile, Charset.defaultCharset());
                        return null;
                    }
                });

        // Fold each batch's counts for a key into the running total kept in state
        Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction =
                new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
                    @Override
                    public Optional<Integer> call(List<Integer> values, Optional<Integer> state) {
                        Integer newSum = state.or(0);
                        for (Integer i : values) {
                            newSum += i;
                        }
                        return Optional.of(newSum);
                    }
                };

        JavaPairDStream<String, Integer> totalCounts = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                }).updateStateByKey(updateFunction);

        // Append the accumulated counts to the total-output file
        totalCounts
                .foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
                    @Override
                    public Void call(JavaPairRDD<String, Integer> rdd, Time time)
                            throws IOException {
                        String counts = "Counts at time " + time + " " + rdd.collect();
                        System.out.println(counts);
                        System.out.println("Appending to " + totalOutputFile.getAbsolutePath());
                        Files.append(counts + "\n", totalOutputFile, Charset.defaultCharset());
                        return null;
                    }
                });

        jssc.start();            // Start the computation
        jssc.awaitTermination(); // Wait for the computation to terminate
        System.exit(0);
    }
}
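A note on the running total: updateStateByKey calls the update function once per key per batch, passing that batch's new values for the key together with the key's previous state. The fold can be exercised outside Spark as a minimal sketch; the class name UpdateFunctionSketch and the hand-fed batches below are illustrative assumptions, not part of the job, but the logic and the Guava Optional API are the same as in the listing above:

import java.util.Arrays;
import java.util.List;

import com.google.common.base.Optional;

public final class UpdateFunctionSketch {

    // Same fold as the job's updateFunction: sum the batch's new counts onto the prior state
    static Optional<Integer> update(List<Integer> values, Optional<Integer> state) {
        Integer newSum = state.or(0); // 0 when the key has no prior state
        for (Integer i : values) {
            newSum += i;
        }
        return Optional.of(newSum);
    }

    public static void main(String[] args) {
        // Batch 1: a word appears 3 times; no prior state exists for the key yet
        Optional<Integer> s1 = update(Arrays.asList(1, 1, 1), Optional.<Integer>absent());
        System.out.println(s1.get()); // prints 3

        // Batch 2: the word appears twice more; the prior state is carried in
        Optional<Integer> s2 = update(Arrays.asList(1, 1), s1);
        System.out.println(s2.get()); // prints 5
    }
}

This is why jssc.checkpoint(...) is mandatory here: the per-key state lives across batches, and Spark Streaming periodically checkpoints it to the given directory so the lineage does not grow without bound.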