编程模型:数据输出层
结果保存到HDFS中
import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapred.{SequenceFileOutputFormat, TextOutputFormat} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} /** * WordCount程序,Spark Streaming消费TCP Server发过来的实时数据的例子: * * 1、在master服务器上启动一个Netcat server * `$ nc -lk 9998` (如果nc命令无效的话,我们可以用yum install -y nc来安装nc) * * 2、用下面的命令在在集群中将Spark Streaming应用跑起来 * spark-submit --class com.twq.streaming.output.NetworkWordCountHDFS \ * --master spark://master:7077 \ * --deploy-mode client \ * --driver-memory 512m \ * --executor-memory 512m \ * --total-executor-cores 4 \ * --executor-cores 2 \ * /home/hadoop-twq/spark-course/streaming/spark-streaming-basic-1.0-SNAPSHOT.jar */ object NetworkWordCountHDFS { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("NetworkWordCountHDFS") val sc = new SparkContext(sparkConf) // Create the context with a 1 second batch size val ssc = new StreamingContext(sc, Seconds(5)) //创建一个接收器(ReceiverInputDStream),这个接收器接收一台机器上的某个端口通过socket发送过来的数据并处理 val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER) //处理的逻辑,就是简单的进行word count val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) //将结果输出 ///Option[Int]) 改称一个分区 wordCounts.repartition(1).mapPartitions { iter => val text = new Text() iter.map { x => text.set(x.toString) (NullWritable.get(), text) } } saveAsHadoopFiles[TextOutputFormat[NullWritable, Text]]( "hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/hadoop/wordcount", "-hadoop") wordCounts.repartition(1).map(x => { val text = new Text() text.set(x.toString()) (NullWritable.get(), text) }).saveAsHadoopFiles[SequenceFileOutputFormat[NullWritable, Text]]( "hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/sequence/wordcount", "-hadoop")// 文件名字(非目录名字),以—hadoop为后缀,中间会有个时间戳 wordCounts.repartition(1).saveAsTextFiles("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/text/wordcount") ////saveAsTextFiles 简介方法 wordCounts.repartition(1).saveAsObjectFiles("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/object/wordcount") //启动Streaming处理流 ssc.start() ssc.stop(false) //结果验证 sc.textFile("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/hadoop/wordcount*") // 每隔一段时间生成一定的小文件,coalesce(1) 将所有的小文件合并成一个大文件 saveAsTextFile(" 文件目录") 保存到一定的目录中 sc.textFile("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/hadoop/wordcount*").coalesce(1).saveAsTextFile("") sc.sequenceFile("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/sequence/wordcount*", classOf[NullWritable], classOf[Text]).map(_._2.toString).collect() sc.objectFile[(String, Int)]("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/object/wordcount*").collect sc.textFile("hdfs://master:9999/user/hadoop-twq/spark-course/streaming/data/text/wordcount*").collect //等待Streaming程序终止 ssc.awaitTermination() } }
将数据保存到MySQL中
import java.sql.DriverManager import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} /** * WordCount程序,Spark Streaming消费TCP Server发过来的实时数据的例子: * * 1、在master服务器上启动一个Netcat server * `$ nc -lk 9998` (如果nc命令无效的话,我们可以用yum install -y nc来安装nc) * * * create table wordcount(ts bigint, word varchar(50), count int); * * spark-shell --total-executor-cores 4 --executor-cores 2 --master spark://master:7077 --jars mysql-connector-java-5.1.44-bin.jar, * c3p0-0.9.1.2.jar,spark-streaming-basic-1.0-SNAPSHOT.jar * * */ object NetworkWordCountForeachRDD { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("NetworkWordCountForeachRDD") val sc = new SparkContext(sparkConf) // Create the context with a 1 second batch size val ssc = new StreamingContext(sc, Seconds(5)) //创建一个接收器(ReceiverInputDStream),这个接收器接收一台机器上的某个端口通过socket发送过来的数据并处理 val lines = ssc.socketTextStream("master", 9998, StorageLevel.MEMORY_AND_DISK_SER) //处理的逻辑,就是简单的进行word count val words = lines.flatMap(_.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) //将结果保存到Mysql(错误代码:Connection不能序列化 Driver端执行的代码不能在execute上 ) wordCounts.foreachRDD { (rdd, time) => Class.forName("com.mysql.jdbc.Driver") val conn = DriverManager.getConnection("jdbc:mysql://master:3306/test", "root", "root") val statement = conn.prepareStatement(s"insert into wordcount(ts, word, count) values (?, ?, ?)") rdd.foreach { record => statement.setLong(1, time.milliseconds) statement.setString(2, record._1) statement.setInt(3, record._2) statement.execute() } statement.close() conn.close() } //启动Streaming处理流 ssc.start() ssc.stop(false) //将结果保存到Mysql(优化代码) wordCounts.foreachRDD { (rdd, time) => rdd.foreachPartition { partitionRecords => val conn = ConnectionPool.getConnection conn.setAutoCommit(false) val statement = conn.prepareStatement(s"insert into wordcount(ts, word, count) values (?, ?, ?)") partitionRecords.zipWithIndex.foreach { case ((word, count), index) => statement.setLong(1, time.milliseconds) statement.setString(2, word) statement.setInt(3, count) statement.addBatch() if (index != 0 && index % 500 == 0) { statement.executeBatch() conn.commit() } } statement.executeBatch() statement.close() conn.commit() conn.setAutoCommit(true) ConnectionPool.returnConnection(conn) } } //等待Streaming程序终止 ssc.awaitTermination() } }