通过 Spark Streaming 分析 URL 的数据
spark version 1.6.2
scala version 2.10.6
此代码参考官方例子：
自定义接收器
import java.io.BufferedReader
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import scala.io.{BufferedSource, Source}
import scala.util.control.NonFatal

/**
 * Custom Spark Streaming receiver that reads lines from a URL and pushes
 * each line into the streaming pipeline via `store`.
 *
 * @param URL the URL to read from
 */
class UrlReceiver(URL: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK) with Logging {

  override def onStart(): Unit = {
    // Receive on a dedicated thread so onStart() returns immediately,
    // as required by the Receiver contract.
    new Thread("创建一个线程") {
      override def run(): Unit = {
        url()
      }
    }.start()
  }

  override def onStop(): Unit = {
    // Nothing to clean up here: the read loop checks isStopped and exits on its own.
  }

  /**
   * Reads lines from URL until the receiver is stopped or EOF is reached,
   * storing each line into Spark Streaming, then asks Spark to restart the
   * receiver to reconnect.
   */
  private def url(): Unit = {
    var reader: BufferedReader = null
    try {
      val fileContent: BufferedSource = Source.fromURL(URL, "utf-8")
      reader = fileContent.bufferedReader()
      var input: String = reader.readLine()
      while (!isStopped && input != null) {
        // 推送数据给streaming (push the line to streaming)
        store(input)
        input = reader.readLine()
      }
      logInfo("停止接受")
      restart("尝试再次连接~~~~~~~~~~~~~")
    } catch {
      // NonFatal only: let fatal errors (OutOfMemoryError, InterruptedException, ...)
      // propagate instead of silently restarting. The original second catch case
      // was unreachable after `case t: Throwable` and has been removed.
      case NonFatal(t) =>
        restart(s"接受数据错误${t}")
    } finally {
      // Close the reader even when an exception occurs (the original leaked it
      // on any read failure because close() was only on the happy path).
      if (reader != null) reader.close()
    }
  }
}
写 streaming 程序
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Driver program: wires the custom UrlReceiver into a StreamingContext and
 * prints every received line.
 */
object urlstreaming {

  def main(args: Array[String]): Unit = {
    // NOTE: removed the original dead statement `new SparkConf SparkContext`,
    // which built and immediately discarded a SparkConf and had no effect.

    // 此处以百度为例 (Baidu used as an example source).
    val URL = "https://www.baidu.com"

    val conf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(s"${this.getClass.getSimpleName}")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // Drain in-flight data before shutting down on SIGTERM.
      .set("spark.streaming.stopGracefullyOnShutdown", "true")

    // 3-second micro-batch interval.
    val ssc = new StreamingContext(conf, Seconds(3))

    val value: ReceiverInputDStream[String] = ssc.receiverStream(new UrlReceiver(URL))
    value.foreachRDD { rdd =>
      rdd.foreach(println)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}