scala_spark practice 1
/**
 * In a Scala application, main(args: Array[String]) is the entry point for the business logic.
 * import org.apache.spark.{SparkConf, SparkContext}
 * val sparkConf = new SparkConf().setAppName(appName)
 * val ssc = new StreamingContext(sparkConf, Seconds(batchNum))
 * val sc = ssc.sparkContext // if the job needs no StreamingContext, create a SparkContext directly: val sc = new SparkContext(sparkConf)
 *
 * val sqlContext = new HiveContext(sc) // HiveContext extends SQLContext; for plain SQL only: val sqlContext = new SQLContext(sc)
 * val result: DataFrame = sqlContext.sql(sql)
 * // Since Spark 2.0, HiveContext and SQLContext can be replaced by SparkSession:
 * // val result = SparkSession.builder().appName("test").config("key", "value").getOrCreate().sql(sql)
 *
 * In projects the rows are usually handled as JSON, e.g. for sending to Kafka, format conversion, or filtering:
 * val resultRdd = result.toJSON.rdd.map(x => {
 *   val json = new JSONObject(x)
 *   val computerIp = json.optString("ip", "")
 *   val rowKey = json.optString("name", "")
 *   ......
 *   val dataMap = new util.HashMap[String, String]()
 *   dataMap.put("computerip", computerIp)
 *   (rowKey, dataMap)
 * })
 * val hbaseRdd = resultRdd.filter(r => {
 *   r._1 != null && r._1.nonEmpty
 * }).map(line => {
 *   val put = new Put(Bytes.toBytes(line._1)) // build a Put keyed by the rowKey
 *   val key = line._2.keySet().iterator()     // walk the corresponding dataMap
 *   while (key.hasNext) {
 *     val k = key.next().toString
 *     put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(k), Bytes.toBytes(line._2.get(k)))
 *   }
 *   (new ImmutableBytesWritable(), put)
 * })
 * val hadoopconf = sc.hadoopConfiguration
 * val jobconf = new JobConf(hadoopconf)
 * jobconf.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
 * jobconf.setOutputValueClass(classOf[Result])
 * jobconf.setClass("mapreduce.job.outputformat.class",
 *   classOf[TableOutputFormat[ImmutableBytesWritable]],
 *   classOf[OutputFormat[ImmutableBytesWritable, Mutation]])
 * jobconf.set(TableOutputFormat.OUTPUT_TABLE, table)
 * hbaseRdd.saveAsNewAPIHadoopDataset(jobconf) // write to HBase
 *-----------------------------------------------------------------------------------------------------------
 * class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
 *   lazy val producer = createProducer()
 *   def send(topic: String, value: String): Unit = {
 *     producer.send(new ProducerRecord(topic, value))
 *   }
 * }
 * object KafkaSink {
 *   def apply(config: java.util.Map[String, Object]): KafkaSink = {
 *     val f = () => {
 *       val producer = new KafkaProducer[String, String](config)
 *       producer
 *     }
 *     new KafkaSink(f)
 *   }
 * }
 * val kafka = sc.broadcast(KafkaSink(Configs.kafka_props))
 * selectDatas.toJSON.rdd.foreach(x => {
 *   val json = new JSONObject(x)
 *   kafka.value.send(topic, json.toString)
 * }) // publish to the topic
 *-------------------------------------------------------------------
 * val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
 *   ssc, kafka_param, topic, StorageLevel.MEMORY_AND_DISK_SER).map(_._2) // topic: Map[topicName -> numPartitions]
 * kafkaStream.foreachRDD(rdd => {
 *   rdd.foreach(data => {
 *     // consume the Kafka messages here
 *   })
 * })
 */
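
Expanding on the Spark 2.0 note above, here is a minimal self-contained sketch of a SparkSession-based entry point. The app name, the Hive table `logs`, and the query are placeholder assumptions, not taken from the original project:

import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSqlEntry {
  def main(args: Array[String]): Unit = {
    // One SparkSession replaces SparkConf + SQLContext/HiveContext in Spark 2.x
    val spark = SparkSession.builder()
      .appName("test")
      .enableHiveSupport() // only needed when querying Hive tables, as HiveContext did
      .getOrCreate()

    // Same role as sqlContext.sql(sql) in the notes above
    val result: DataFrame = spark.sql("SELECT ip, name FROM logs") // hypothetical query

    // DataFrame -> RDD of JSON strings, the shape used for the Kafka/HBase steps above
    result.toJSON.rdd.take(10).foreach(println)

    spark.stop()
  }
}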
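
The HBase write above builds a JobConf (old mapred API) but saves with saveAsNewAPIHadoopDataset (new mapreduce API). A sketch of the same write configured entirely through the new API with Job.getInstance, assuming hbaseRdd is the RDD[(ImmutableBytesWritable, Put)] built above and that the HBase/ZooKeeper settings are already present on sc.hadoopConfiguration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Mutation, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object HBaseWriter {
  def saveToHBase(sc: SparkContext, hbaseRdd: RDD[(ImmutableBytesWritable, Put)], table: String): Unit = {
    // Copy the Hadoop conf so the shared sc.hadoopConfiguration is not mutated
    val conf = new Configuration(sc.hadoopConfiguration)
    conf.set(TableOutputFormat.OUTPUT_TABLE, table)

    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Mutation]) // Put is a Mutation
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    hbaseRdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}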
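
The KafkaSink above is broadcast as a factory function because KafkaProducer itself is not serializable; the lazy val means each executor builds one producer on first use and then reuses it. A small usage sketch with foreachPartition (the broadcast variable, DataFrame, and topic name are stand-ins for the project's real values):

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.DataFrame

object KafkaPublisher {
  // kafkaSink: a Broadcast[KafkaSink] built as above, e.g. sc.broadcast(KafkaSink(Configs.kafka_props))
  def publish(df: DataFrame, kafkaSink: Broadcast[KafkaSink], topic: String): Unit = {
    df.toJSON.rdd.foreachPartition { partition =>
      // The producer is created lazily inside KafkaSink, once per executor
      partition.foreach(json => kafkaSink.value.send(topic, json))
    }
  }
}

Called for example as KafkaPublisher.publish(selectDatas, sc.broadcast(KafkaSink(Configs.kafka_props)), "some_topic"), with "some_topic" as a placeholder topic name.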
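
The consumer above uses the receiver-based KafkaUtils.createStream from the Kafka 0.8 integration. For newer clusters, the direct approach from spark-streaming-kafka-0-10 is the usual replacement; a minimal sketch, with broker address, group id, topic name, and batch interval as placeholder assumptions:

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object DirectKafkaConsumer {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("consume"), Seconds(10))

    // Placeholder consumer settings; point these at the real brokers and group
    val kafkaParams = Map[String, Object](
      "bootstrap.servers"  -> "broker1:9092",
      "key.deserializer"   -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id"           -> "spark-consumer",
      "auto.offset.reset"  -> "latest"
    )

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Seq("my_topic"), kafkaParams))

    stream.foreachRDD { rdd =>
      rdd.foreach { record =>
        // consume the Kafka message; record.value() is the JSON payload in this pipeline
        println(record.value())
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}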