scala_spark practice 1
/**
 * In a Scala application, main(args: Array[String]) is the entry point for the business logic.
 * import org.apache.spark.{SparkConf, SparkContext}
 * val sparkConf = new SparkConf().setAppName(appName)
 * val ssc = new StreamingContext(sparkConf, Seconds(batchNum))
 * val sc = ssc.sparkContext // if the job needs no StreamingContext, create a SparkContext directly: val sc = new SparkContext(sparkConf)
 *
 * val sqlContext = new HiveContext(sc) // HiveContext extends SQLContext; for plain SQL only: val sqlContext = new SQLContext(sc)
 * val result: DataFrame = sqlContext.sql(sql)
 * // Since Spark 2.0, HiveContext and SQLContext can be replaced by SparkSession:
 * // val result = SparkSession.builder().appName("test").config("key", "value").getOrCreate().sql(sql)
 *
 * In projects the rows are usually handled as JSON, e.g. for sending to Kafka, format conversion, or filtering:
 * val resultRdd = result.toJSON.rdd.map(x => {
 *   val json = new JSONObject(x)
 *   val computerIp = json.optString("ip", "")
 *   val rowKey = json.optString("name", "")
 *   ......
 *   val dataMap = new util.HashMap[String, String]()
 *   dataMap.put("computerip", computerIp)
 *   (rowKey, dataMap)
 * })
 * val hbaseRdd = resultRdd.filter(r => {
 *   r._1 != null && r._1.nonEmpty
 * }).map(line => {
 *   val put = new Put(Bytes.toBytes(line._1)) // build a Put keyed by the rowKey
 *   val key = line._2.keySet().iterator()     // walk the corresponding dataMap
 *   while (key.hasNext) {
 *     val k = key.next().toString
 *     put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(k), Bytes.toBytes(line._2.get(k)))
 *   }
 *   (new ImmutableBytesWritable(), put)
 * })
 * val hadoopconf = sc.hadoopConfiguration
 * val jobconf = new JobConf(hadoopconf)
 * jobconf.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
 * jobconf.setOutputValueClass(classOf[Result])
 * jobconf.setClass("mapreduce.job.outputformat.class",
 *   classOf[TableOutputFormat[ImmutableBytesWritable]],
 *   classOf[OutputFormat[ImmutableBytesWritable, Mutation]])
 * jobconf.set(TableOutputFormat.OUTPUT_TABLE, table)
 * hbaseRdd.saveAsNewAPIHadoopDataset(jobconf) // write to HBase
 *-----------------------------------------------------------------------------------------------------------
 * class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
 *   lazy val producer = createProducer()
 *   def send(topic: String, value: String): Unit = {
 *     producer.send(new ProducerRecord(topic, value))
 *   }
 * }
 * object KafkaSink {
 *   def apply(config: java.util.Map[String, Object]): KafkaSink = {
 *     val f = () => {
 *       val producer = new KafkaProducer[String, String](config)
 *       producer
 *     }
 *     new KafkaSink(f)
 *   }
 * }
 * val kafka = sc.broadcast(KafkaSink(Configs.kafka_props))
 * selectDatas.toJSON.rdd.foreach(x => {
 *   val json = new JSONObject(x)
 *   kafka.value.send(topic, json.toString)
 * }) // publish to the topic
 *-------------------------------------------------------------------
 * val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
 *   ssc, kafka_param, topic, StorageLevel.MEMORY_AND_DISK_SER).map(_._2) // topic: Map[topicName -> numPartitions]
 * kafkaStream.foreachRDD(rdd => {
 *   rdd.foreach(data => {
 *     // consume the Kafka messages here
 *   })
 * })
 */
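
Expanding on the Spark 2.0 note above, here is a minimal self-contained sketch of a SparkSession-based entry point. The app name, the Hive table `logs`, and the query are placeholder assumptions, not taken from the original project:

import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSqlEntry {
  def main(args: Array[String]): Unit = {
    // One SparkSession replaces SparkConf + SQLContext/HiveContext in Spark 2.x
    val spark = SparkSession.builder()
      .appName("test")
      .enableHiveSupport() // only needed when querying Hive tables, as HiveContext did
      .getOrCreate()

    // Same role as sqlContext.sql(sql) in the notes above
    val result: DataFrame = spark.sql("SELECT ip, name FROM logs") // hypothetical query

    // DataFrame -> RDD of JSON strings, the shape used for the Kafka/HBase steps above
    result.toJSON.rdd.take(10).foreach(println)

    spark.stop()
  }
}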
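
The HBase write above builds a JobConf (old mapred API) but saves with saveAsNewAPIHadoopDataset (new mapreduce API). A sketch of the same write configured entirely through the new API with Job.getInstance, assuming hbaseRdd is the RDD[(ImmutableBytesWritable, Put)] built above and that the HBase/ZooKeeper settings are already present on sc.hadoopConfiguration:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.client.{Mutation, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object HBaseWriter {
  def saveToHBase(sc: SparkContext, hbaseRdd: RDD[(ImmutableBytesWritable, Put)], table: String): Unit = {
    // Copy the Hadoop conf so the shared sc.hadoopConfiguration is not mutated
    val conf = new Configuration(sc.hadoopConfiguration)
    conf.set(TableOutputFormat.OUTPUT_TABLE, table)

    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Mutation]) // Put is a Mutation
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    hbaseRdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}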
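
The KafkaSink above is broadcast as a factory function because KafkaProducer itself is not serializable; the lazy val means each executor builds one producer on first use and then reuses it. A small usage sketch with foreachPartition (the broadcast variable, DataFrame, and topic name are stand-ins for the project's real values):

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.DataFrame

object KafkaPublisher {
  // kafkaSink: a Broadcast[KafkaSink] built as above, e.g. sc.broadcast(KafkaSink(Configs.kafka_props))
  def publish(df: DataFrame, kafkaSink: Broadcast[KafkaSink], topic: String): Unit = {
    df.toJSON.rdd.foreachPartition { partition =>
      // The producer is created lazily inside KafkaSink, once per executor
      partition.foreach(json => kafkaSink.value.send(topic, json))
    }
  }
}

Called for example as KafkaPublisher.publish(selectDatas, sc.broadcast(KafkaSink(Configs.kafka_props)), "some_topic"), with "some_topic" as a placeholder topic name.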
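
The consumer above uses the receiver-based KafkaUtils.createStream from the Kafka 0.8 integration. For newer clusters, the direct approach from spark-streaming-kafka-0-10 is the usual replacement; a minimal sketch, with broker address, group id, topic name, and batch interval as placeholder assumptions:

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object DirectKafkaConsumer {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("consume"), Seconds(10))

    // Placeholder consumer settings; point these at the real brokers and group
    val kafkaParams = Map[String, Object](
      "bootstrap.servers"  -> "broker1:9092",
      "key.deserializer"   -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id"           -> "spark-consumer",
      "auto.offset.reset"  -> "latest"
    )

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Seq("my_topic"), kafkaParams))

    stream.foreachRDD { rdd =>
      rdd.foreach { record =>
        // consume the Kafka message; record.value() is the JSON payload in this pipeline
        println(record.value())
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}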