Spark 读写 SequoiaDB
Spark 如何读写 SequoiaDB?最近被客户问得比较多,在这里记录一下。
Spark 读取 SequoiaDB 数据:
package marketing

import com.sequoiadb.hadoop.io.BSONWritable
import com.sequoiadb.hadoop.mapreduce.SequoiadbInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Example Spark job that loads one SequoiaDB collection as an RDD through
 * `SequoiadbInputFormat` and prints every record on the driver.
 *
 * Created by joy on 2015/12/15.
 */
object Read {

  /**
   * Entry point.
   *
   * An explicit `main` is used instead of `extends App`: the Spark
   * documentation warns that subclasses of `scala.App` may not work
   * correctly because of their delayed field initialization.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("cgbdata")
      .setMaster("local")
      .registerKryoClasses(Array(classOf[BSONWritable]))
    val sc = new SparkContext(conf)

    try {
      // Settings handed to SequoiadbInputFormat via the Hadoop Configuration.
      // NOTE(review): the url looks like a comma-separated host:port list of
      // SequoiaDB nodes -- confirm against the connector documentation.
      val hadoopConfig = new Configuration()
      hadoopConfig.set("sequoiadb.input.url", "master:11810,slave1:11810,slave2:11810")
      hadoopConfig.set("sequoiadb.in.collectionspace", "default")
      hadoopConfig.set("sequoiadb.in.collection", "bar")

      val sdbRDD = sc.newAPIHadoopRDD[Object, BSONWritable, SequoiadbInputFormat](
        hadoopConfig,
        classOf[SequoiadbInputFormat],
        classOf[Object],
        classOf[BSONWritable])

      // Unwrap each value to its BSON object before collecting, then print on
      // the driver. collect() pulls the whole collection into driver memory,
      // so this is only suitable for small collections.
      sdbRDD.map(_._2.getBson).collect().foreach(println)
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}
Spark 写入 SequoiaDB 数据:
package marketing

import com.sequoiadb.hadoop.io.BSONWritable
import com.sequoiadb.hadoop.mapreduce.SequoiadbOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{NullWritable, IntWritable}
import org.apache.spark.{SparkConf, SparkContext}
import org.bson.BasicBSONObject
import org.bson.types.ObjectId

/**
 * Example Spark job that writes a single BSON record (`name = "gaoxing"`)
 * to a SequoiaDB collection through `SequoiadbOutputFormat`.
 */
object Save {

  /**
   * Entry point.
   *
   * An explicit `main` is used instead of `extends App`: the Spark
   * documentation warns that subclasses of `scala.App` may not work
   * correctly because of their delayed field initialization.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkconf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("save")
      .registerKryoClasses(Array(classOf[BSONWritable]))
    val sc = new SparkContext(sparkconf)

    try {
      // One (key, value) pair: a NullWritable key and the BSON document
      // wrapped in a BSONWritable value.
      val record = new BSONWritable(new BasicBSONObject("name", "gaoxing"))
      val data = sc.parallelize(List((NullWritable.get(), record)))

      // Settings handed to SequoiadbOutputFormat via the Hadoop Configuration.
      val config = new Configuration()
      config.set("sequoiadb.output.url", "master:11810")
      config.set("sequoiadb.out.collectionspace", "foo")
      config.set("sequoiadb.out.collection", "bar")

      // The path argument is left empty.
      // NOTE(review): presumably the output format takes its destination
      // from the configuration above rather than from the path -- confirm.
      data.saveAsNewAPIHadoopFile(
        "",
        classOf[NullWritable],
        classOf[BSONWritable],
        classOf[SequoiadbOutputFormat],
        config)
    } finally {
      // Always release the SparkContext, even when the write fails.
      sc.stop()
    }
  }
}