Spark Operations on HBase
1. Spark on HBase - read
/**
 * Build an HBase configuration.
 *
 * @param quorum ZooKeeper quorum (comma-separated host list)
 * @param port   ZooKeeper client port
 * @return       the HBase Configuration
 */
def getHbaseConf(quorum: String, port: String): Configuration = {
  // Create the base HBase configuration
  val conf = HBaseConfiguration.create()
  // Set the ZooKeeper quorum address
  conf.set("hbase.zookeeper.quorum", quorum)
  // Set the ZooKeeper client port
  conf.set("hbase.zookeeper.property.clientPort", port)
  // The table to read is set by the caller via TableInputFormat.INPUT_TABLE
  conf
}
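The write example in section 2 also calls HbaseUtils.getTable, which is not shown in the original. A minimal sketch of that helper, assuming the standard HBase client API (ConnectionFactory, TableName, Table):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{ConnectionFactory, Table}

def getTable(conf: Configuration, tableName: String): Table = {
  // Open a connection to the cluster described by the configuration above
  val connection = ConnectionFactory.createConnection(conf)
  // Resolve the named table; the caller is responsible for closing the handle
  connection.getTable(TableName.valueOf(tableName))
}

In a real job the Connection would normally be shared and closed explicitly; this sketch omits that for brevity.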
def main(args: Array[String]): Unit = {
  // Create the Spark context. setMaster("local[2]") runs the job locally without a Spark
  // cluster; when submitting to the cluster, use yarn mode instead.
  val sparkConf = new SparkConf().setAppName("Spark-Hbase-Read").setMaster("local[2]")
  val sc = new SparkContext(sparkConf)
  // Build the HBase configuration and set the table to read
  val hbaseConf = HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181")
  hbaseConf.set(TableInputFormat.INPUT_TABLE, "movie_wordcount")
  // Turn the HBase scan into an RDD of (row key, Result) pairs
  val hbaseRdd = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result]).cache()

  hbaseRdd.map(x => {
    val result = x._2
    val row = Bytes.toString(result.getRow)
    val word = Bytes.toString(result.getValue("word".getBytes(), "word".getBytes()))
    // The count was written as an int, so it must be read with Bytes.toInt,
    // otherwise the output is garbled
    val count = Bytes.toInt(result.getValue("word".getBytes(), "count".getBytes()))

    println(row, word, count)
    (row, word, count)
  }).saveAsTextFile("/wordcount/output3")
}
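For reference, a sketch of the imports this read job relies on (package names may differ slightly depending on the HBase version in use):

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}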
Package the job and submit it to the cluster with: ./bin/spark-submit --class com.xxx.xx.scala.hbase.SparkHbaseR ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar
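To actually run on YARN rather than locally, one option is to drop setMaster from the code and pass the master on the command line, for example (same class and jar path as above):

./bin/spark-submit --master yarn --class com.xxx.xx.scala.hbase.SparkHbaseR ./localjar/sc-1.0-SNAPSHOT-jar-with-dependencies.jar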
saveAsTextFile writes to HDFS by default, so the result must be checked on HDFS; the output path is a directory.
List it with: hadoop fs -ls /wordcount/output3
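The directory typically holds one part-* file per partition plus a _SUCCESS marker. Print the contents with, for example: hadoop fs -cat /wordcount/output3/part-*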
2. Spark on HBase - write
object SparkHbaseW {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark-Hbase-Write").setMaster("yarn")
    val sc = new SparkContext(conf)
    val file = sc.textFile("/spark/movie/wordcount/pinglun.txt")
    // Tokenize each line with the Chinese word segmenter, then count words
    val rdd = file.flatMap(line => {
      getWords(line, filter(new Array[String](0)))
    }).map(x => (x, 1)).reduceByKey(_ + _)
    // Also save a copy of the result to HDFS
    rdd.saveAsTextFile(args(0))
    // Insertion method one: one Put per record
    rdd.foreachPartition(x => {
      x.foreach(y => {
        // Write each (word, count) pair into HBase
        val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
        val family = Bytes.toBytes("word")
        val wordColum = Bytes.toBytes("word")
        val countColum = Bytes.toBytes("count")
        val uuid = UUID.randomUUID()
        val wordPut = new Put(Bytes.toBytes(uuid.toString))
        wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
        wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
        table.put(wordPut)
      })
    })
    // Insertion method two: batch insert (one table handle and one list of Puts per partition)
    // rdd.foreachPartition(x => {
    //   val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
    //   val puts = new java.util.LinkedList[Put]()
    //   x.foreach(y => {
    //     // Build one Put per (word, count) pair
    //     val family = Bytes.toBytes("word")
    //     val wordColum = Bytes.toBytes("word")
    //     val countColum = Bytes.toBytes("count")
    //     val uuid = UUID.randomUUID()
    //     val wordPut = new Put(Bytes.toBytes(uuid.toString))
    //     wordPut.addColumn(family, wordColum, Bytes.toBytes(y._1))
    //     wordPut.addColumn(family, countColum, Bytes.toBytes(y._2))
    //     puts.add(wordPut)
    //   })
    //   table.put(puts)
    // })
  }
  /**
   * Build the stop-word filter for the segmenter.
   *
   * @param stopWords extra stop words to add
   * @return the configured StopRecognition filter
   */
  def filter(stopWords: Array[String]): StopRecognition = {
    // add stop words
    val filter = new StopRecognition
    filter.insertStopNatures("w") // filter punctuation
    filter.insertStopNatures("m") // filter the "m" (numeral) nature
    filter.insertStopNatures("null") // filter null
    filter.insertStopNatures("<br />") // filter <br />
    filter.insertStopRegexes("^[a-zA-Z]{1,}") // filter English words
    filter.insertStopRegexes("^[0-9]+") // filter numbers
    filter.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+") // filter anything that is not a letter, digit, or Chinese character
    filter.insertStopRegexes("\t") // filter tabs
    for (x <- stopWords) {
      filter.insertStopWords(x)
    }
    filter
  }
  /**
   * Tokenize a line of text.
   *
   * @param text   the text to tokenize
   * @param filter the stop-word filter
   * @return the extracted words
   */
  def getWords(text: String, filter: StopRecognition): ArrayBuffer[String] = {
    val words = new mutable.ArrayBuffer[String]()
    val terms = ToAnalysis.parse(text).recognition(filter).getTerms
    for (i <- 0 until terms.size()) {
      val word = terms.get(i).getName
      // keep only non-empty tokens
      if (word.length > 0) {
        words += word
      }
    }
    words
  }
}
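Neither insertion method above closes the HBase table handle. A minimal sketch of a per-partition batch write with cleanup, reusing the rdd and HbaseUtils helpers from the example above (a variation on method two, not part of the original code):

import java.util.UUID
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes

rdd.foreachPartition { records =>
  // One table handle per partition, as in method two
  val table = HbaseUtils.getTable(HbaseUtils.getHbaseConf("centos01,centos02,centos03", "2181"), "movie_wordcount")
  try {
    val family = Bytes.toBytes("word")
    // Build one Put per (word, count) pair, keyed by a random UUID as above
    val puts = records.map { case (word, count) =>
      val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
      put.addColumn(family, Bytes.toBytes("word"), Bytes.toBytes(word))
      put.addColumn(family, Bytes.toBytes("count"), Bytes.toBytes(count))
      put
    }.toList
    // Single batched put for the whole partition instead of one RPC per record
    table.put(puts.asJava)
  } finally {
    // Always release the table handle
    table.close()
  }
}

Whether the underlying Connection opened inside getTable should also be closed or shared per executor depends on how HbaseUtils manages it.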