Spark 使用ansj进行中文分词

在Spark中使用ansj分词,首先需要将ansj_seg-5.1.1.jar和nlp-lang-1.7.2.jar加入工程的依赖。

ansj源码github:https://github.com/NLPchina/ansj_seg

ansj下载链接:https://oss.sonatype.org/content/repositories/releases/org/ansj/ansj_seg/

nlp-lang下载链接:https://oss.sonatype.org/content/repositories/releases/org/nlpcn/nlp-lang/


package com.spark.test

import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

import scala.io.Source
import org.ansj.splitWord.analysis.DicAnalysis
import org.ansj.library.DicLibrary
import org.ansj.recognition.impl.StopRecognition
import org.nlpcn.commons.lang.tire.library
import java.util.Arrays

/**
 * Spark job that labels tab-separated review records by score and segments the
 * review text into words with ansj's `DicAnalysis`.
 *
 * Each input line is expected to hold at least eight tab-separated fields in the
 * order of the [[Participle.Movies]] constructor. The segmented text (terms joined
 * by `|`) is written with `saveAsTextFile`.
 */
object Participle {

  /** One review record; every column is kept as the raw string read from the input. */
  case class Movies(productId: String, userId: String, profileName: String,
                    helpfulness: String, score: String, time: String, summary: String, text: String)

  /** Number of tab-separated columns a usable input line must provide. */
  private val FieldCount = 8

  /** Zero-based position of the score column. */
  private val ScoreIndex = 4

  /** Input path used when no first command-line argument is given. */
  private val DefaultInputPath = "/Users/yangyang/Desktop/b.txt"

  /** Output directory used when no second command-line argument is given. */
  private val DefaultOutputPath = "/Users/XXXXX/Desktop/c"

  /**
   * Parses one input line into a labelled record.
   *
   * The score column is compared numerically: a score of 4.0 drops the record,
   * a score of at most 3.0 is relabelled "0", a score of 5.0 is relabelled "1".
   * Any other score (including one that is not a number) is kept unchanged.
   *
   * @param line one raw tab-separated line
   * @return the record, or `None` when the line has fewer than eight fields or
   *         its score is 4.0
   */
  private def parseRecord(line: String): Option[Movies] = {
    // limit = -1 keeps trailing empty columns, so an empty review text still counts as a field.
    val fields = line.split("\t", -1)
    if (fields.length < FieldCount) {
      None
    } else {
      val rawScore = fields(ScoreIndex)
      val label: Option[String] = scala.util.Try(rawScore.trim.toDouble).toOption match {
        case Some(s) if s == 4.0 => None
        case Some(s) if s <= 3.0 => Some("0")
        case Some(s) if s == 5.0 => Some("1")
        case _                   => Some(rawScore)
      }
      label.map { l =>
        Movies(fields(0), fields(1), fields(2), fields(3), l, fields(5), fields(6), fields(7))
      }
    }
  }

  /**
   * Entry point.
   *
   * @param args optional: `args(0)` is the input file path, `args(1)` the output
   *             directory; the original hard-coded paths are used when absent
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val inputPath = args.lift(0).getOrElse(DefaultInputPath)
    val outputPath = args.lift(1).getOrElse(DefaultOutputPath)

    // Segmentation setup: terms removed from the tokenizer output.
    val stop = new StopRecognition()
    stop.insertStopNatures("w")    // punctuation
    stop.insertStopNatures("m")    // nature "m"
    stop.insertStopNatures("null") // terms with the "null" nature
    // Literal tokens are stop words, not part-of-speech natures.
    stop.insertStopWords("<br />", ":", "'")

    val spark = SparkSession.builder().master("local[4]").appName("prepare").getOrCreate()
    try {
      import spark.implicits._

      // Build the training data set.
      val trains = spark.sparkContext
        .textFile(inputPath)
        .flatMap(line => parseRecord(line).toList)
        .toDF()
      trains.createOrReplaceTempView("train")

      // Read the column value itself; Row.toString would wrap the text in "[...]".
      val texts = spark.sql("select text from train").rdd.map(row => row.getString(0))

      // Segment each review and drop the stop terms configured above.
      texts
        .map(text => DicAnalysis.parse(text).recognition(stop).toStringWithOutNature("|"))
        .saveAsTextFile(outputPath)
    } finally {
      // Release the session even when the job fails.
      spark.close()
    }
  }
}
部分参考:http://m.blog.csdn.net/ozinco/article/details/70184347
posted @ 2017-12-08 18:44  就是琦哥  阅读(4229)  评论(1)  编辑  收藏  举报