Spark中文文本分析建模
使用朴素贝叶斯模型建模
建模过程主要是先把文本转化成向量,然后再作分析。
数据格式:
0,善良 美丽
1,丑陋 阴险 卑鄙
0,温和
.......
注:每行逗号前面是给文章贴的类别标签,逗号后面是文章的分词结果(词与词之间用空格分隔)。分词方法可以参考其他关于分词的文章,后面我也会写关于分词的文章。
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
/** Empty companion class of the `CreatModel` object; it declares no members. */
class CreatModel
/**
 * Trains, saves and evaluates a Naive Bayes text classifier using Spark ML.
 *
 * Each input line is expected to look like `label,word word word`: a numeric
 * category label, a comma, then the already-segmented text whose tokens are
 * separated by whitespace. The text is turned into TF-IDF vectors
 * (Tokenizer -> HashingTF -> IDF) before it is fed to the classifier.
 */
object CreatModel {

  /** One parsed input line: the category label and the space-separated tokens. */
  case class RawDataRecord(category: String, text: String)

  /** Input path used when no command-line argument is given. */
  private val DefaultInputPath = "D:\\decstop\\testFiles\\sougou"

  /** Model output path used when no second command-line argument is given. */
  private val DefaultModelPath = "resoult"

  /**
   * Parses one `label,text` line.
   *
   * Only the first comma separates label and text, so commas inside the text
   * are kept. Lines without a comma, with empty text, or with a label that is
   * not a number are skipped instead of failing the whole job later on.
   *
   * @param line raw input line
   * @return the parsed record, or `None` when the line is malformed
   */
  private def parseLine(line: String): Option[RawDataRecord] =
    line.split(",", 2) match {
      case Array(label, text)
          if text.trim.nonEmpty && scala.util.Try(label.toDouble).isSuccess =>
        Some(RawDataRecord(label, text))
      case _ => None
    }

  /**
   * Program entry point.
   *
   * @param args optional arguments: `args(0)` is the input file path and
   *             `args(1)` is the directory the trained model is saved to;
   *             both fall back to the built-in defaults when omitted
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val modelPath = args.drop(1).headOption.getOrElse(DefaultModelPath)

    val config = new SparkConf().setAppName("createModel").setMaster("local[4]")
    // The session creates and owns the SparkContext; no separate context is needed.
    val spark = SparkSession.builder()
      .config(config)
      .config("spark.sql.warehouse.dir", "warehouse/dir")
      .getOrCreate()
    import spark.implicits._

    // Converts (category, features) rows into the label/features layout used for training.
    // The vector is passed through unchanged so sparse TF-IDF vectors are not copied
    // into dense ones.
    def toLabeledPoints(df: org.apache.spark.sql.DataFrame) =
      df.select($"category", $"features").map {
        case Row(label: String, features: Vector) =>
          LabeledPoint(label.toDouble, features)
      }

    try {
      // Load the data and split it into training (70%) and test (30%) sets.
      val Array(srcDF, testDF) = spark.sparkContext
        .textFile(inputPath)
        .flatMap(line => parseLine(line).toList)
        .toDF()
        .randomSplit(Array(0.7, 0.3))

      // Tokenize. The tokenizer holds no fitted state, so one instance serves both splits.
      val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
      val wordsData = tokenizer.transform(srcDF)
      wordsData.show(false)
      val testwordsData = tokenizer.transform(testDF)

      // Term frequencies via feature hashing (also stateless, shared by both splits).
      val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(100)
      val featurizedData = hashingTF.transform(wordsData)
      val testfeaturizedData = hashingTF.transform(testwordsData)

      // Inverse document frequency. The IDF weights are fitted on the training split
      // only and then applied to the test split, so both splits are scaled with the
      // same weights the classifier was trained on and no test statistics leak in.
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)
      val testrescaledData = idfModel.transform(testfeaturizedData)
      rescaledData.show(false)

      // Convert to the Naive Bayes input format.
      val trainDataRdd = toLabeledPoints(rescaledData)
      val testtrainDataRdd = toLabeledPoints(testrescaledData)

      val model = new NaiveBayes().fit(trainDataRdd)
      val predictions = model.transform(testtrainDataRdd)
      println("predictln out:")
      predictions.show()
      model.write.overwrite().save(modelPath)

      // Model evaluation: accuracy on the held-out test split.
      val evaluator = new MulticlassClassificationEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("accuracy")
      val accuracy = evaluator.evaluate(predictions)
      println("accuracy out :")
      println("Accuracy:" + accuracy)
    } finally {
      // Release the local Spark resources even when a stage fails.
      spark.stop()
    }
  }
}