pyspark
PySpark DataFrame 的一些函数示例(select、filter):
# Project the 'age' and 'mobile' columns and print up to 10 rows.
df.select('age', 'mobile').show(10)
# Keep only the rows whose 'mobile' column equals 'Vivo' and print them.
df.filter(df['mobile']=='Vivo').show()
# Combine two conditions with `&`; each comparison needs its own parentheses
# because `&` binds more tightly than `==` and `>` in Python.
df.filter((df['mobile']=='Vivo')&(df['experience'] >10)).show()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import Row
import pandas as pd
from sklearn import metrics
if __name__ == "__main__":
    # Script entry point: train a random forest on a CSV data set (last column
    # is the label), report feature importances, then score a held-out split.
    appname = "RandomForestClassifier"
    master = "local[4]"
    conf = SparkConf().setAppName(appname).setMaster(master)  # Spark configuration
    spark = SparkSession.builder.config(conf=conf).getOrCreate()  # Spark session

    # Read the data; the first line of the CSV is treated as the header.
    data = spark.read.csv('良恶性乳腺癌数据.csv', header=True)

    # Build the data set: fill nulls with '0' and turn every row into a list,
    # then split it 70/30 with a fixed seed so the split is reproducible.
    dataSet = data.na.fill('0').rdd.map(list)
    trainData, testData = dataSet.randomSplit([0.7, 0.3], seed=7)

    # Last element of each row is the label; everything before it is the
    # feature vector.
    # NOTE(review): the CSV is read without schema inference, so the values
    # presumably arrive as numeric strings that Vectors.dense converts - confirm.
    trainingSet = trainData.map(
        lambda x: Row(label=x[-1], features=Vectors.dense(x[:-1]))
    ).toDF()
    train_num = trainingSet.count()
    print("训练样本数:{}".format(train_num))

    # Train the random forest. The label column is indexed first because the
    # classifier is pointed at the numeric "indexed" column.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(trainingSet)
    train_tf = si_model.transform(trainingSet)
    train_tf.show(5)
    rf = RandomForestClassifier(numTrees=100, labelCol="indexed", seed=7)
    rfModel = rf.fit(train_tf)

    # Report the model's feature importances and feature count.
    print("模型特征重要性:{}".format(rfModel.featureImportances))
    print("模型特征数:{}".format(rfModel.numFeatures))

    # Predict on the test set.
    testSet = testData.map(
        lambda x: Row(label=x[-1], features=Vectors.dense(x[:-1]))
    ).toDF()
    test_num = testSet.count()
    print("测试样本数:{}".format(test_num))
    # Reuse the indexer fitted on the training set. Refitting on the test set
    # could assign different indices to the same labels (indices depend on the
    # label frequencies of the fitted data), which would make the "indexed"
    # column disagree with what the model was trained on.
    test_tf = si_model.transform(testSet)
    predictResult = rfModel.transform(test_tf)
predictResult.