PySpark configuration (SparkConf)
When using PySpark, make sure PYSPARK_PYTHON points to Python 3.5 so that pyspark.ml can be used. The driver's runtime must also be Python 3.5: driver and executor Python versions have to match exactly, otherwise Spark raises an error at runtime (see the version-check sketch after the setup code below).
import os

# Pin the executor Python before the SparkContext is created; this is the
# conda Python 3.5 installed on the cluster nodes.
os.environ["PYSPARK_PYTHON"] = "/home/q/conda/bin/python3.5"
# Driver Python; it must resolve to the same Python 3.5.
os.environ["PYSPARK_DRIVER_PYTHON"] = "python"

import findspark
findspark.init()

import pandas as pd
import numpy as np
import pickle

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# import jieba            # optional: Chinese word segmentation
# jieba.initialize()
conf = (
    SparkConf()
    .setAppName("NLP_Project_youming.guo")
    .setMaster("yarn")  # alternative for local testing: .setMaster("local[20]")
    .set("spark.yarn.queue", "root.adhoc")
    # Ship pyspark and py4j to the executors so they do not depend on a
    # cluster-side installation.
    .set("spark.yarn.dist.files",
         "file:/home/q/spark/python/lib/pyspark.zip,"
         "file:/home/q/spark/python/lib/py4j-0.10.4-src.zip")
    .setExecutorEnv("PYTHONPATH", "pyspark.zip:py4j-0.10.4-src.zip")
    # 'PYSPARK_PYTHON' is not read as a SparkConf key; the corresponding
    # property (Spark 2.1+) is spark.pyspark.python.
    .set("spark.pyspark.python", "/home/q/conda/bin/python3.5")
)
conf.set("spark.executor.memory", "5g")
# Note: in client mode spark.driver.memory has no effect when set from
# application code, because the driver JVM is already running; pass it via
# spark-defaults.conf or --driver-memory instead.
conf.set("spark.driver.memory", "10g")
conf.set("spark.executor.cores", "2")
conf.set("spark.dynamicAllocation.maxExecutors", "5")
conf.set("spark.driver.maxResultSize", "0")  # 0 = no limit on collected results
# Dynamic allocation requires the external shuffle service on each node.
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.shuffle.service.enabled", "true")
conf.set("spark.shuffle.service.port", "7338")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)  # kept for legacy APIs; SparkSession supersedes it
spark = SparkSession(sc)
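
To confirm that the driver and the executors really run the same interpreter, a quick sanity check can be run once the context is up (a minimal sketch; the one-element job exists only to execute the lambda on an executor):

import sys

# Driver-side interpreter version.
print("driver:", sys.version)

# Executor-side interpreter version: the lambda re-imports sys on the
# executor, so this prints the remote Python's version.
print("executor:", sc.parallelize([0], 1).map(lambda _: sys.version).first())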
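
The ml imports above (Tokenizer, HashingTF, IDF, NaiveBayes, MulticlassClassificationEvaluator) are typically wired together as a TF-IDF + Naive Bayes pipeline. A minimal sketch using the session created above; the toy rows and the "text"/"label" column names are placeholders, not from the original:

train = spark.createDataFrame(
    [("spark is fast", 0.0), ("i hate bugs", 1.0)],  # placeholder toy data
    ["text", "label"],
)

# Tokenize -> hash term frequencies -> reweight with IDF -> Naive Bayes.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages=[tokenizer, tf, idf, nb])
model = pipeline.fit(train)

predictions = model.transform(train)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
print("accuracy:", evaluator.evaluate(predictions))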