from pyspark import SparkContext

from pyspark import SparkConf

from pyspark.sql import HiveContext

from pyspark.mllib.regression import LabeledPoint

 

# Application name handed to SparkConf.setAppName below.
string_test = 'pyspark_test'

# Build the Spark configuration: named application, YARN as the master.
conf = SparkConf().setAppName(string_test).setMaster('yarn')

sc = SparkContext(conf=conf)

# Hive-aware SQL entry point layered on top of the SparkContext.
hive_context = HiveContext(sc)

# The query has to be a Python string; unquoted SQL is a SyntaxError that
# prevents the whole script from being parsed.
data = hive_context.sql('select * from database.table')  # Spark DataFrame

# Turn every row into a LabeledPoint: the first column becomes the label and
# the remaining columns become the feature vector.
# NOTE(review): assumes every column of the table is numeric -- confirm
# against the table schema.
tdata = data.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))  # RDD