PySpark notes

import os
import sys
spark_name = os.environ.get('SPARK_HOME',None)
if not spark_name:
    raise ValueError('SPARK_HOME is not configured properly')
sys.path.insert(0,os.path.join(spark_name,'python'))
sys.path.insert(0,os.path.join(spark_name,'python/lib/py4j-0.10.4-src.zip'))
import pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
sc = SparkContext(appName="MyProject")

In yarn mode this can still fail with: No module named pyspark
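One common fix (not verified here) is to make sure the pyspark and py4j archives are shipped to the cluster, e.g. by launching through spark-submit with --py-files, or by setting spark.submit.pyFiles before the context is created. A minimal sketch, assuming the zip names under $SPARK_HOME/python/lib match your installation and that the context is built from this conf instead of the plain constructor above:

archives = ','.join([
    os.path.join(spark_name, 'python/lib/pyspark.zip'),
    os.path.join(spark_name, 'python/lib/py4j-0.10.4-src.zip'),
])
conf = (SparkConf()
        .setMaster('yarn')
        .setAppName('MyProject')
        .set('spark.submit.pyFiles', archives))  # roughly the programmatic counterpart of --py-files
sc = SparkContext(conf=conf)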

## Initialize a SparkContext
from pyspark import SparkConf,SparkContext
conf = (SparkConf()
        .setMaster('local[20]')
        .setAppName('my spark app')
        .set("spark.executor.memory",'10g'))
sc = SparkContext(conf=conf)
# or
sc = SparkContext(appName="my spark app",master='local')
## Print SparkContext info
print(sc.version)
print(sc.pythonVer)
print(sc.master)
print(sc.sparkHome)
print(sc.sparkUser())
print(sc.appName)
print(sc.applicationId)
print(sc.defaultParallelism)
print(sc.defaultMinPartitions)

RDD operations

Basic statistics

rdd = sc.parallelize([('a',1),('b',4),('a',5)])
print(rdd.getNumPartitions())
print(rdd.count())
print(rdd.countByKey())
print(rdd.countByValue())
print(rdd.collectAsMap())
1
3
defaultdict(<class 'int'>, {'a': 2, 'b': 1})
defaultdict(<class 'int'>, {('a', 1): 1, ('b', 4): 1, ('a', 5): 1})
{'a': 5, 'b': 4}
## For a one-dimensional RDD
import random
rdd = sc.parallelize([random.randint(0,20) for i in range(10)])
print(rdd.collect())
print(sorted(rdd.collect()))
print(rdd.max())
print(rdd.min())
print(rdd.mean())
print(rdd.stdev())
print(rdd.stats())
print(rdd.histogram(3))
[20, 3, 7, 19, 7, 20, 20, 5, 16, 2]
[2, 3, 5, 7, 7, 16, 19, 20, 20, 20]
20
2
11.9
7.327346040688948
(count: 10, mean: 11.9, stdev: 7.327346040688948, max: 20.0, min: 2.0)
([2, 8, 14, 20], [5, 0, 5])

Applying functions

import random
data_random = [[random.randint(0,20) for j in range(5)] for i in range(3)]
rdd = sc.parallelize(data_random)
rdd1 = rdd.map(lambda x:sorted(x))
print(rdd1.collect())
rdd2 = rdd.flatMap(lambda x:sorted(x))   
print(rdd2.collect())
[[0, 5, 8, 13, 20], [0, 3, 3, 9, 11], [3, 4, 7, 11, 17]]
[0, 5, 8, 13, 20, 0, 3, 3, 9, 11, 3, 4, 7, 11, 17]
rdd = sc.parallelize([('a',[3,4,5,6]),('b',[11,22,33,44]),('a',[1111,2222])]) ## note that the elements are tuples
rdd1 = rdd.mapValues(lambda x:len(x))
print(rdd1.collect())
print(rdd.reduceByKey(lambda x,y:x+y).collect())
# groupByKey(): applied to a dataset of (K, V) pairs, returns a new dataset of (K, Iterable) pairs (see the sketch after the output below)
print(rdd.reduce(lambda x,y:x+y))  ## tuple concatenation
[('a', 4), ('b', 4), ('a', 2)]
[('a', [3, 4, 5, 6, 1111, 2222]), ('b', [11, 22, 33, 44])]
('a', [3, 4, 5, 6], 'b', [11, 22, 33, 44], 'a', [1111, 2222])
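A quick sketch of the groupByKey() behavior mentioned in the comment above; mapValues(list) just makes the grouped iterables printable (key order in the output may differ):

rdd = sc.parallelize([('a',[3,4,5,6]),('b',[11,22,33,44]),('a',[1111,2222])])
print(rdd.groupByKey().mapValues(list).collect())
# e.g. [('a', [[3, 4, 5, 6], [1111, 2222]]), ('b', [[11, 22, 33, 44]])]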
rdd = sc.parallelize([i for i in range(10)])
print(rdd.collect())
print(rdd.reduce(lambda x,y:x+y))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
45
rdd = sc.parallelize([i for i in range(5)])
print(rdd.collect())
print(rdd.keyBy(lambda x:chr(ord('a')+x)).collect()) ## add a key to each element
[0, 1, 2, 3, 4]
[('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4)]

Selecting data

data_random = [[random.randint(0,20) for j in range(5)] for i in range(3)]
rdd = sc.parallelize(data_random)
print(rdd.collect())
print(rdd.first()) ## return the first element
print(rdd.take(2)) ## return the first two elements as a list
print(rdd.top(2))  ## return the top two elements in descending order
[[2, 10, 8, 15, 8], [7, 5, 6, 17, 0], [18, 17, 7, 9, 13]]
[2, 10, 8, 15, 8]
[[2, 10, 8, 15, 8], [7, 5, 6, 17, 0]]
[[18, 17, 7, 9, 13], [7, 5, 6, 17, 0]]
## Sampling
data_random = [[random.randint(0,20) for j in range(5)] for i in range(100)]
rdd = sc.parallelize(data_random)
print(rdd.count())
rdd_sample = rdd.sample(withReplacement=False,fraction=0.3,seed=123)
print(rdd_sample.count())
100
31
## Filter (return elements that satisfy the predicate)
rdd = sc.parallelize([random.randint(0,10) for i in range(20)])
print(rdd.collect())
rdd_filtered = rdd.filter(lambda x:x>5)
print(rdd_filtered.collect())
[7, 10, 0, 4, 1, 5, 7, 2, 4, 2, 10, 2, 9, 10, 6, 5, 8, 0, 5, 2]
[7, 10, 7, 10, 9, 10, 6, 8]
## Distinct elements
rdd = sc.parallelize([1,2,3,4,5,6,1,2,3,4,5])
print(rdd.collect())
print(rdd.distinct().collect())
[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5, 6]
rdd = sc.parallelize([('a',[3,4,5,6]),('b',[11,22,33,44]),('a',[1111,2222])])
print(rdd.keys().collect())
print(rdd.values().collect())
['a', 'b', 'a']
[[3, 4, 5, 6], [11, 22, 33, 44], [1111, 2222]]

One row to multiple rows

(1, [1, 2, 3])
to
(1, 1)
(1, 2)
(1, 3)
rdd.flatMap(lambda x: map(lambda e: (x[0], e), x[1]))
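A runnable version of the same transformation on a small, made-up RDD:

rdd = sc.parallelize([(1, [1, 2, 3]), (2, [4, 5])])
print(rdd.flatMap(lambda x: [(x[0], e) for e in x[1]]).collect())
# [(1, 1), (1, 2), (1, 3), (2, 4), (2, 5)]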

Aggregation

rdd = sc.parallelize([i for i in range(50)])
rdd_groupby = rdd.groupBy(lambda x: x % 3)
print(rdd_groupby.collect())
print(rdd_groupby.mapValues(list).collect())
print(rdd_groupby.map(lambda x:(x[0],list(x[1]))).collect())
[(0, <pyspark.resultiterable.ResultIterable object at 0x7f2359670be0>), (1, <pyspark.resultiterable.ResultIterable object at 0x7f2359670a58>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f2359717be0>)]
[(0, [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]), (1, [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]), (2, [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47])]
[(0, [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48]), (1, [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49]), (2, [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47])]
from operator import add
sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
15
## aggregate each partition separately, then combine the partition results
sc.parallelize([1,2,3,4]).aggregate(
  (0, 0),
  (lambda acc, value: (acc[0] + value, acc[1] + 1)),
  (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])))
(10, 4)
## Set operations
rdd1 = sc.parallelize(list(range(10)))
rdd2 = sc.parallelize(list(range(5,15)))
rdd3 = rdd2.subtract(rdd1)  ## elements of rdd2 that are not in rdd1
print(rdd3.collect())
[10, 12, 14, 11, 13]
rdd1 = sc.parallelize([('a',1),('b',2),('c',3)])
rdd2 = sc.parallelize([('a',1),('b',2),('d',4)])
rdd3 = rdd2.subtractByKey(rdd1)  ## set difference by key
print(rdd3.collect())
[('d', 4)]
rdd1 = sc.parallelize(list(range(3)))
rdd2 = sc.parallelize(list(range(5,8)))
rdd3 = rdd1.cartesian(rdd2) ## Cartesian product
print(rdd3.collect())
[(0, 5), (0, 6), (0, 7), (1, 5), (1, 6), (1, 7), (2, 5), (2, 6), (2, 7)]
## Repartition the RDD (these return a new RDD, so reassign the result)
rdd = rdd.repartition(4)
rdd = rdd.coalesce(2)
print(rdd.getNumPartitions())
2
## Save
rdd.saveAsTextFile(path)
rdd.saveAsHadoopFile(path)  # also requires an outputFormatClass argument
...

DataFrame

from pyspark.sql import SparkSession
spark = SparkSession(sc)
## Create a DataFrame from an RDD
from pyspark.sql.types import *
from pyspark.sql import Row
## Infer the schema from the RDD
rdd = sc.parallelize([[i,chr(i+97)] for i in range(5)])
print(rdd.collect())
rdd = rdd.map(lambda x:Row(No=x[0],char=x[1]))
print(rdd.collect())

df = spark.createDataFrame(rdd)
df.show()
[[0, 'a'], [1, 'b'], [2, 'c'], [3, 'd'], [4, 'e']]
[Row(No=0, char='a'), Row(No=1, char='b'), Row(No=2, char='c'), Row(No=3, char='d'), Row(No=4, char='e')]
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
## Specify the schema explicitly
rdd = sc.parallelize([Row(No=i,char=chr(i+97)) for i in range(5)])
print(rdd.collect())
fields = [StructField(name='No',dataType=IntegerType(),nullable=True),
          StructField(name='char',dataType=StringType(),nullable=True)]
schema = StructType(fields)
df = spark.createDataFrame(rdd, schema)
df.show()
df.show()
[Row(No=0, char='a'), Row(No=1, char='b'), Row(No=2, char='c'), Row(No=3, char='d'), Row(No=4, char='e')]
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+
## Save the DataFrame to files
df.select('*').write.save(path+'df.parquet')
df.select('*').write.save(path+'df.json',format='json')
## Read back from files
df=spark.read.load(path+"df.parquet")
print(df.show())
df=spark.read.load(path+"df.json",format='json')
print(df.show())
df=spark.read.json(path+'df.json')
print(df.show())
df = spark.read.parquet('hdfs://default/warehouse/data/date={19000822,19000823}')
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+

None
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+

None
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+

None
## Inspect the DataFrame
print(df.dtypes)
print(df.schema)
print(df.show())
print(df.first())
print(df.take(1))
print(df.describe().show())
[('No', 'bigint'), ('char', 'string')]
StructType(List(StructField(No,LongType,true),StructField(char,StringType,true)))
+---+----+
| No|char|
+---+----+
|  0|   a|
|  1|   b|
|  2|   c|
|  3|   d|
|  4|   e|
+---+----+

None
Row(No=0, char='a')
[Row(No=0, char='a')]
+-------+------------------+----+
|summary|                No|char|
+-------+------------------+----+
|  count|                 5|   5|
|   mean|               2.0|null|
| stddev|1.5811388300841898|null|
|    min|                 0|   a|
|    max|                 4|   e|
+-------+------------------+----+

None
print(df.columns)
print(df.count())
print(df.distinct().count())
print(df.printSchema())
print(df.explain())  ## print the physical plan
['No', 'char']
5
5
root
 |-- No: long (nullable = true)
 |-- char: string (nullable = true)

None
== Physical Plan ==
*FileScan json [No#1051L,char#1052] Batched: false, Format: JSON, Location: InMemoryFileIndex[afs://baihua.afs.baidu.com:9902/user/fpd_dm/tangshengyu/spark/df.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<No:bigint,char:string>
None

DataFrame queries

## select
from pyspark.sql import functions as F 
rdd = sc.parallelize([Row(No=1,char='a',values=[1,2]),Row(No=2,char='b',values=[1,4])])
df = spark.createDataFrame(rdd)
print(df.select('No').show())
print(df.select('No','char',F.explode('values').alias('value')).show()) ## explode an array column into multiple rows
print(df.select(df['No']+1,df['char']).show()) ## column-wise arithmetic
+---+
| No|
+---+
|  1|
|  2|
+---+

None
+---+----+-----+
| No|char|value|
+---+----+-----+
|  1|   a|    1|
|  1|   a|    2|
|  2|   b|    1|
|  2|   b|    4|
+---+----+-----+

None
+--------+----+
|(No + 1)|char|
+--------+----+
|       2|   a|
|       3|   b|
+--------+----+

None
print(df.select(df['No']>1).show())   ## boolean expression over a column
print(df.select('No','char',F.when(df.char=='a',1).otherwise(0).alias('is a')).show()) ## derive a new column with when/otherwise
print(df[df.char.isin('a')].show()) ## filter rows where char is in the given set
print(df.select('No',df.char.like('a')).show()) ## SQL LIKE on an existing column
+--------+
|(No > 1)|
+--------+
|   false|
|    true|
+--------+

None
+---+----+----+
| No|char|is a|
+---+----+----+
|  1|   a|   1|
|  2|   b|   0|
+---+----+----+

None
+---+----+------+
| No|char|values|
+---+----+------+
|  1|   a|[1, 2]|
+---+----+------+

None
+---+-----------+
| No|char LIKE a|
+---+-----------+
|  1|       true|
|  2|      false|
+---+-----------+

None
print(df.select('No',df.char.startswith('a')).show()) ## starts with the given prefix
print(df.select('No',df.char.endswith('b')).show())   ## ends with the given suffix
+---+-------------------+
| No|startswith(char, a)|
+---+-------------------+
|  1|               true|
|  2|              false|
+---+-------------------+

None
+---+-----------------+
| No|endswith(char, b)|
+---+-----------------+
|  1|            false|
|  2|             true|
+---+-----------------+

None
rdd = sc.parallelize([Row(No=1,char='aaaaa',values=[1,2]),Row(No=2,char='bbbbb',values=[1,4])])
df = spark.createDataFrame(rdd)
print(df.select('No',df.char.substr(1,3)).show())
print(df.select(df.No.between(0,1)).show())
+---+---------------------+
| No|substring(char, 1, 3)|
+---+---------------------+
|  1|                  aaa|
|  2|                  bbb|
+---+---------------------+

None
+-------------------------+
|((No >= 0) AND (No <= 1))|
+-------------------------+
|                     true|
|                    false|
+-------------------------+

None
## Add a column
df = df.withColumn('new_col',df.char)
print(df.show())
df = df.withColumnRenamed('new_col','renamed_col')
print(df.show())
+---+-----+------+-------+
| No| char|values|new_col|
+---+-----+------+-------+
|  1|aaaaa|[1, 2]|  aaaaa|
|  2|bbbbb|[1, 4]|  bbbbb|
+---+-----+------+-------+

None
+---+-----+------+-----------+
| No| char|values|renamed_col|
+---+-----+------+-----------+
|  1|aaaaa|[1, 2]|      aaaaa|
|  2|bbbbb|[1, 4]|      bbbbb|
+---+-----+------+-----------+

None
## Drop columns
df = df.drop("renamed_col")
print(df.show())
df = df.drop(df.values)
print(df.show())
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
+---+-----+------+

None
+---+-----+
| No| char|
+---+-----+
|  1|aaaaa|
|  2|bbbbb|
+---+-----+

None
## groupby
rdd = sc.parallelize([Row(No=1,char='aaaaa',values=[1,2]),
                      Row(No=2,char='bbbbb',values=[1,4]),
                      Row(No=1,char='ccccc',values=[3,5])])
df = spark.createDataFrame(rdd)
print(df.groupBy('No').count().show())
print(df.groupby('No').count().show())  # groupby is an alias of groupBy
+---+-----+
| No|count|
+---+-----+
|  1|    2|
|  2|    1|
+---+-----+

None
+---+-----+
| No|count|
+---+-----+
|  1|    2|
|  2|    1|
+---+-----+

None
print(df.filter(df.No>1).show())
+---+-----+------+
| No| char|values|
+---+-----+------+
|  2|bbbbb|[1, 4]|
+---+-----+------+

None
## sort
print(df.sort(df.No.desc()).show())  ## descending; use asc() for ascending
print(df.orderBy(['No','char'],ascending=[1,1]).show())  ## order by multiple columns
+---+-----+------+
| No| char|values|
+---+-----+------+
|  2|bbbbb|[1, 4]|
|  1|aaaaa|[1, 2]|
|  1|ccccc|[3, 5]|
+---+-----+------+

None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  1|ccccc|[3, 5]|
|  2|bbbbb|[1, 4]|
+---+-----+------+

None
## Value replacement and filling missing values
rdd = sc.parallelize([Row(No=1,char='aaaaa',values=[1,2]),
                      Row(No=2,char='bbbbb',values=[1,4]),
                      Row(No=1,char='ccccc',values=[3,5])])
df = spark.createDataFrame(rdd)
print(df.show())
print(df.na.fill('eeee').show())  ## fill nulls in string columns
print(df.na.drop().show())  ## drop rows that contain nulls
print(df.na.replace(1,10).show()) ## replace a specific value
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+

None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+

None
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+

None
+---+-----+------+
| No| char|values|
+---+-----+------+
| 10|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
| 10|ccccc|[3, 5]|
+---+-----+------+

None
## Partitioning
print(df.repartition(3).rdd.getNumPartitions())
print(df.coalesce(1).rdd.getNumPartitions())
3
1
## Query with SQL
## Create a view
#df.createGlobalTempView('demo') #across spark sessions
df.createOrReplaceTempView('demo') # spark session lifetime
#df.createTempView('demo')
spark.sql("select * from demo").show()
+---+-----+------+
| No| char|values|
+---+-----+------+
|  1|aaaaa|[1, 2]|
|  2|bbbbb|[1, 4]|
|  1|ccccc|[3, 5]|
+---+-----+------+
## Convert to other formats
df.toJSON()
df.toPandas()
df.rdd.take(1)
# stop the session
spark.stop()
# save a DataFrame
df_vehicle_route_gps_info_milage.write.save(path='hdfs://tmp/dev_tsy_mileage_case1', format='csv', mode='overwrite', sep='\t')
df.write.saveAsTable(save_table,mode='overwrite',format='parquet')
df.write.csv(path,mode='overwrite',compression='none')

# PySpark Merge Two DataFrames with Different Columns

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

#Create DataFrame df1 with columns name,dept & age
data = [("James","Sales",34), ("Michael","Sales",56), \
    ("Robert","Sales",30), ("Maria","Finance",24) ]
columns= ["name","dept","age"]
df1 = spark.createDataFrame(data = data, schema = columns)
df1.printSchema()

#Create DataFrame df2 with columns name,dept,state & salary
data2=[("James","Sales","NY",9000),("Maria","Finance","CA",9000), \
    ("Jen","Finance","NY",7900),("Jeff","Marketing","CA",8000)]
columns2= ["name","dept","state","salary"]
df2 = spark.createDataFrame(data = data2, schema = columns2)
df2.printSchema()
#Add missing columns 'state' & 'salary' to df1
from pyspark.sql.functions import lit
for column in [column for column in df2.columns if column not in df1.columns]:
    df1 = df1.withColumn(column, lit(None))

#Add missing column 'age' to df2
for column in [column for column in df1.columns if column not in df2.columns]:
    df2 = df2.withColumn(column, lit(None))

#Finally union the two DataFrames df1 & df2 by column name
merged_df=df1.unionByName(df2)
merged_df.show()

# Add a column from another DataFrame
minDf = sc.parallelize(['2016-11-01','2016-11-02','2016-11-03']).map(lambda x: (x, )).toDF(['date_min'])
maxDf = sc.parallelize(['2016-12-01','2016-12-02','2016-12-03']).map(lambda x: (x, )).toDF(['date_max'])
def zip_df(l, r):
    # note: rdd.zip() requires both RDDs to have the same number of partitions
    # and the same number of elements per partition
    return l.rdd.zip(r.rdd).map(lambda x: (x[0][0],x[1][0])).toDF(StructType([l.schema[0],r.schema[0]]))

combined = zip_df(minDf, maxDf.select('date_max'))
combined.show()

Data persistence

https://blog.csdn.net/qq_32023541/article/details/79282179
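The link above covers the details; below is only a minimal sketch of the usual calls, using the standard pyspark StorageLevel names:

from pyspark import StorageLevel
rdd = sc.parallelize(range(100))
rdd.persist(StorageLevel.MEMORY_AND_DISK)  # mark the RDD for caching
rdd.count()       # the first action materializes the cache
rdd.count()       # later actions reuse the cached data
rdd.unpersist()   # release it

df.cache()        # DataFrame equivalent (MEMORY_AND_DISK by default in Spark 2.x)
df.unpersist()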

Renaming multiple columns

from pyspark.sql.functions import col
rename_cols = [col(col_name).alias("s_" + col_name) for col_name in df_label.columns]
df_label = df_label.select(*rename_cols)

Overwriting a partition of a partitioned table

# first drop the existing partition
drop_sql = "ALTER TABLE %s DROP IF EXISTS PARTITION (%s='%s')"%(save_table,partition_col,tar_dt)
print(drop_sql)
spark.sql(drop_sql).show(100,False)
# then append the new data
df_data.write.saveAsTable(save_table, mode='append', partitionBy=[partition_col])

Converting between RDD and DataFrame with Row

from pyspark.sql import Row
def tmp_fun(row):
    tmp = row.asDict()
    tmp['c'] = {'a': 1, 'b': 2}
    return Row(**tmp)
df_data = spark.sql('select * from table')
rdd_data = df_data.rdd.map(tmp_fun)
df_data = rdd_data.toDF()

Saving a DataFrame as JSON

Keys whose values are null are omitted from the written records.
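A sketch of the behavior and one workaround: filling nulls before writing keeps every key in the output records (the fill values below are arbitrary).

df.write.mode('overwrite').json(path + 'df_json')
# a row like Row(No=1, char=None) is written as {"No":1} -- the "char" key is dropped

df.na.fill({'No': -1, 'char': ''}).write.mode('overwrite').json(path + 'df_json_filled')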

Reading a CSV file from an HDFS path

df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('hdfs://path/data.csv')

Clearing the cache

spark 2.x

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.catalog.clearCache()

spark 1.x

from pyspark.sql import SQLContext
from pyspark import SparkContext
sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
sqlContext.clearCache()

Reading CSV into a DataFrame

df = spark.read.load(data_path, format='com.databricks.spark.csv', header='true', inferSchema='true')

from pyspark.sql.types import StructType, LongType, FloatType
schema = StructType() \
        .add("a", LongType(), True) \
        .add("b", FloatType(), True)

df_with_schema = spark.read.format("csv").option("header", True).schema(schema).load(data_path)

Saving a DataFrame as CSV

df.repartition(1).write.format('com.databricks.spark.csv').save("/path/to/file/myfile.csv",header = 'true')
df.write.option("compression", "gzip").csv("path")
Supported compression codecs: gzip, bzip2, lz4, snappy, deflate

compressMode = "org.apache.hadoop.io.compress.GzipCodec"
df.write.mode("overwrite").format("**").option("codec", compressMode).save(path)


df.write.format("com.databricks.spark.csv")\
        .mode('overwrite')\
        .option("header", "false") \
        .option('compression','none') \
        .save(path)

Creating a DataFrame from an RDD with explicit data types

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType
deptSchema = StructType([       
    StructField('a', StringType(), True),
    StructField('b', IntegerType(), True),
    StructField('c', DoubleType(), True),
    StructField('d',ArrayType(IntegerType()),True)  
])
Check the schema with df.printSchema()

#### bigint corresponds to LongType
rdd_feat = df_data.rdd.map(map_fun)
df_feat = spark.createDataFrame(rdd_feat, schema = deptSchema)

Loading a trained model file for distributed scoring

import pickle
model_path = 'hdfs://xgb_model.pkl'
rdd_model = sc.binaryFiles(model_path)
model_str = rdd_model.take(1)[0][1]
model = pickle.loads(model_str)
bc_model = sc.broadcast(model)  # broadcast the model so it is shipped to the executors once

pdf_feat = pd.DataFrame([feat_list])
pdf_feat.columns = ['feat_1','feat_2',...]
predict_score = model.predict_proba(pdf_feat)[:,1][0]
 
model.get_booster().feature_names = ['f%s'%i for i in range(64)]     ### workaround for: ValueError: feature_names mismatch
from pyspark.sql.types import FloatType
import pyspark.sql.functions as f

def predictor(all_feat):
    # all_feat arrives as the array column; wrap it as a single-row 2-D input
    score = float(model.predict_proba([all_feat])[:,1][0])
    return score

udf_predictor = f.udf(predictor, FloatType())
columns = [f.col('p_vec_0'),f.col('p_vec_1')]
df = df.withColumn("all_feat", f.array(columns))
df_prediction = df.withColumn("risk_score", udf_predictor(df['all_feat']))

Loading a trained model file for distributed scoring, method 2

Define a map function over the RDD; inside the map, make sure the order of the input feature columns matches the order of the model's feature columns, then call predict with validate_features=False, passing in the feature list to get the score, as sketched below.
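A minimal sketch of that approach, assuming an xgboost sklearn-style model broadcast as in the previous section, a feature DataFrame df_data with an id column, and xgboost installed on the executors (the column names here are illustrative):

feat_cols = ['feat_1', 'feat_2']             # must match the model's training order
bc_model = sc.broadcast(model)

def score_row(row):
    feats = [row[c] for c in feat_cols]      # keep the feature order fixed
    proba = bc_model.value.predict_proba([feats], validate_features=False)[:, 1]
    return (row['id'], float(proba[0]))

df_scored = df_data.rdd.map(score_row).toDF(['id', 'risk_score'])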

Submitting jobs concurrently


class A:
    def __init__(self,spark):
        self.spark = spark 
    def run(self,value):
        print(f'{value} start')
        df = self.spark.read.parquet('hdfs://aaa')
        cnt = df.count()
        print(f'{value} finish')
        return value


class B:
    def __init__(self):
        self.spark = SparkSession.builder.master('yarn').enableHiveSupport().getOrCreate()

    def run(self,value):
        print(f'{value} start')
        df = self.spark.read.parquet('hdfs://aaa')
        cnt = df.count()
        print(f'{value} finish')
        return value

def run(value):
    B().run(value)  


import concurrent.futures

a = A(spark)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(a.run,[x for x in range(10)])

print('---')
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(run,x) for x in range(10,20)]

Sharing files across the cluster with SparkFiles

from pyspark import SparkContext
from pyspark import SparkFiles

finddistance = "/home/hadoop/examples_pyspark/finddistance.R"
finddistancename = "finddistance.R"
sc = SparkContext("local", "SparkFile App")
sc.addFile(finddistance)
print "Absolute Path -> %s" % SparkFiles.get(finddistancename)

# or
spark.sparkContext.addFile(saved_model_path)
file_local = SparkFiles.get(filename)  # filename: the base name of saved_model_path

Using sc to access the Hadoop FileSystem

sc = spark.sparkContext
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
check = fs.exists(sc._jvm.org.apache.hadoop.fs.Path(f"{save_path}/_SUCCESS"))
fs.delete(sc._jvm.org.apache.hadoop.fs.Path(save_path+'_xxxx_tmp'))
