# Example: using a UDF together with isin() inside a filter
df = spark.sql(
    "select 100 as c1,Array(struct(1,2)) as a"
    " union all"
    " select 50 as c1,Array(struct(3,4)) as a"
)


def test_udf(c):
    """Return the input incremented by 1; registered as a Spark UDF below."""
    return c + 1


# spark.udf.register returns a callable usable from the DataFrame API.
# The original code called the plain Python function instead, which just
# builds the Column expression `c1 + 1` and bypasses the UDF entirely —
# use the returned handle so the example actually exercises the UDF.
test_udf_col = spark.udf.register('test_udf', test_udf, IntegerType())

# Keep rows where test_udf(c1) is 101 or 50 → matches the c1 == 100 row.
df.filter(test_udf_col(F.col('c1')).isin(101, 50)).show()
# Build a column holding arrays of structs; when struct fields are not
# named explicitly, Spark assigns default names col1, col2, ...
df = spark.sql(
    "select 100 as c1,Array(struct(1,2),struct(100,200)) as a"
    " union all"
    " select 50 as c1,Array(struct(3,4),struct(300,400)) as a"
)
df.createOrReplaceTempView('t1')

# Read the second field (default name col2) of the first struct in the array.
spark.sql("select c1,a[0]['col2'] from t1").show()
# To filter rows by a value inside the struct array, use array_contains:
# a['col1'] projects the col1 field of every struct, yielding array<int>.
spark.sql("select * from t1 where array_contains(a['col1'],1)").show()
#Another way to flatten: pivot rows/columns first (explode), then filter by condition
def lg_to_number(string):
    """Transliterate *string* to its closest ASCII representation.

    Robustness fixes over the original:
    - Spark passes ``None`` for null cells; pass it through instead of
      letting ``unidecode`` raise.
    - The example usage applies this UDF to an integer struct field
      (``b['col1']``), and ``unidecode`` requires ``str`` — coerce the
      input with ``str()`` first so non-string inputs do not crash.
    """
    if string is None:
        return None
    return unidecode(str(string))
# Wrap the transliteration helper as a Spark UDF that yields strings.
udf_lg_to_number = udf(lg_to_number, returnType=StringType())

# Explode the struct array into one row per element, then apply the UDF
# to the col1 field of each exploded struct.
exploded = df1.select(F.col('c1'), F.explode(F.col('a')).alias('b'))
exploded.withColumn('aaa', udf_lg_to_number(F.col('b')['col1'])).show()