filter中使用udf以及in操作的例子

# Two literal SELECTs glued together with UNION ALL: each row carries an
# integer c1 and an array `a` holding a single unnamed struct.
df = spark.sql(
    " union all ".join([
        "select 100 as c1,Array(struct(1,2)) as a",
        "select 50 as c1,Array(struct(3,4)) as a",
    ])
)

def test_udf(c):
    return c+1
# Register the function for SQL under the name 'test_udf' and keep the UDF
# object that register() returns so it can be used from the DataFrame API.
# Calling the plain Python function on a Column would only build the
# expression `c1 + 1` and never execute the UDF.
test_udf_col = spark.udf.register('test_udf', test_udf, IntegerType())

# Keep the rows whose UDF result is one of the listed values.
df.filter(test_udf_col(F.col('c1')).isin(101, 50)).show()
# Rebuild the sample frame; this time every array holds two structs.
df = spark.sql(
    " union all ".join((
        "select 100 as c1,Array(struct(1,2),struct(100,200)) as a",
        "select 50 as c1,Array(struct(3,4),struct(300,400)) as a",
    ))
)

# Expose the frame to Spark SQL under the view name 't1'.
df.createOrReplaceTempView('t1')

# Struct fields that were not given names are named col1, col2, ... by default.
first_struct_col2 = spark.sql("select c1,a[0]['col2'] from t1")
first_struct_col2.show()

# To filter on the value of a struct field, array_contains can be used.
rows_with_col1_eq_1 = spark.sql(
    "select * from t1 where array_contains(a['col1'],1)"
)
rows_with_col1_eq_1.show()

# Another approach: expand the array into rows first, then work on (or filter
# by) the individual struct fields.
def lg_to_number(string):
    """Pass ``string`` through ``unidecode`` and return the result.

    NOTE(review): ``unidecode`` is not imported in the visible part of this
    file; presumably it is ``unidecode.unidecode`` -- confirm.
    """
    # The body was not indented in the original, which is an IndentationError.
    return unidecode(string)
# Wrap the plain Python function as a Spark UDF declared to return strings.
udf_lg_to_number = udf(lg_to_number, returnType=StringType())

# `df1` is never defined in this file; the frame with columns c1 and a is `df`.
# explode() turns every struct of the array into its own row (aliased 'b').
# The struct fields here are integers, so the value is cast to string before
# it reaches the UDF, which hands its argument to a text transliteration call.
(
    df.select(F.col('c1'), F.explode(F.col('a')).alias('b'))
    .withColumn('aaa', udf_lg_to_number(F.col('b')['col1'].cast('string')))
    .show()
)
原文地址:https://www.cnblogs.com/muyue123/p/13986889.html