Handling Nulls in PySpark
df.filter(col("column_name").isNull()).show()
df.filter(col("column_name").isNotNull()).show()
Using na Functions
# Flag which values are null, column by column
df.select([col(c).isNull().alias(c) for c in df.columns]).show()

# Count the nulls in every column
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()
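Note that isNull() catches SQL nulls but not floating-point NaN values. A minimal sketch of counting both, restricted to float/double columns (the column-selection logic here is illustrative, not part of the original snippet):

from pyspark.sql.functions import col, count, when, isnan
from pyspark.sql.types import DoubleType, FloatType

# Sketch: NaN is not the same as null; count both for float/double columns
float_cols = [f.name for f in df.schema.fields
              if isinstance(f.dataType, (DoubleType, FloatType))]
df.select([count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
           for c in float_cols]).show()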
# Drop rows containing nulls
df.na.drop()                               # default: drop rows with any null
df.na.drop("any")                          # drop rows where any column is null
df.na.drop("all")                          # drop rows where all columns are null
df.na.drop(subset=["column1", "column2"])  # only consider these columns
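na.drop() also accepts a thresh argument that keeps only rows with at least that many non-null values; a quick sketch (the thresholds here are arbitrary examples):

# Keep only rows that have at least 2 non-null values
df.na.drop(thresh=2).show()

# thresh can be combined with subset
df.na.drop(thresh=1, subset=["column1", "column2"]).show()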
df.na.fill("default_value").show()
df.na.fill(0).show()
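na.fill() can also take a dict to fill different columns with different values in a single call; a sketch with assumed column names:

# Per-column fill values ("column1" / "column2" are assumed names)
df.na.fill({"column1": 0, "column2": "unknown"}).show()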
# Impute with the mean
mean_value = df.select(mean(col("column_name"))).collect()[0][0]
df = df.na.fill(mean_value, subset=["column_name"])
# Impute with the (approximate) median
median_value = df.approxQuantile("column_name", [0.5], 0.01)[0]
df = df.na.fill(median_value, subset=["column_name"])
# Impute with the mode (most frequent non-null value); exclude nulls first
# so the mode itself cannot be null
mode_value = (df.filter(col("column_name").isNotNull())
                .groupBy("column_name").count()
                .orderBy(col("count").desc())
                .first()[0])
df = df.na.fill(mode_value, subset=["column_name"])
# Forward fill: carry the last non-null value down, ordered by some_column
window_spec = Window.orderBy("some_column").rowsBetween(-sys.maxsize, 0)
df = df.withColumn("column_name",
                   last("column_name", ignorenulls=True).over(window_spec))
# Backward fill: carry the next non-null value up, ordered by some_column
window_spec = Window.orderBy("some_column").rowsBetween(0, sys.maxsize)
df = df.withColumn("column_name",
                   first("column_name", ignorenulls=True).over(window_spec))
df.agg(count("column_name")).show()
df.agg(count(when(col("column_name").isNotNull
(), True))).show()
# Null join keys never match; replace them on both sides before joining
df1 = df1.withColumn("join_column", coalesce(col("join_column"), lit("default_value")))
df2 = df2.withColumn("join_column", coalesce(col("join_column"), lit("default_value")))
joined_df = df1.join(df2, "join_column", "inner")
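Alternatively, Spark's null-safe equality operator lets null keys match each other without rewriting them; a sketch assuming the key column has the same name on both sides:

# Null-safe join: null <=> null evaluates to true, so null keys pair up
joined_df = df1.join(df2, df1["join_column"].eqNullSafe(df2["join_column"]), "inner")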
df = df.withColumn("array_column",
array_remove("array_column", None))
df = df.withColumn("struct_column",
col("struct_column").alias("new_struct")).drop("st
ruct_column")