from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when
spark = SparkSession.builder.appName("NullHandling").getOrCreate()

# Step 1: Read CSV with header and schema inference
df = spark.read.option("header", True).option("inferSchema", True).csv("path/to/file.csv")

# Step 2: Dynamically replace nulls based on column data types
for column, dtype in df.dtypes:
    if dtype in ("int", "bigint", "double", "float"):
        df = df.withColumn(column, when(col(column).isNull(), lit(0)).otherwise(col(column)))
    elif dtype == "string":
        df = df.withColumn(column, when(col(column).isNull(), lit("missing")).otherwise(col(column)))
    elif dtype == "boolean":
        df = df.withColumn(column, when(col(column).isNull(), lit(False)).otherwise(col(column)))
    # Add other type-specific defaults as needed

# Final cleaned DataFrame
df.show()
```
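The "other type-specific defaults" comment extends the same way to additional types. For example, here is a hedged sketch for date and timestamp columns, assuming a 1970-01-01 sentinel is acceptable for your data (adjust the default to whatever makes sense in your domain):

```python
from pyspark.sql.functions import to_date, to_timestamp

for column, dtype in df.dtypes:
    if dtype == "date":
        # Hypothetical sentinel date; pick a default that suits your dataset
        df = df.withColumn(
            column,
            when(col(column).isNull(), to_date(lit("1970-01-01"))).otherwise(col(column)),
        )
    elif dtype == "timestamp":
        # Hypothetical sentinel timestamp
        df = df.withColumn(
            column,
            when(col(column).isNull(), to_timestamp(lit("1970-01-01 00:00:00"))).otherwise(col(column)),
        )
```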
### ✅ Summary:
* Automatically detects column types.
* Fills nulls with type-appropriate defaults: `0` for numbers, `"missing"` for strings, `False` for booleans.
* Avoids hardcoding column names.
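
If you'd rather avoid the chain of `withColumn` calls, the same idea can be expressed with `DataFrame.fillna()`, which accepts a dict mapping column names to replacement values (boolean replacements are supported in Spark 2.3+). A minimal sketch of the same logic:

```python
# Build one dict of per-column defaults, then fill all nulls in a single pass
defaults = {}
for column, dtype in df.dtypes:
    if dtype in ("int", "bigint", "double", "float"):
        defaults[column] = 0
    elif dtype == "string":
        defaults[column] = "missing"
    elif dtype == "boolean":
        defaults[column] = False

df_filled = df.fillna(defaults)
df_filled.show()
```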