While searching for a free Pandas project on Google, I found this link: Exploratory Data Analysis (EDA) with Pandas in Banking. I have tried to convert that Python script into a PySpark one.
First, let’s handle the initial steps of downloading and extracting the data:
# These are shell commands (notebook cells) to download and unzip the data.
# NOTE(review): the original ran two wgets for the same archive name; wget
# saves the second copy as bank-additional.zip.1 and unzip only uses the
# first, so a single download suffices. The IBM mirror is kept because the
# original UCI link may be unavailable; swap the URL if needed:
#   https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/EDA_Pandas_Banking_L1/bank-additional.zip
!unzip -o -q bank-additional.zip
In a PySpark context, we’ll assume the data is already available locally after the above steps.
Importing Libraries
# Pandas/Matplotlib setup for the original (plain-Python/notebook) version.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Jupyter magic: render figures inline in the notebook (not valid plain Python).
%matplotlib inline
# Default figure size for all plots below.
plt.rcParams["figure.figsize"] = (8, 6)
import warnings
# Silence warnings so the tutorial output stays readable.
warnings.filterwarnings('ignore')
Equivalent imports in PySpark:
# PySpark equivalents of the Pandas imports above.
from pyspark.sql import SparkSession
# NOTE: this wildcard import shadows the built-ins `max`, `sum` and `round`
# with their Column-based Spark counterparts; that is intentional here, since
# the snippets below apply these functions to DataFrame columns.
from pyspark.sql.functions import *
# Window is required later for the normalized crosstab
# (sum("count").over(Window.partitionBy("y"))); the original snippet omitted it.
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import numpy as np

# Initialize (or reuse) a local Spark session.
spark = SparkSession.builder.appName("EDA_PySpark_Banking").getOrCreate()
Loading Data
Pandas:
# Load the full banking dataset; fields are ';'-separated in the source CSV.
df = pd.read_csv('bank-additional/bank-additional-full.csv', sep=';')
# Peek at the first five rows.
df.head(5)
PySpark:
# Load the same CSV; header=True keeps column names, inferSchema=True detects
# numeric vs. string types (requires an extra pass over the file).
df = spark.read.csv('bank-additional/bank-additional-full.csv', sep=';', header=True, inferSchema=True)
# Display the first five rows.
df.show(5)
Initial Exploration
Columns
Pandas:
# Column labels of the DataFrame (a pandas Index).
df.columns
PySpark:
# Column names as a plain Python list (same attribute name as pandas).
df.columns
Info
Pandas:
# Per-column dtypes, non-null counts and memory usage.
print(df.info())
PySpark:
# Inferred schema with column types — the PySpark analogue of df.info().
df.printSchema()
Describe
Pandas:
# Summary statistics for numeric columns.
df.describe()
# Summary statistics (count/unique/top/freq) for categorical columns.
df.describe(include=["object"])
PySpark:
# Summary statistics (count/mean/stddev/min/max) for all columns.
df.describe().show()
# NOTE(review): not a direct analogue of describe(include=["object"]) — this
# reports the number of distinct values per column (closer to nunique()).
df.select([countDistinct(c).alias(c) for c in df.columns]).show()
Value Counts
Pandas:
# Absolute frequency of each target value.
df["y"].value_counts()
# Relative frequency (share of rows) of each marital status.
df["marital"].value_counts(normalize=True)
PySpark:
# Absolute frequency of each target value (value_counts() equivalent).
df.groupBy("y").count().show()
# Relative frequency: per-group count divided by the total row count
# (value_counts(normalize=True) equivalent; df.count() triggers a Spark job).
df.groupBy("marital").count().withColumn("normalize", col("count") / df.count()).show()
Sorting
Pandas:
# Five longest calls.
df.sort_values(by="duration", ascending=False).head()
# Youngest clients first; ties broken by longest call duration.
df.sort_values(by=["age", "duration"], ascending=[True, False]).head()
PySpark:
# Five longest calls (orderBy is the Spark analogue of sort_values).
df.orderBy(col("duration").desc()).show(5)
# Youngest clients first; ties broken by longest call duration.
df.orderBy(col("age").asc(), col("duration").desc()).show(5)
Applying Functions
Pandas:
# Column-wise maximum: np.max is applied to each column in turn.
df.apply(np.max)
# Recode the target from "no"/"yes" strings to 0/1 integers.
d = {"no": 0, "yes": 1}
df["y"] = df["y"].map(d)
PySpark:
# PySpark has no DataFrame.apply; take the max of every column via select().
# (With the wildcard functions import, `max` here is Spark's column max.)
df.select([max(c).alias(c) for c in df.columns]).show()
# Recode the target to 0/1 — equivalent of the pandas map above.
df = df.withColumn("y", when(col("y") == "yes", 1).otherwise(0))
Further Analysis
Pandas:
# Share of clients who subscribed (y == 1), formatted as a percentage.
print("Share of attracted clients =", '{:.1%}'.format(df["y"].mean()))
# Column means restricted to attracted clients.
df[df["y"] == 1].mean()
# Average call duration (seconds) for attracted clients.
acd = round(df[df["y"] == 1]["duration"].mean(), 2)
# NOTE(review): acd is a float, so acd // 60 is also a float (e.g. 9.0) and
# prints with a trailing ".0"; int(acd) // 60 would give a clean minute count.
acd_in_min = acd // 60
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")
# Average age of clients who are both attracted AND single.
print("Average age of attracted clients =", int(df[(df["y"] == 1) & (df["marital"] == "single")]["age"].mean()), "years")
# Last row of the DataFrame.
df[-1:]
PySpark:
# Overall share of attracted clients (mean of the 0/1 target column).
df.groupBy().agg(mean("y")).show()
# Column means restricted to attracted clients.
df.filter(df["y"] == 1).agg(*[mean(c).alias(c) for c in df.columns]).show()
# Average call duration (seconds) for attracted clients; round/mean are the
# Spark column functions from the wildcard import.
acd = df.filter(df["y"] == 1).agg(round(mean("duration"), 2)).collect()[0][0]
# int(...) avoids the float minute count ("9.0 min") the original printed.
acd_in_min = int(acd // 60)
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")
avg_age = df.filter((df["y"] == 1) & (df["marital"] == "single")).agg(mean("age")).collect()[0][0]
print("Average age of attracted clients =", int(avg_age), "years")
# Pandas' df[-1:] shows the LAST row; Spark DataFrames are unordered, so the
# closest analogue is tail(1) (Spark >= 3.0). The original
# orderBy(desc("age")).limit(1) returned the oldest client — not equivalent.
print(df.tail(1))
Crosstab and Pivot Table
Pandas:
# Contingency table of target vs. marital status.
pd.crosstab(df["y"], df["marital"])
# Same table, normalized so each row sums to 1.
pd.crosstab(df["y"], df["marital"], normalize='index')
# Pivot table: mean age and duration per job category.
df.pivot_table(["age", "duration"], ["job"], aggfunc="mean").head(10)
PySpark:
# Contingency table of target vs. marital status (pd.crosstab equivalent).
df.groupBy("y", "marital").count().show()
# Row-normalized crosstab: each count divided by its y-group total.
# Requires `from pyspark.sql.window import Window`; with the wildcard
# functions import, `sum` here is Spark's column sum, not the built-in.
df.groupBy("y", "marital").count().withColumn("normalize", col("count") / sum("count").over(Window.partitionBy("y"))).show()
# Pivot-table equivalent: mean age and duration per job category.
df.groupBy("job").agg(mean("age"), mean("duration")).show()
Plots
Pandas:
# Scatter matrix with KDE on the diagonal for three numeric columns.
pd.plotting.scatter_matrix(df[["age", "duration", "campaign"]], figsize=(15, 15), diagonal="kde")
plt.show()
# Histogram of client ages.
df["age"].hist()
# Histograms of every numeric column.
df.hist(color="k", bins=30, figsize=(15, 10))
plt.show()
# Age distribution per marital status.
df.boxplot(column="age", by="marital")
plt.show()
# Age distribution per (marital, housing) pair.
df.boxplot(column="age", by=["marital", "housing"], figsize=(20, 20))
plt.show()
PySpark:
# Spark has no native plotting: select the needed columns, collect them to
# the driver with toPandas(), then plot with pandas/matplotlib as usual.
# NOTE(review): toPandas() pulls the data onto the driver — fine for this
# dataset, but select only the required columns first on larger ones.
pandas_df = df.select("age", "duration", "campaign").toPandas()
pd.plotting.scatter_matrix(pandas_df, figsize=(15, 15), diagonal="kde")
plt.show()
df.select("age").toPandas().hist()
df.toPandas().hist(color="k", bins=30, figsize=(15, 10))
plt.show()
df.select("age", "marital").toPandas().boxplot(by="marital")
plt.show()
df.select("age", "marital", "housing").toPandas().boxplot(by=["marital", "housing"], figsize=(20, 20))
plt.show()
Combining Code
Here is the combined code:
# Combined PySpark EDA script for the banking dataset.
from pyspark.sql import SparkSession
# Wildcard import intentionally shadows max/sum/round with Spark column funcs.
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Initialize (or reuse) a local Spark session.
spark = SparkSession.builder.appName("EDA_PySpark_Banking").getOrCreate()

# Load the data; the source CSV is ';'-separated.
df = spark.read.csv('bank-additional/bank-additional-full.csv', sep=';', header=True, inferSchema=True)

# --- Initial exploration -------------------------------------------------
df.printSchema()
df.describe().show()

# Target and marital-status distributions, absolute and as a share of rows.
df.groupBy("y").count().show()
df.groupBy("marital").count().withColumn("normalize", col("count") / df.count()).show()

# Longest calls, then youngest clients with ties broken by call duration.
df.orderBy(col("duration").desc()).show(5)
df.orderBy(col("age").asc(), col("duration").desc()).show(5)

# Per-column maximum (strings compare lexicographically).
df.select([max(c).alias(c) for c in df.columns]).show()

# Encode the target as 0/1 so it can be averaged.
df = df.withColumn("y", when(col("y") == "yes", 1).otherwise(0))

# --- Further analysis ----------------------------------------------------
df.groupBy().agg(mean("y")).show()
df.filter(df["y"] == 1).agg(*[mean(c).alias(c) for c in df.columns]).show()

# Average call duration (seconds) for attracted clients.
acd = df.filter(df["y"] == 1).agg(round(mean("duration"), 2)).collect()[0][0]
acd_in_min = int(acd // 60)  # int(...) avoids printing "9.0 min"
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")

avg_age = df.filter((df["y"] == 1) & (df["marital"] == "single")).agg(mean("age")).collect()[0][0]
print("Average age of attracted clients =", int(avg_age), "years")

# Closest analogue of pandas' df[-1:] — the last row (Spark >= 3.0).
# (orderBy(desc("age")).limit(1) would return the oldest client instead.)
print(df.tail(1))

# --- Crosstab / pivot ----------------------------------------------------
df.groupBy("y", "marital").count().show()
df.groupBy("y", "marital").count().withColumn("normalize", col("count") / sum("count").over(Window.partitionBy("y"))).show()
df.groupBy("job").agg(mean("age"), mean("duration")).show()

# --- Plots (collect to the driver with toPandas() before plotting) -------
pandas_df = df.select("age", "duration", "campaign").toPandas()
pd.plotting.scatter_matrix(pandas_df, figsize=(15, 15), diagonal="kde")
plt.show()
df.select("age").toPandas().hist()
df.toPandas().hist(color="k", bins=30, figsize=(15, 10))
plt.show()
df.select("age", "marital").toPandas().boxplot(by="marital")
plt.show()
df.select("age", "marital", "housing").toPandas().boxplot(by=["marital", "housing"], figsize=(20, 20))
plt.show()
Discover more from AI HintsToday
Subscribe to get the latest posts sent to your email.