While searching for a free Pandas project on Google, I found this link: Exploratory Data Analysis (EDA) with Pandas in Banking. I have tried to convert that Pandas script into a PySpark one.

First, let’s handle the initial steps of downloading and extracting the data:
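A minimal sketch of that step, assuming the standard UCI repository URL for the bank-additional archive (adjust the URL and paths if your source differs):

import urllib.request
import zipfile

# Download the bank-additional archive from the UCI repository (URL assumed)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
urllib.request.urlretrieve(url, "bank-additional.zip")

# Extract it next to the notebook so the paths used below resolve
with zipfile.ZipFile("bank-additional.zip") as zf:
    zf.extractall(".")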

In a PySpark context, we’ll assume the data is already available locally after the above steps.

Importing Libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline
plt.rcParams["figure.figsize"] = (8, 6)

import warnings
warnings.filterwarnings('ignore')

Equivalent imports in PySpark:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import numpy as np

# Initialize Spark session
spark = SparkSession.builder.appName("EDA_PySpark_Banking").getOrCreate()

Loading Data

Pandas:

df = pd.read_csv('bank-additional/bank-additional-full.csv', sep=';')
df.head(5)

PySpark:

df = spark.read.csv('bank-additional/bank-additional-full.csv', sep=';', header=True, inferSchema=True)
df.show(5)
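If the file is large, inferSchema=True costs an extra pass over the data. A sketch of the alternative: read everything as strings and cast only the columns you need (column names taken from the calls used later in this post):

df_raw = spark.read.csv('bank-additional/bank-additional-full.csv', sep=';', header=True)
# Cast the numeric columns used below; the remaining columns stay as strings
df = df_raw.withColumn("age", col("age").cast("int")) \
           .withColumn("duration", col("duration").cast("int")) \
           .withColumn("campaign", col("campaign").cast("int"))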

Initial Exploration

Columns

Pandas:

df.columns

PySpark:

df.columns

Info

Pandas:

df.info()

PySpark:

df.printSchema()
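printSchema() covers the dtype part of info(); the row and column counts need a separate step, for example:

# Row and column counts, mirroring the shape summary that info() prints
print((df.count(), len(df.columns)))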

Describe

Pandas:

df.describe()
df.describe(include=["object"])

PySpark:

df.describe().show()
# Closest analogue of describe(include=["object"]): count the distinct values per column
df.select([countDistinct(c).alias(c) for c in df.columns]).show()
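Spark also offers summary(), which adds quartiles on top of the describe() output; a quick sketch:

# count, mean, stddev, min, 25%, 50%, 75%, max for every column
df.summary().show()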

Value Counts

Pandas:

df["y"].value_counts()
df["marital"].value_counts(normalize=True)

PySpark:

df.groupBy("y").count().show()
df.groupBy("marital").count().withColumn("normalize", col("count") / df.count()).show()

Sorting

Pandas:

df.sort_values(by="duration", ascending=False).head()
df.sort_values(by=["age", "duration"], ascending=[True, False]).head()

PySpark:

df.orderBy(col("duration").desc()).show(5)
df.orderBy(col("age").asc(), col("duration").desc()).show(5)

Applying Functions

Pandas:

df.apply(np.max)
d = {"no": 0, "yes": 1}
df["y"] = df["y"].map(d)

PySpark:

# PySpark does not have an apply method; use select with max for each column
df.select([max(c).alias(c) for c in df.columns]).show()

# Mapping values
df = df.withColumn("y", when(col("y") == "yes", 1).otherwise(0))

Further Analysis

Pandas:

print("Share of attracted clients =", '{:.1%}'.format(df["y"].mean()))
df[df["y"] == 1].mean()
acd = round(df[df["y"] == 1]["duration"].mean(), 2)
acd_in_min = acd // 60
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")
print("Average age of attracted clients =", int(df[(df["y"] == 1) & (df["marital"] == "single")]["age"].mean()), "years")
df[-1:]

PySpark:

df.groupBy().agg(mean("y")).show()
df.filter(df["y"] == 1).agg(*[mean(c).alias(c) for c in df.columns]).show()
acd = df.filter(df["y"] == 1).agg(round(mean("duration"), 2)).collect()[0][0]
acd_in_min = acd // 60
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")
avg_age = df.filter((df["y"] == 1) & (df["marital"] == "single")).agg(mean("age")).collect()[0][0]
print("Average age of attracted clients =", int(avg_age), "years")
df.orderBy(desc("age")).limit(1).show()

Crosstab and Pivot Table

Pandas:

pd.crosstab(df["y"], df["marital"])
pd.crosstab(df["y"], df["marital"], normalize='index')
df.pivot_table(["age", "duration"], ["job"], aggfunc="mean").head(10)

PySpark:

df.groupBy("y", "marital").count().show()
df.groupBy("y", "marital").count().withColumn("normalize", col("count") / sum("count").over(Window.partitionBy("y"))).show()
df.groupBy("job").agg(mean("age"), mean("duration")).show()

Plots

Pandas:

pd.plotting.scatter_matrix(df[["age", "duration", "campaign"]], figsize=(15, 15), diagonal="kde")
plt.show()

df["age"].hist()
df.hist(color="k", bins=30, figsize=(15, 10))
plt.show()

df.boxplot(column="age", by="marital")
plt.show()

df.boxplot(column="age", by=["marital", "housing"], figsize=(20, 20))
plt.show()

PySpark:

# PySpark has no built-in plotting; convert the needed columns to pandas
# and plot with matplotlib
pandas_df = df.select("age", "duration", "campaign").toPandas()
pd.plotting.scatter_matrix(pandas_df, figsize=(15, 15), diagonal="kde")
plt.show()

df.select("age").toPandas().hist()
df.toPandas().hist(color="k", bins=30, figsize=(15, 10))
plt.show()

df.select("age", "marital").toPandas().boxplot(by="marital")
plt.show()

df.select("age", "marital", "housing").toPandas().boxplot(by=["marital", "housing"], figsize=(20, 20))
plt.show()
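Note that toPandas() collects the whole table to the driver; for larger datasets, sample first. A sketch:

# Plot a 10% sample instead of collecting every row
sample_pdf = df.select("age", "duration").sample(fraction=0.1, seed=42).toPandas()
sample_pdf.hist(bins=30, figsize=(10, 5))
plt.show()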

Combining Code

Here is the combined code:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Initialize Spark session
spark = SparkSession.builder.appName("EDA_PySpark_Banking").getOrCreate()

# Load the data
df = spark.read.csv('bank-additional/bank-additional-full.csv', sep=';', header=True, inferSchema=True)

# Initial exploration
df.printSchema()
df.describe().show()

df.groupBy("y").count().show()
df.groupBy("marital").count().withColumn("normalize", col("count") / df.count()).show()

df.orderBy(col("duration").desc()).show(5)
df.orderBy(col("age").asc(), col("duration").desc()).show(5)

df.select([max(c).alias(c) for c in df.columns]).show()
df = df.withColumn("y", when(col("y") == "yes", 1).otherwise(0))

df.groupBy().agg(mean("y")).show()
df.filter(df["y"] == 1).agg(*[mean(c).alias(c) for c in df.columns]).show()

acd = df.filter(df["y"] == 1).agg(round(mean("duration"), 2)).collect()[0][0]
acd_in_min = acd // 60
print("Average call duration for attracted clients =", acd_in_min, "min", int(acd) % 60, "sec")

avg_age = df.filter((df["y"] == 1) & (df["marital"] == "single")).agg(mean("age")).collect()[0][0]
print("Average age of attracted clients =", int(avg_age), "years")
df.orderBy(desc("age")).limit(1).show()

df.groupBy("y", "marital").count().show()
df.groupBy("y", "marital").count().withColumn("normalize", col("count") / sum("count").over(Window.partitionBy("y"))).show()
df.groupBy("job").agg(mean("age"), mean("duration")).show()

# Plots
pandas_df = df.select("age", "duration", "campaign").toPandas()
pd.plotting.scatter_matrix(pandas_df, figsize=(15, 15), diagonal="kde")
plt.show()

df.select("age").toPandas().hist()
df.toPandas().hist(color="k", bins=30, figsize=(15, 10))
plt.show()

df.select("age", "marital").toPandas().boxplot(by="marital")
plt.show()

df.select("age", "marital", "housing").toPandas().boxplot(by=["marital", "housing"], figsize=(20, 20))
plt.show()
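When the analysis is finished, release the cluster resources, for example:

# Stop the Spark session once the EDA is done
spark.stop()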
