| Item | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Read CSV | spark.read.csv("file.csv") | SELECT * FROM csv.\`file.csv\` | pd.read_csv("file.csv") |
| Read JSON | spark.read.json("file.json") | SELECT * FROM json.\`file.json\` | pd.read_json("file.json") |
| Read Parquet | spark.read.parquet("file.parquet") | SELECT * FROM parquet.\`file.parquet\` | pd.read_parquet("file.parquet") |
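A minimal runnable sketch of the read paths side by side, assuming a local SparkSession and a small people.csv file (the file name and columns are hypothetical):

```python
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.appName("read-demo").getOrCreate()

# PySpark reader API; header/inferSchema are optional but usually wanted for CSV.
df_spark = spark.read.csv("people.csv", header=True, inferSchema=True)

# Spark SQL can read a file directly by qualifying the format before the path.
df_sql = spark.sql("SELECT * FROM csv.`people.csv`")

# Pandas reads the same file into an in-memory DataFrame.
df_pd = pd.read_csv("people.csv")

df_spark.show()
print(df_pd.head())
```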
| Data Creation | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| From List | spark.createDataFrame([(1, 'A'), (2, 'B')], ["col1", "col2"]) | INSERT INTO table VALUES (1, 'A'), (2, 'B') | pd.DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) |
| From Dictionary | spark.createDataFrame([Row(**dict1)]) | N/A | pd.DataFrame.from_dict(dict1) |
| UDF | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Define UDF | from pyspark.sql.functions import udf; func_udf = udf(lambda x: x + 1) | CREATE FUNCTION custom_func AS 'com.example.CustomFunction' | def func(x): return x + 1 |
| Use UDF | df.withColumn("new_col", func_udf(df["col"])) | SELECT custom_func(col) AS new_col FROM table | df["new_col"] = df["col"].apply(func) |
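A self-contained sketch of the PySpark row (the column name `value` is an assumption). Declaring the return type is usually worthwhile, otherwise the UDF result defaults to string:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["value"])

def add_one(x):
    return x + 1

# Explicit return type avoids the default StringType result.
add_one_udf = udf(add_one, IntegerType())

df.withColumn("value_plus_one", add_one_udf(col("value"))).show()
```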
| Table Creation | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Create DataFrame | spark.createDataFrame(data) | CREATE TABLE table AS SELECT * FROM data | pd.DataFrame(data) |
| Create Temporary View | df.createOrReplaceTempView("table") | CREATE TEMPORARY VIEW table AS SELECT * FROM data | N/A |
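A short sketch (column names assumed) of how the PySpark and Spark SQL columns connect: register the DataFrame as a temporary view, then query it with spark.sql:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "label"])

# A temporary view lives only for this SparkSession.
df.createOrReplaceTempView("demo")

spark.sql("SELECT id, label FROM demo WHERE id > 1").show()
```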
| Display | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Display First N Rows | df.show(n) | SELECT * FROM table LIMIT n | df.head(n) |
| Show All Rows | df.show(df.count(), truncate=False) | SELECT * FROM table | df |
| Show Specific Columns | df.select("col1", "col2").show() | SELECT col1, col2 FROM table LIMIT n | df[['col1', 'col2']].head(n) |
| Show with Truncation | df.show(n, truncate=True) | SELECT * FROM table LIMIT n | N/A |
| Show with Custom Width | df.show(n, truncate=<width>) | N/A | N/A |
| Vertical Show | df.show(n, vertical=True) | N/A | df.head(n).T |
| Display Summary Stats | df.describe().show() | DESCRIBE table | df.describe() |
| Show Distinct Rows | df.distinct().show() | SELECT DISTINCT * FROM table | df.drop_duplicates() |
| Display Schema | df.printSchema() | DESCRIBE TABLE table | df.info() |
| Show Unique Values | df.select("col").distinct().show() | SELECT DISTINCT col FROM table | df['col'].unique() |
| Count Rows | df.count() | SELECT COUNT(*) FROM table | len(df) or df.shape[0] |
| Check for Nulls | df.filter(df.col.isNull()).show() | SELECT * FROM table WHERE col IS NULL | df[df['col'].isna()] |
| Column Operation | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Add New Column | df.withColumn("new_col", df.col * 2) | SELECT *, col * 2 AS new_col FROM table | df['new_col'] = df['col'] * 2 |
| Drop Column | df.drop("col") | SELECT * EXCEPT (col) FROM table | df.drop("col", axis=1) |
| Column Manipulations | df.withColumn("col", expr), df.drop("col") | ALTER TABLE ... DROP COLUMN, or create a new table with the selected columns | df['new_col'] = expr, df.drop(columns=["col"]) |
| Table Alteration | df.withColumnRenamed(...), df.drop(...), then save as a new table | ALTER TABLE table ADD/DROP COLUMN ... | df.rename(columns={"old": "new"}) |
| Rename Column | df.withColumnRenamed("old", "new") | SELECT col AS new FROM table | df.rename(columns={"old": "new"}) |
| Rename All Columns | df_new = df.toDF(*[c + "_v1" for c in df.columns]) | N/A | df.add_suffix("_v1") |
| Transformation | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Select Columns | df.select("col1", "col2") | SELECT col1, col2 FROM table | df[["col1", "col2"]] |
| Filter Rows | df.filter(df["col"] > 5) | SELECT * FROM table WHERE col > 5 | df[df["col"] > 5] |
| Group By | df.groupBy("col").count() | SELECT col, COUNT(*) FROM table GROUP BY col | df.groupby("col").count() |
| Group By Multiple Columns | df.groupBy(["col1", "col2"]) | SELECT col1, col2, COUNT(*) FROM table GROUP BY col1, col2 | df.groupby(["col1", "col2"]) |
| Count | df.groupBy("col").count() | SELECT col, COUNT(*) FROM table GROUP BY col | df.groupby("col").count() |
| Sum | df.groupBy("col").sum() | SELECT col, SUM(value) FROM table GROUP BY col | df.groupby("col")["value"].sum() |
| Average | df.groupBy("col").avg() | SELECT col, AVG(value) FROM table GROUP BY col | df.groupby("col")["value"].mean() |
| Max | df.groupBy("col").max() | SELECT col, MAX(value) FROM table GROUP BY col | df.groupby("col")["value"].max() |
| Min | df.groupBy("col").min() | SELECT col, MIN(value) FROM table GROUP BY col | df.groupby("col")["value"].min() |
| Group By with Filter (pre-aggregation) | df.filter(df["value"] > 10).groupBy("col").count() | SELECT col, COUNT(*) FROM table WHERE value > 10 GROUP BY col | df[df["value"] > 10].groupby("col").size() |
| Group By with Having | df.groupBy("col").count().filter(col("count") > 10) | SELECT col, COUNT(*) FROM table GROUP BY col HAVING COUNT(*) > 10 | df.groupby("col").filter(lambda g: len(g) > 10) |
| Group By with Rollup | df.rollup("col").count() | SELECT col, COUNT(*) FROM table GROUP BY ROLLUP(col) | N/A |
| Sort Ascending | df.sort("col") or df.orderBy("col") or df.sort(col("col").asc()) | SELECT * FROM table ORDER BY col ASC | df.sort_values("col", ascending=True) |
| Sort Descending | df.sort(col("col").desc()) or df.orderBy("col", ascending=False) | SELECT * FROM table ORDER BY col DESC | df.sort_values("col", ascending=False) |
| Sort Multiple Columns | df.sort("col1", "col2") or df.orderBy("col1", "col2") | SELECT * FROM table ORDER BY col1, col2 | df.sort_values(["col1", "col2"]) |
| Sort Multiple Columns (Asc/Desc) | df.sort(col("col1").asc(), col("col2").desc()) or df.orderBy(["col1", "col2"], ascending=[True, False]) | SELECT * FROM table ORDER BY col1 ASC, col2 DESC | df.sort_values(by=["col1", "col2"], ascending=[True, False]) |
| Sort with Expressions | df.sort((col("col1") + col("col2")).desc()) or df.orderBy((df["col1"] * 2).asc()) | SELECT * FROM table ORDER BY (col1 + col2) DESC | df.loc[(df["col1"] + df["col2"]).sort_values(ascending=False).index] |
| Sort with Nulls First | df.sort(col("col").asc_nulls_first()) or df.orderBy(col("col").desc_nulls_first()) | SELECT * FROM table ORDER BY col ASC NULLS FIRST | df.sort_values("col", ascending=True, na_position='first') |
| Sort with Nulls Last | df.sort(col("col").asc_nulls_last()) or df.orderBy(col("col").desc_nulls_last()) | SELECT * FROM table ORDER BY col ASC NULLS LAST | df.sort_values("col", ascending=True, na_position='last') |
| Sort In Place | N/A | N/A | df.sort_values("col", inplace=True) |
| Sort by Index | N/A | N/A | df.sort_index(ascending=True) or df.sort_index(ascending=False) |
| Sort with Multiple Sort Keys | df.sort("col1", col("col2").desc()) or df.orderBy("col1", col("col2").desc()) | SELECT * FROM table ORDER BY col1 ASC, col2 DESC | df.sort_values(["col1", "col2"], ascending=[True, False]) |
| Sort Using SQL Query | N/A | SELECT * FROM table ORDER BY col | N/A |
| Sort with Custom Order | N/A | N/A | df["col"] = pd.Categorical(df["col"], categories=["A", "B", "C"], ordered=True); df.sort_values("col") |
| Sort with Mixed Data Types | df.sort("col1", "col2") or df.orderBy("col1", "col2") (e.g. col1 integer, col2 string) | SELECT * FROM table ORDER BY col1, col2 | df.sort_values(["col1", "col2"]) |
| Sort with Functions | df.sort(lower(col("col1"))) or df.orderBy(abs(col("col2")).desc()) | SELECT * FROM table ORDER BY LOWER(col1) ASC | df.sort_values("col1", key=lambda s: s.str.lower()) |
| Union | df1.union(df2) (keeps duplicates, like UNION ALL; add .distinct() to match SQL UNION) | SELECT * FROM table1 UNION SELECT * FROM table2 | pd.concat([df1, df2]) |
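Putting the group-by, HAVING-style filter, and sort rows together, a minimal sketch (the dept and salary column names are assumptions):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("IT", 100), ("IT", 200), ("HR", 150), ("HR", 90), ("OPS", 80)],
    ["dept", "salary"],
)

# GROUP BY ... HAVING ... ORDER BY: aggregate first, then filter the
# aggregated result, then sort it.
(
    df.groupBy("dept")
      .agg(count("salary").alias("cnt"), avg("salary").alias("avg_salary"))
      .filter(col("cnt") >= 2)                # HAVING COUNT(*) >= 2
      .orderBy(col("avg_salary").desc())      # ORDER BY avg_salary DESC
      .show()
)
```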
| Join | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Inner Join | df1.join(df2, "col") | SELECT * FROM table1 INNER JOIN table2 ON table1.col = table2.col | pd.merge(df1, df2, on="col") |
| Left Join | df1.join(df2, "col", "left") | SELECT * FROM table1 LEFT JOIN table2 ON table1.col = table2.col | pd.merge(df1, df2, on="col", how="left") |
| Right Join | df1.join(df2, "col", "right") | SELECT * FROM table1 RIGHT JOIN table2 ON table1.col = table2.col | pd.merge(df1, df2, on="col", how="right") |
| Full Outer Join | df1.join(df2, "col", "outer") | SELECT * FROM table1 FULL OUTER JOIN table2 ON table1.col = table2.col | pd.merge(df1, df2, on="col", how="outer") |
| Left Semi Join | df1.join(df2, "col", "left_semi") | SELECT * FROM table1 LEFT SEMI JOIN table2 ON table1.col = table2.col | df1[df1["col"].isin(df2["col"])] |
| Left Anti Join | df1.join(df2, "col", "left_anti") | SELECT * FROM table1 LEFT ANTI JOIN table2 ON table1.col = table2.col | df1[~df1["col"].isin(df2["col"])] |
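A sketch of a few join variants on a shared key column (the DataFrames and column names are assumptions):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "Alice"), (2, "Bob"), (3, "Cara")], ["id", "name"])
right = spark.createDataFrame([(1, "NY"), (2, "LA")], ["id", "city"])

left.join(right, "id").show()               # inner join (default)
left.join(right, "id", "left").show()       # keep all left rows
left.join(right, "id", "left_anti").show()  # left rows with no match on the right
```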
| Transpose / Pivot | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Transpose | N/A (for small data, df.toPandas().T or the pandas API on Spark) | N/A | df.T |
| Pivot | df.groupBy("col").pivot("col2").sum("value") | SELECT * FROM table PIVOT (SUM(value) FOR col2 IN (...)) | df.pivot_table(values="value", index="col", columns="col2", aggfunc="sum") |
| Window Function | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Row Number | df.withColumn("row_num", row_number().over(Window.orderBy("col"))) | SELECT *, ROW_NUMBER() OVER (ORDER BY col) AS row_num FROM table | df['row_num'] = df['col'].rank(method='first').astype(int) |
| Rank | df.withColumn("rank", rank().over(Window.orderBy("col"))) | SELECT *, RANK() OVER (ORDER BY col) AS rank FROM table | df['rank'] = df['col'].rank(method='min') |
| Dense Rank | df.withColumn("dense_rank", dense_rank().over(Window.orderBy("col"))) | SELECT *, DENSE_RANK() OVER (ORDER BY col) AS dense_rank FROM table | df['dense_rank'] = df['col'].rank(method='dense') |
| Percent Rank | df.withColumn("percent_rank", percent_rank().over(Window.orderBy("col"))) | SELECT *, PERCENT_RANK() OVER (ORDER BY col) AS percent_rank FROM table | df['col'].rank(pct=True) (close to, but not identical to, SQL PERCENT_RANK) |
| NTile | df.withColumn("ntile", ntile(n).over(Window.orderBy("col"))) | SELECT *, NTILE(n) OVER (ORDER BY col) AS ntile FROM table | pd.qcut(df['col'], n, labels=False) + 1 (approximate) |
| Lag | df.withColumn("lag_val", lag("col", n).over(Window.orderBy("col"))) | SELECT *, LAG(col, n) OVER (ORDER BY col) AS lag_val FROM table | df['lag_val'] = df['col'].shift(n) |
| Lead | df.withColumn("lead_val", lead("col", n).over(Window.orderBy("col"))) | SELECT *, LEAD(col, n) OVER (ORDER BY col) AS lead_val FROM table | df['lead_val'] = df['col'].shift(-n) |
| Cumulative Sum | df.withColumn("cum_sum", sum("col").over(Window.orderBy("col").rowsBetween(Window.unboundedPreceding, Window.currentRow))) | SELECT *, SUM(col) OVER (ORDER BY col ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_sum FROM table | df['cum_sum'] = df['col'].cumsum() |
| Moving Average | df.withColumn("moving_avg", avg("col").over(Window.orderBy("col").rowsBetween(-n, 0))) | SELECT *, AVG(col) OVER (ORDER BY col ROWS BETWEEN n PRECEDING AND CURRENT ROW) AS moving_avg FROM table | df['moving_avg'] = df['col'].rolling(window=n).mean() |
| First Value | df.withColumn("first_val", first("col").over(Window.orderBy("col"))) | SELECT *, FIRST_VALUE(col) OVER (ORDER BY col) AS first_val FROM table | df['first_val'] = df['col'].iloc[0] |
| Last Value | df.withColumn("last_val", last("col").over(Window.orderBy("col"))) | SELECT *, LAST_VALUE(col) OVER (ORDER BY col) AS last_val FROM table | df['last_val'] = df['col'].iloc[-1] |
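A compact example combining a few of the window rows above (the grp/value column names and the partitioning are assumptions):

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number, lag, sum as sum_, col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("A", 10), ("A", 30), ("A", 20), ("B", 5), ("B", 15)],
    ["grp", "value"],
)

w = Window.partitionBy("grp").orderBy("value")

(
    df.withColumn("row_num", row_number().over(w))
      .withColumn("prev_value", lag("value", 1).over(w))
      .withColumn(
          "running_total",
          sum_("value").over(w.rowsBetween(Window.unboundedPreceding, Window.currentRow)),
      )
      .show()
)
```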
| String Manipulation | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Overview | F.concat, F.upper, F.lower, F.substring, F.regexp_replace | CONCAT(col1, 'text'), UPPER(col2), LOWER(col3), SUBSTRING(...) | df['col'].str.upper(), .str.lower(), .str.contains('text') |
| Concatenate | df.withColumn("new_col", concat(col("col1"), col("col2"))) | SELECT CONCAT(col1, col2) AS new_col FROM table | df['new_col'] = df['col1'] + df['col2'] |
| Length | df.withColumn("length", length(col("col"))) | SELECT LENGTH(col) AS length FROM table | df['length'] = df['col'].str.len() |
| Lowercase | df.withColumn("lower", lower(col("col"))) | SELECT LOWER(col) AS lower FROM table | df['lower'] = df['col'].str.lower() |
| Uppercase | df.withColumn("upper", upper(col("col"))) | SELECT UPPER(col) AS upper FROM table | df['upper'] = df['col'].str.upper() |
| Trim | df.withColumn("trim", trim(col("col"))) | SELECT TRIM(col) AS trim FROM table | df['trim'] = df['col'].str.strip() |
| Split | df.withColumn("split", split(col("col"), ",")) | SELECT SPLIT(col, ',') AS split FROM table | df['split'] = df['col'].str.split(',') |
| Replace | df.withColumn("replace", regexp_replace(col("col"), "old", "new")) | SELECT REPLACE(col, 'old', 'new') AS replace FROM table | df['replace'] = df['col'].str.replace('old', 'new', regex=False) |
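A quick sketch chaining several of the string functions above on an assumed `name` column:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, upper, trim, split, regexp_replace

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  alice smith ",), ("bob-jones",)], ["name"])

(
    df.withColumn("clean", trim(col("name")))                       # strip whitespace
      .withColumn("upper", upper(col("clean")))                     # uppercase copy
      .withColumn("no_dash", regexp_replace(col("clean"), "-", " "))  # replace dashes
      .withColumn("tokens", split(col("no_dash"), " "))             # split into an array
      .show(truncate=False)
)
```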
| Date Functions | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Overview | F.date_format, F.to_date, F.datediff, F.add_months | DATE_FORMAT(col, 'format'), DATE_ADD(col, days) | pd.to_datetime(df['col']) |
| Current Date | df.withColumn("current_date", current_date()) | SELECT CURRENT_DATE AS current_date FROM table | df['current_date'] = pd.Timestamp.today() |
| Current Timestamp | df.withColumn("current_timestamp", current_timestamp()) | SELECT CURRENT_TIMESTAMP AS current_timestamp FROM table | df['current_timestamp'] = pd.Timestamp.now() |
| Date Format | df.withColumn("date_format", date_format(col("date"), "yyyy-MM-dd")) | SELECT DATE_FORMAT(date, 'yyyy-MM-dd') AS date_format FROM table | df['date_format'] = df['date'].dt.strftime('%Y-%m-%d') |
| Day of Month | df.withColumn("day_of_month", dayofmonth(col("date"))) | SELECT DAYOFMONTH(date) AS day_of_month FROM table | df['day_of_month'] = df['date'].dt.day |
| Day of Week | df.withColumn("day_of_week", dayofweek(col("date"))) | SELECT DAYOFWEEK(date) AS day_of_week FROM table | df['day_of_week'] = df['date'].dt.dayofweek (note: pandas counts Monday=0, Spark counts Sunday=1) |
| Month | df.withColumn("month", month(col("date"))) | SELECT MONTH(date) AS month FROM table | df['month'] = df['date'].dt.month |
| Year | df.withColumn("year", year(col("date"))) | SELECT YEAR(date) AS year FROM table | df['year'] = df['date'].dt.year |
| Add Days | df.withColumn("add_days", date_add(col("date"), 5)) | SELECT DATE_ADD(date, 5) AS add_days FROM table | df['add_days'] = df['date'] + pd.Timedelta(days=5) |
| Add Months | df.withColumn("add_months", add_months(col("date"), 5)) | SELECT ADD_MONTHS(date, 5) AS add_months FROM table | df['add_months'] = df['date'] + pd.DateOffset(months=5) |
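A sketch of the date rows, assuming a string order_date column that is first cast with to_date:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_date, year, month, dayofmonth, date_add, add_months
)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2024-01-15",), ("2024-06-30",)], ["order_date"])

(
    df.withColumn("order_date", to_date(col("order_date"), "yyyy-MM-dd"))
      .withColumn("year", year("order_date"))
      .withColumn("month", month("order_date"))
      .withColumn("day", dayofmonth("order_date"))
      .withColumn("plus_5_days", date_add(col("order_date"), 5))
      .withColumn("plus_2_months", add_months(col("order_date"), 2))
      .show()
)
```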
| Control Statements | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Overview | Use F.when / .otherwise for conditional columns | Use CASE WHEN ... THEN for conditional logic | np.where(condition, value_if_true, value_if_false) |
| Conditional (IF/CASE) | df.withColumn("col", F.when(df.col > 1, "A").otherwise("B")) | CASE WHEN col > 1 THEN 'A' ELSE 'B' END | np.where(df['col'] > 1, "A", "B") |
| For Loop | for row in df.collect(): print(row) | N/A | for index, row in df.iterrows(): print(row) |
| Regex | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Contains | df.filter(df["col"].rlike("pattern")) | SELECT * FROM table WHERE col RLIKE 'pattern' | df[df["col"].str.contains("pattern")] |
| Replace | from pyspark.sql.functions import regexp_replace; df.withColumn("col", regexp_replace("col", "pattern", "replace")) | REGEXP_REPLACE(col, 'pattern', 'replace') | df["col"].str.replace("pattern", "replace", regex=True) |
| Handling Missing Data | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Fill NaN | df.fillna({'col': 0}) | SELECT COALESCE(col, 0) FROM table | df.fillna({'col': 0}) |
| Drop NaN | df.na.drop() | SELECT * FROM table WHERE col IS NOT NULL | df.dropna() |
| Optimization | PySpark | Spark SQL | Pandas |
| --- | --- | --- | --- |
| Cache | df.cache() keeps the data in memory for faster reuse | CACHE TABLE table_name | N/A (caching is not directly supported) |
| Repartition | df.repartition(numPartitions, "col") balances data across partitions | N/A in plain SQL; Hive tables can be partitioned | N/A (pandas is in-memory and has no partitions) |
| Serialization | df.write.format("parquet").save("path"); columnar formats like Parquet improve I/O efficiency | N/A; serialization is handled outside SQL | df.to_parquet("path") |
| Partitions | df.repartition(numPartitions, "column") partitions the data by the given column | CREATE TABLE table_name ... PARTITIONED BY (col1 STRING) organizes the data by partition | N/A |
| Bucketing | df.write.bucketBy(numBuckets, "column").saveAsTable("table_name") distributes data into buckets | CREATE TABLE table_name ... CLUSTERED BY (col1) INTO numBuckets BUCKETS | N/A |
| Segmentation | Filter the DataFrame on specific criteria, e.g. df.filter(df.col > 1) | SELECT * FROM table WHERE col > value | df[df["col"] > value] |
| Broadcasting | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", size), or wrap the small side with broadcast(df2) in a join | SELECT /*+ BROADCAST(t2) */ * FROM table1 t1 JOIN table2 t2 ON t1.key = t2.key | N/A |
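A hedged sketch of caching and an explicit broadcast-join hint (the table shapes are assumptions; broadcasting only helps when the right side is genuinely small):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.getOrCreate()

facts = spark.range(0, 1_000_000).withColumnRenamed("id", "key")
dims = spark.createDataFrame([(i, f"dim_{i}") for i in range(100)], ["key", "label"])

# Cache a DataFrame that will be reused several times.
facts.cache()
facts.count()  # first action materializes the cache

# Explicit broadcast hint for the small dimension table.
joined = facts.join(broadcast(dims), "key")
joined.explain()  # the plan should show a BroadcastHashJoin
```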
    
| select | Example Code Snippet | Description / Use Case |
| --- | --- | --- |
| Basic Select | df.select("col1", "col2").show() | Select specific columns |
| Rename Columns | df.select(col("col1").alias("new_col1"), col("col2").alias("new_col2")).show() | Rename columns directly in select |
| Arithmetic Operation | df.select((col("col1") + 10).alias("col1_plus_10"), (col("col2") * 2).alias("col2_times_2")).show() | Perform arithmetic operations and alias them |
| Conditional Logic | df.select(when(col("col1") > 50, "High").otherwise("Low").alias("level")).show() | Apply conditional logic using when and otherwise |
| Combining Columns | df.select(concat(col("col1"), lit("_"), col("col2")).alias("combined_col")).show() | Concatenate multiple columns with literals |
| String Manipulation | df.select(upper(col("name")).alias("name_upper"), lower(col("city")).alias("city_lower")).show() | Use string functions like upper and lower |
| Date Manipulation | df.select(date_add(col("date_col"), 7).alias("next_week")).show() | Manipulate date columns, such as adding days |
| Complex Expressions | df.select(round((col("col1") * col("col2")) / 100, 2).alias("calculated_col")).show() | Chain multiple operations within a single expression |
| Aggregations | df.groupBy("category").agg(sum("value").alias("total_value")).select("category", "total_value").show() | Combine select with agg to create summary statistics |
| Filtered Selection | df.select("col1", "col2").filter(col("col1") > 50).show() | Select specific columns with an additional filter |
| Mathematical Functions | df.select(sin(col("col1")).alias("sin_col1"), log(col("col2")).alias("log_col2")).show() | Apply mathematical functions such as sin and log |
| Array Creation | df.select(array("col1", "col2", "col3").alias("array_col")).show() | Create an array column by combining multiple columns |
| JSON Parsing | df.select(get_json_object(col("json_col"), "$.name").alias("name")).show() | Parse JSON fields from a column using get_json_object |
| Casting Data Types | df.select(col("col1").cast("int").alias("col1_int"), col("col2").cast("string").alias("col2_str")).show() | Cast columns to different data types |
| Complex Aggregation | df.groupBy("category").agg(count(when(col("value") > 50, 1)).alias("count_above_50")).select("category", "count_above_50").show() | Perform conditional aggregation with count |
| Using SQL Functions | df.select(coalesce(col("col1"), col("col2")).alias("first_non_null")).show() | Use SQL functions such as coalesce for null handling |
| Window Functions | df.withColumn("rank", dense_rank().over(Window.partitionBy("category").orderBy(col("col1").desc()))).select("category", "col1", "rank").show() | Apply window functions for rank and row number calculations |
| Array Conditionals | df.select(when(col("col1") > 50, array("col2", "col3")).otherwise(array("col4")).alias("conditional_array")).show() | Use conditional logic to create arrays based on column values |
| Complex Type Creation | df.select(struct(col("col1").alias("field1"), col("col2").alias("field2")).alias("struct_col")).show() | Create complex data types like structs within select |
| Pivot Operation | df.groupBy("category").pivot("pivot_column").agg(sum("value")).select("*").show() | Perform a pivot operation to reshape data based on column values |
| Null Handling | df.select(col("col1").isNull().alias("col1_is_null"), col("col2").isNotNull().alias("col2_is_not_null")).show() | Check for null and non-null values in columns |
| Column Substring | df.select(col("col1").substr(1, 3).alias("col1_substr")).show() | Extract a substring from column values |
| Conditional Replacement | df.select(col("col1").alias("original"), when(col("col1") == "unknown", "NA").otherwise(col("col1")).alias("updated")).show() | Replace values in a column based on conditions |
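The snippets above assume the relevant functions are already imported; a minimal self-contained version of a few of them (column names are assumptions):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, concat, lit, round as round_

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(40, 3, "ny"), (80, 5, "la")], ["col1", "col2", "city"])

df.select(
    (col("col1") + 10).alias("col1_plus_10"),
    when(col("col1") > 50, "High").otherwise("Low").alias("level"),
    concat(col("city"), lit("_"), col("col2").cast("string")).alias("combined"),
    round_(col("col1") * col("col2") / 100, 2).alias("calculated"),
).show()
```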
| withColumn | Example Code Snippet | Description / Use Case |
| --- | --- | --- |
| Add New Column | df.withColumn("new_col", lit(100)).show() | Adds a new column with the constant value 100. |
| Modify Existing Column | df.withColumn("col1", col("col1") * 10).show() | Modifies an existing column by multiplying its values by 10. |
| Conditional Column | df.withColumn("status", when(col("age") > 18, "Adult").otherwise("Minor")).show() | Adds a new status column based on a conditional expression. |
| Column Based on Multiple Conditions | df.withColumn("grade", when(col("score") >= 90, "A").when(col("score") >= 75, "B").otherwise("C")).show() | Creates a grade column with multiple conditions based on score. |
| Date and Time Operations | df.withColumn("year", year(col("date"))).withColumn("month", month(col("date"))).show() | Extracts year and month from a date column into new columns. |
| Concatenate Columns | df.withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name"))).show() | Concatenates two or more string columns with a separator. |
| Cast Column Type | df.withColumn("age_int", col("age").cast("int")).show() | Converts the data type of an existing column to another type. |
| Array Operations | df.withColumn("first_element", col("array_col")[0]).show() | Extracts the first element from an array column. |
| Aggregate within Column | df.withColumn("sum_sales", expr("aggregate(sales_array, 0, (acc, x) -> acc + x)")).show() | Sums the elements of an array column with the aggregate higher-order function. |
| Using SQL Expressions | df.withColumn("discounted_price", expr("price * 0.9")).show() | Uses a SQL expression directly within withColumn. |
| Generate Column Based on Other Columns | df.withColumn("profit", col("revenue") - col("cost")).show() | Creates a new column from arithmetic on existing columns. |
| withColumnRenamed (Basic) | df.withColumnRenamed("old_col", "new_col").show() | Renames a column from old_col to new_col. |
| Multiple withColumnRenamed | df.withColumnRenamed("old_col1", "new_col1").withColumnRenamed("old_col2", "new_col2").show() | Renames multiple columns sequentially. |
| Chained withColumn Operations | df.withColumn("double_age", col("age") * 2).withColumn("half_age", col("age") / 2).show() | Adds multiple new columns by chaining withColumn calls. |
| Apply UDF in withColumn | df.withColumn("squared_age", square_udf(col("age"))).show() | Uses a user-defined function (UDF) to derive a new column from an existing one. |
| Create Column with Random Values | df.withColumn("random_val", rand()).show() | Adds a column of random floats between 0 and 1. |
| withColumn with Window Function | window_spec = Window.partitionBy("group_col").orderBy("date_col"); df.withColumn("rank", rank().over(window_spec)).show() | Adds a ranking column within partitions using a window function. |
| Conditional Replacement | df.withColumn("score", when(col("score") < 0, 0).otherwise(col("score"))).show() | Replaces negative values in the score column with 0. |
| Chain withColumn for a Transformation Pipeline | df.withColumn("new1", col("col1") + 1).withColumn("new2", col("new1") * 2).drop("new1").show() | Chains several transformations and drops the intermediate column. |
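A runnable sketch of a small withColumn pipeline, including the UDF row (square_udf is a hypothetical name, defined here for the example):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Ann", 17), ("Ben", 34)], ["name", "age"])

# Hypothetical UDF used in the table above.
square_udf = udf(lambda x: x * x, IntegerType())

(
    df.withColumn("status", when(col("age") > 18, "Adult").otherwise("Minor"))
      .withColumn("age_squared", square_udf(col("age")))
      .withColumn("source", lit("demo"))
      .withColumnRenamed("name", "full_name")
      .show()
)
```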
| filter | Example Code Snippet | Description / Use Case |
| --- | --- | --- |
| Basic Filter with Condition | df.filter(col("age") > 18).show() | Filters rows where age is greater than 18. |
| Filter with SQL Expression | df.filter("age > 18").show() | Same as above, using a SQL expression string for the condition. |
| Multiple Conditions (AND) | df.filter((col("age") > 18) & (col("city") == "New York")).show() | Filters rows where age is greater than 18 and city is "New York". |
| Multiple Conditions (OR) | df.filter((col("age") < 18) \| (col("city") == "Los Angeles")).show() | Filters rows where age is less than 18 or city is "Los Angeles". |
| IN Condition | df.filter(col("city").isin(["New York", "Los Angeles"])).show() | Filters rows where city is either "New York" or "Los Angeles". |
| NOT IN Condition | df.filter(~col("city").isin(["Chicago", "Houston"])).show() | Filters rows where city is not in the specified list of values. |
| NULL Check | df.filter(col("email").isNull()).show() | Filters rows where the email column is null. |
| NOT NULL Check | df.filter(col("email").isNotNull()).show() | Filters rows where the email column is not null. |
| Filter with LIKE (Pattern Matching) | df.filter(col("name").like("A%")).show() | Filters rows where name starts with "A". |
| Filter with RLIKE (Regex Matching) | df.filter(col("name").rlike("^[A-Z].*")).show() | Filters rows where name starts with an uppercase letter. |
| Using BETWEEN | df.filter(col("age").between(18, 30)).show() | Filters rows where age is between 18 and 30 (inclusive). |
| Filter with Array Contains | df.filter(array_contains(col("hobbies"), "reading")).show() | Filters rows where the hobbies array column contains "reading". |
| Filter by Length of String Column | df.filter(length(col("name")) > 5).show() | Filters rows where the length of name is greater than 5. |
| Filter with Date Condition | df.filter(col("date") > lit("2023-01-01")).show() | Filters rows with a date after January 1, 2023. |
| Filter Based on Calculated Column | df.filter((col("salary") - col("expenses")) > 5000).show() | Filters rows based on the calculated difference between salary and expenses. |
| Substring Filter | df.filter(substring(col("name"), 1, 3) == "Joh").show() | Filters rows where the first 3 characters of name are "Joh". |
| Using StartsWith and EndsWith | df.filter(col("email").startswith("john")).show(); df.filter(col("email").endswith(".com")).show() | Filters rows where email starts with "john" or ends with ".com". |
| Using exists on an Array Column | df.filter(expr("exists(array_col, x -> x > 10)")).show() | Filters rows where at least one element in array_col is greater than 10. |
| Filter with Row Number (Window Function) | window_spec = Window.partitionBy("city").orderBy(col("age").desc()); df.withColumn("rank", row_number().over(window_spec)).filter(col("rank") == 1).show() | Gets the oldest person in each city using a window function. |
| Filter on Nested Fields (Struct) | df.filter(col("address.city") == "San Francisco").show() | Filters rows based on a nested field in a struct-type column, e.g. address.city. |
| Filter Using UDF | df.filter(custom_udf(col("age"))).show() | Filters rows using a custom UDF that returns a Boolean value. |
| Filter with CASE WHEN Logic | df.filter(when(col("score") > 90, "High").otherwise("Low") == "High").show() | Filters rows with a conditional transformation using when. |
| Random Sampling with Filter | df.filter(rand() < 0.1).show() | Filters a random sample of approximately 10% of the data. |
| Filter Rows with Aggregated Condition | df.groupBy("city").count().filter(col("count") > 100).show() | Filters on an aggregate, keeping only cities with more than 100 records. |
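The window-based filter row above in runnable form (the city/age columns are assumptions): rank the rows within each city, then keep only rank 1.

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("NY", "Alice", 30), ("NY", "Bob", 45), ("LA", "Cara", 25)],
    ["city", "name", "age"],
)

window_spec = Window.partitionBy("city").orderBy(col("age").desc())

# Oldest person per city: rank rows inside each city, keep the top-ranked row.
(
    df.withColumn("rank", row_number().over(window_spec))
      .filter(col("rank") == 1)
      .drop("rank")
      .show()
)
```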
| selectExpr | Example Code Snippet | Description / Use Case |
| --- | --- | --- |
| Basic SelectExpr | df.selectExpr("col1", "col2").show() | Select specific columns using selectExpr syntax |
| Arithmetic Operation | df.selectExpr("col1 + 10 as col1_plus_10", "col2 * 2 as col2_times_2").show() | Perform arithmetic operations directly within selectExpr |
| Conditional Logic | df.selectExpr("CASE WHEN col1 > 50 THEN 'High' ELSE 'Low' END AS level").show() | Apply conditional logic with SQL-style CASE WHEN |
| Combining Columns | df.selectExpr("concat(col1, '_', col2) as combined_col").show() | Concatenate columns using SQL syntax |
| String Manipulation | df.selectExpr("upper(name) as name_upper", "lower(city) as city_lower").show() | Use SQL functions to manipulate strings, e.g., upper, lower |
| Date Manipulation | df.selectExpr("date_add(date_col, 7) as next_week").show() | Perform date arithmetic within expressions, here adding 7 days |
| Complex Expressions | df.selectExpr("round(col1 * col2 / 100, 2) as calculated_col").show() | Use complex expressions to perform multiple operations in one column |
| Aggregations | df.groupBy("category").agg(expr("sum(value) as total_value")).show() | Aggregate values with SQL expressions, here summing value |
| Filtering Columns | df.selectExpr("col1", "col2").where("col1 > 50").show() | Filter data in combination with selectExpr |
| Conditional Aggregations | df.groupBy("category").agg(expr("sum(case when value > 50 then 1 else 0 end) as count_above_50")).show() | Count rows based on conditions within a SQL expression |
| Mathematical Functions | df.selectExpr("sin(col1) as sin_col1", "log(col2) as log_col2").show() | Use mathematical functions such as sin, log in expressions |
| Array Creation | df.selectExpr("array(col1, col2, col3) as array_col").show() | Create an array column by combining multiple columns |
| JSON Parsing | df.selectExpr("get_json_object(json_col, '$.name') as name").show() | Parse JSON data stored within a column |
| Casting Types | df.selectExpr("cast(col1 as int) as col1_int", "cast(col2 as string) as col2_str").show() | Change data types of columns within selectExpr |
| Aliasing Expressions | df.selectExpr("col1 + col2 as total_sum", "col2 - col3 as diff").show() | Alias complex expressions directly in selectExpr |
| Using SQL Functions | df.selectExpr("coalesce(col1, col2, col3) as first_non_null").show() | Use SQL functions such as coalesce for data cleaning |
| Window Functions | df.withColumn("rank", expr("dense_rank() over (partition by category order by col1 desc)")).show() | Apply window functions for rank-based calculations |
| Conditional Array | df.selectExpr("CASE WHEN col1 > 50 THEN array(col2, col3) ELSE array(col4) END as conditional_array").show() | Create conditional arrays using selectExpr |
| Complex Type Handling | df.selectExpr("named_struct('field1', col1, 'field2', col2) as struct_col").show() | Create complex types like structs within selectExpr |
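Because selectExpr takes SQL fragments as strings, a self-contained sketch (column names assumed) needs no function imports beyond the session:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(40, 3), (80, 5)], ["col1", "col2"])

df.selectExpr(
    "col1 + 10 AS col1_plus_10",
    "CASE WHEN col1 > 50 THEN 'High' ELSE 'Low' END AS level",
    "round(col1 * col2 / 100, 2) AS calculated_col",
    "cast(col1 AS string) AS col1_str",
).show()
```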