Date and Time Functions- Pyspark Dataframes

Here’s a comprehensive list of some common PySpark date functions along with detailed explanations and examples:

1. current_date()

Returns the current date.

from pyspark.sql.functions import current_date

df = spark.createDataFrame([(1,)], ["dummy"])
df.select(current_date().alias("current_date")).show()

2. current_timestamp()

Returns the current timestamp.

from pyspark.sql.functions import current_timestamp

df.select(current_timestamp().alias("current_timestamp")).show()

3. date_format()

Formats a date using the specified format.

from pyspark.sql.functions import date_format

df = spark.createDataFrame([("2010-02-05",)], ["date"])
df.select(date_format("date", "MM/dd/yyyy").alias("formatted_date")).show()

4. year(), month(), dayofmonth()

Extracts the year, month, and day from a date.

from pyspark.sql.functions import year, month, dayofmonth

df.select(year("date").alias("year"),
month("date").alias("month"),
dayofmonth("date").alias("day")).show()

5. date_add(), date_sub()

Adds or subtracts a specified number of days to/from a date.

from pyspark.sql.functions import date_add, date_sub

df.select(date_add("date", 10).alias("date_add"),
date_sub("date", 10).alias("date_sub")).show()

6. datediff()

Returns the difference in days between two dates.

from pyspark.sql.functions import datediff

df2 = spark.createDataFrame([("2010-02-15",)], ["date2"])
df.join(df2).select(datediff("date2", "date").alias("datediff")).show()

7. add_months()

Adds a specified number of months to a date.

from pyspark.sql.functions import add_months

df.select(add_months("date", 1).alias("add_months")).show()

8. months_between()

Returns the number of months between two dates.

from pyspark.sql.functions import months_between

df.join(df2).select(months_between("date2", "date").alias("months_between")).show()

9. next_day()

Returns the first date which is the specified day of the week after the given date.

from pyspark.sql.functions import next_day

df.select(next_day("date", "Sunday").alias("next_day")).show()

10. last_day()

Returns the last day of the month which the given date belongs to.

from pyspark.sql.functions import last_day

df.select(last_day("date").alias("last_day")).show()

11. trunc()

Truncates a date to the specified unit (‘year’, ‘month’).

from pyspark.sql.functions import trunc

df.select(trunc("date", "MM").alias("trunc_month"),
trunc("date", "YY").alias("trunc_year")).show()

12. date_trunc()

Truncates a timestamp to the specified unit (‘year’, ‘month’, ‘day’, ‘hour’, ‘minute’).

from pyspark.sql.functions import date_trunc

df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp"])
df.select(date_trunc("hour", "timestamp").alias("date_trunc_hour")).show()

13. from_unixtime()

Converts the number of seconds from the Unix epoch to a string representing the timestamp.

from pyspark.sql.functions import from_unixtime

df = spark.createDataFrame([(1234567890,)], ["unix_time"])
df.select(from_unixtime("unix_time").alias("timestamp")).show()

14. unix_timestamp()

Converts a string timestamp to the number of seconds from the Unix epoch.

from pyspark.sql.functions import unix_timestamp

df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp"])
df.select(unix_timestamp("timestamp").alias("unix_timestamp")).show()

15. to_date()

Converts a string to a date.

from pyspark.sql.functions import to_date

df = spark.createDataFrame([("2010-02-05",)], ["date_str"])
df.select(to_date("date_str").alias("date")).show()

16. to_timestamp()

Converts a string to a timestamp.

from pyspark.sql.functions import to_timestamp

df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp_str"])
df.select(to_timestamp("timestamp_str").alias("timestamp")).show()

17. weekofyear()

Returns the week of the year for a date.

from pyspark.sql.functions import weekofyear

df = spark.createDataFrame([("2010-02-05",)], ["date"])
df.select(weekofyear("date").alias("weekofyear")).show()

18. quarter()

Returns the quarter of the year for a date.

from pyspark.sql.functions import quarter

df.select(quarter("date").alias("quarter")).show()

19. dayofweek()

Returns the day of the week for a date.

from pyspark.sql.functions import dayofweek

df.select(dayofweek("date").alias("dayofweek")).show()

Example Project: Combining Multiple Date Functions

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, year, month, dayofmonth, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, trunc, date_trunc, from_unixtime, unix_timestamp, to_date, to_timestamp, weekofyear, quarter, dayofweek

# Initialize Spark session
spark = SparkSession.builder.appName("date_functions_example").getOrCreate()

# Create a DataFrame with a sample date
data = [("2010-02-05", "2010-02-15 12:34:56", 1234567890)]
columns = ["date", "timestamp", "unix_time"]
df = spark.createDataFrame(data, columns)

# Applying various date functions
result = df.select(
col("date"),
col("timestamp"),
col("unix_time"),
date_format("date", "MM/dd/yyyy").alias("formatted_date"),
year("date").alias("year"),
month("date").alias("month"),
dayofmonth("date").alias("day"),
date_add("date", 10).alias("date_add"),
date_sub("date", 10).alias("date_sub"),
datediff("timestamp", "date").alias("datediff"),
add_months("date", 1).alias("add_months"),
months_between("timestamp", "date").alias("months_between"),
next_day("date", "Sunday").alias("next_day"),
last_day("date").alias("last_day"),
trunc("date", "MM").alias("trunc_month"),
trunc("date", "YY").alias("trunc_year"),
date_trunc("hour", "timestamp").alias("date_trunc_hour"),
from_unixtime("unix_time").alias("from_unixtime"),
unix_timestamp("timestamp").alias("unix_timestamp"),
to_date("timestamp").alias("to_date"),
to_timestamp("timestamp").alias("to_timestamp"),
weekofyear("date").alias("weekofyear"),
quarter("date").alias("quarter"),
dayofweek("date").alias("dayofweek")
)

# Show the results
result.show(truncate=False)
FunctionDescriptionExample
current_date()Returns the current date as a DateType objectcurrent_date()
current_timestamp()Returns the current timestamp (with microseconds) as a TimestampType objectcurrent_timestamp()
date_add(date, days)Adds a specified number of days to a datedate_add('2023-07-05', 10)
datediff(end_date, start_date)Calculates the number of days between two datesdatediff('2024-01-01', '2023-12-31')
months_between(date1, date2)Calculates the number of months between two datesmonths_between('2024-02-01', '2023-07-05')
to_date(string)Converts a string to a DateType object (format-dependent)to_date('2023-07-04', 'yyyy-MM-dd')
to_timestamp(string)Converts a string to a TimestampType object (format-dependent)to_timestamp('2023-07-04 10:20:30', 'yyyy-MM-dd HH:mm:ss')
trunc(date, format)Truncates a date to a specified unit (year, month, day)trunc('2023-07-05', 'year')
year(date)Extracts the year from a dateyear('2023-07-05')
quarter(date)Extracts the quarter (1-4) from a datequarter('2023-07-05')
month(date)Extracts the month (1-12) from a datemonth('2023-07-05')
dayofmonth(date)Extracts the day of the month (1-31) from a datedayofmonth('2023-07-05')
dayofweek(date)Extracts the day of the week (1-7, Monday=1) from a datedayofweek('2023-07-05')
dayofyear(date)Extracts the day of the year (1-365) from a datedayofyear('2023-07-05')
last_day(date)Returns the last day of the month for the given datelast_day('2023-07-05')
unix_timestamp()Returns the number of seconds since epoch (1970-01-01 UTC) for a timestampunix_timestamp()
from_unixtime(timestamp)Converts a unix timestamp to a TimestampType objectfrom_unixtime(1656816000)
date_format(date, format)Formats a date or timestamp according to a specified formatdate_format('2023-07-05', 'dd-MM-yyyy')

Discover more from HintsToday

Subscribe to get the latest posts sent to your email.