Here’s a comprehensive list of some common PySpark date functions along with detailed explanations and examples:
Contents
- 1 1. current_date()
- 2 2. current_timestamp()
- 3 3. date_format()
- 4 4. year(), month(), dayofmonth()
- 5 5. date_add(), date_sub()
- 6 6. datediff()
- 7 7. add_months()
- 8 8. months_between()
- 9 9. next_day()
- 10 10. last_day()
- 11 11. trunc()
- 12 12. date_trunc()
- 13 13. from_unixtime()
- 14 14. unix_timestamp()
- 15 15. to_date()
- 16 16. to_timestamp()
- 17 17. weekofyear()
- 18 18. quarter()
- 19 19. dayofweek()
- 20 Example Project: Combining Multiple Date Functions
- 21 Share this:
1. current_date()
Returns the current date.
from pyspark.sql.functions import current_date
df = spark.createDataFrame([(1,)], ["dummy"])
df.select(current_date().alias("current_date")).show()
2. current_timestamp()
Returns the current timestamp.
from pyspark.sql.functions import current_timestamp
df.select(current_timestamp().alias("current_timestamp")).show()
3. date_format()
Formats a date using the specified format.
from pyspark.sql.functions import date_format
df = spark.createDataFrame([("2010-02-05",)], ["date"])
df.select(date_format("date", "MM/dd/yyyy").alias("formatted_date")).show()
4. year()
, month()
, dayofmonth()
Extracts the year, month, and day from a date.
from pyspark.sql.functions import year, month, dayofmonth
df.select(year("date").alias("year"),
month("date").alias("month"),
dayofmonth("date").alias("day")).show()
5. date_add()
, date_sub()
Adds or subtracts a specified number of days to/from a date.
from pyspark.sql.functions import date_add, date_sub
df.select(date_add("date", 10).alias("date_add"),date_sub("date", 10).alias("date_sub")).show()
6. datediff()
Returns the difference in days between two dates.
from pyspark.sql.functions import datediff
df2 = spark.createDataFrame([("2010-02-15",)], ["date2"])
df.join(df2).select(datediff("date2", "date").alias("datediff")).show()
7. add_months()
Adds a specified number of months to a date.
from pyspark.sql.functions import add_months
df.select(add_months("date", 1).alias("add_months")).show()
8. months_between()
Returns the number of months between two dates.
from pyspark.sql.functions import months_between
df.join(df2).select(months_between("date2", "date").alias("months_between")).show()
9. next_day()
Returns the first date which is the specified day of the week after the given date.
from pyspark.sql.functions import next_day
df.select(next_day("date", "Sunday").alias("next_day")).show()
10. last_day()
Returns the last day of the month which the given date belongs to.
from pyspark.sql.functions import last_day
df.select(last_day("date").alias("last_day")).show()
11. trunc()
Truncates a date to the specified unit (‘year’, ‘month’).
from pyspark.sql.functions import trunc
df.select(trunc("date", "MM").alias("trunc_month"),
trunc("date", "YY").alias("trunc_year")).show()
12. date_trunc()
Truncates a timestamp to the specified unit (‘year’, ‘month’, ‘day’, ‘hour’, ‘minute’).
from pyspark.sql.functions import date_trunc
df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp"])
df.select(date_trunc("hour", "timestamp").alias("date_trunc_hour")).show()
13. from_unixtime()
Converts the number of seconds from the Unix epoch to a string representing the timestamp.
from pyspark.sql.functions import from_unixtime
df = spark.createDataFrame([(1234567890,)], ["unix_time"])
df.select(from_unixtime("unix_time").alias("timestamp")).show()
14. unix_timestamp()
Converts a string timestamp to the number of seconds from the Unix epoch.
from pyspark.sql.functions import unix_timestamp
df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp"])
df.select(unix_timestamp("timestamp").alias("unix_timestamp")).show()
15. to_date()
Converts a string to a date.
from pyspark.sql.functions import to_date
df = spark.createDataFrame([("2010-02-05",)], ["date_str"])
df.select(to_date("date_str").alias("date")).show()
16. to_timestamp()
Converts a string to a timestamp.
from pyspark.sql.functions import to_timestamp
df = spark.createDataFrame([("2010-02-05 12:34:56",)], ["timestamp_str"])
df.select(to_timestamp("timestamp_str").alias("timestamp")).show()
17. weekofyear()
Returns the week of the year for a date.
from pyspark.sql.functions import weekofyear
df = spark.createDataFrame([("2010-02-05",)], ["date"])
df.select(weekofyear("date").alias("weekofyear")).show()
18. quarter()
Returns the quarter of the year for a date.
from pyspark.sql.functions import quarter
df.select(quarter("date").alias("quarter")).show()
19. dayofweek()
Returns the day of the week for a date.
from pyspark.sql.functions import dayofweek
df.select(dayofweek("date").alias("dayofweek")).show()
Example Project: Combining Multiple Date Functions
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, year, month, dayofmonth, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, trunc, date_trunc, from_unixtime, unix_timestamp, to_date, to_timestamp, weekofyear, quarter, dayofweek
# Initialize Spark session
spark = SparkSession.builder.appName("date_functions_example").getOrCreate()
# Create a DataFrame with a sample date
data = [("2010-02-05", "2010-02-15 12:34:56", 1234567890)]
columns = ["date", "timestamp", "unix_time"]
df = spark.createDataFrame(data, columns)
# Applying various date functions
result = df.select(
col("date"),
col("timestamp"),
col("unix_time"),
date_format("date", "MM/dd/yyyy").alias("formatted_date"),
year("date").alias("year"),
month("date").alias("month"),
dayofmonth("date").alias("day"),
date_add("date", 10).alias("date_add"),
date_sub("date", 10).alias("date_sub"),
datediff("timestamp", "date").alias("datediff"),
add_months("date", 1).alias("add_months"),
months_between("timestamp", "date").alias("months_between"),
next_day("date", "Sunday").alias("next_day"),
last_day("date").alias("last_day"),
trunc("date", "MM").alias("trunc_month"),
trunc("date", "YY").alias("trunc_year"),
date_trunc("hour", "timestamp").alias("date_trunc_hour"),
from_unixtime("unix_time").alias("from_unixtime"),
unix_timestamp("timestamp").alias("unix_timestamp"),
to_date("timestamp").alias("to_date"),
to_timestamp("timestamp").alias("to_timestamp"),
weekofyear("date").alias("weekofyear"),
quarter("date").alias("quarter"),
dayofweek("date").alias("dayofweek")
)
# Show the results
result.show(truncate=False)
Function | Description | Example |
---|---|---|
current_date() | Returns the current date as a DateType object | current_date() |
current_timestamp() | Returns the current timestamp (with microseconds) as a TimestampType object | current_timestamp() |
date_add(date, days) | Adds a specified number of days to a date | date_add('2023-07-05', 10) |
datediff(end_date, start_date) | Calculates the number of days between two dates | datediff('2024-01-01', '2023-12-31') |
months_between(date1, date2) | Calculates the number of months between two dates | months_between('2024-02-01', '2023-07-05') |
to_date(string) | Converts a string to a DateType object (format-dependent) | to_date('2023-07-04', 'yyyy-MM-dd') |
to_timestamp(string) | Converts a string to a TimestampType object (format-dependent) | to_timestamp('2023-07-04 10:20:30', 'yyyy-MM-dd HH:mm:ss') |
trunc(date, format) | Truncates a date to a specified unit (year, month, day) | trunc('2023-07-05', 'year') |
year(date) | Extracts the year from a date | year('2023-07-05') |
quarter(date) | Extracts the quarter (1-4) from a date | quarter('2023-07-05') |
month(date) | Extracts the month (1-12) from a date | month('2023-07-05') |
dayofmonth(date) | Extracts the day of the month (1-31) from a date | dayofmonth('2023-07-05') |
dayofweek(date) | Extracts the day of the week (1-7, Monday=1) from a date | dayofweek('2023-07-05') |
dayofyear(date) | Extracts the day of the year (1-365) from a date | dayofyear('2023-07-05') |
last_day(date) | Returns the last day of the month for the given date | last_day('2023-07-05') |
unix_timestamp() | Returns the number of seconds since epoch (1970-01-01 UTC) for a timestamp | unix_timestamp() |
from_unixtime(timestamp) | Converts a unix timestamp to a TimestampType object | from_unixtime(1656816000) |
date_format(date, format) | Formats a date or timestamp according to a specified format | date_format('2023-07-05', 'dd-MM-yyyy') |