pyspark 日期格式
1. 获取当前日期
from pyspark.sql.functions import current_date

# Assumes an active SparkSession is bound to `spark`.
# Append a column holding today's date to a 3-row DataFrame (ids 0..2).
spark.range(3).withColumn('date', current_date()).show()
# Example output (the date depends on when the snippet is run):
# +---+----------+
# | id|      date|
# +---+----------+
# |  0|2018-03-23|
# |  1|2018-03-23|
# |  2|2018-03-23|
# +---+----------+
2. 获取当前日期和时间
from pyspark.sql.functions import current_timestamp

# Assumes an active SparkSession is bound to `spark`.
# Append a column holding the current date *and* time to a 3-row DataFrame.
spark.range(3).withColumn('date', current_timestamp()).show()
# Example output (show() truncates long cells with "..."):
# +---+--------------------+
# | id|                date|
# +---+--------------------+
# |  0|2018-03-23 17:40:...|
# |  1|2018-03-23 17:40:...|
# |  2|2018-03-23 17:40:...|
# +---+--------------------+
3. 日期格式转换
from pyspark.sql.functions import date_format

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08',)], ['a'])
# Re-render the date string in column `a` as month/day/4-digit-year.
# The conventional 'yyyy' pattern is used for the year field.
df.select(date_format('a', 'MM/dd/yyyy').alias('date')).show()
1
2
3
4
5
4. 字符串转日期
from pyspark.sql.functions import to_date, to_timestamp

# Assumes an active SparkSession is bound to `spark`.

# 1. String -> date.
df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
df.select(to_date(df.t).alias('date')).show()
# [Row(date=datetime.date(1997, 2, 28))]

# 2. String -> timestamp (date with time).
df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
df.select(to_timestamp(df.t).alias('dt')).show()
# [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]

# An explicit datetime pattern can also be supplied.
df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).show()
# [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
5. 获取日期中的年月日
from pyspark.sql.functions import year, month, dayofmonth

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08',)], ['a'])
# Split the date in column `a` into separate year / month / day columns.
df.select(
    year('a').alias('year'),
    month('a').alias('month'),
    dayofmonth('a').alias('day'),
).show()
6. 获取时分秒
from pyspark.sql.functions import hour, minute, second

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['a'])
# Split the time in column `a` into separate hour / minute / second columns.
df.select(
    hour('a').alias('hour'),
    minute('a').alias('minute'),
    second('a').alias('second'),
).show()
7. 获取日期对应的季度
from pyspark.sql.functions import quarter

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08',)], ['a'])
# Extract the quarter of the year for the date in column `a`.
df.select(quarter('a').alias('quarter')).show()
8. 日期加减
from pyspark.sql.functions import date_add, date_sub

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08',)], ['d'])
# Shift the date in column `d` forward and backward by one day.
df.select(
    date_add(df.d, 1).alias('d-add'),
    date_sub(df.d, 1).alias('d-sub'),
).show()
9. 月份加减
from pyspark.sql.functions import add_months

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('2015-04-08',)], ['d'])
# Shift the date in column `d` forward by one month.
df.select(add_months(df.d, 1).alias('d')).show()
10. 日期差、月份差
from pyspark.sql.functions import datediff, months_between

# Assumes an active SparkSession is bound to `spark`.

# 1. Difference in days between two dates (d2 - d1).
df = spark.createDataFrame([('2015-04-08', '2015-05-10')], ['d1', 'd2'])
df.select(datediff(df.d2, df.d1).alias('diff')).show()

# 2. Difference in months between a timestamp string and a date string.
df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['t', 'd'])
df.select(months_between(df.t, df.d).alias('months')).show()
11. 计算下一个指定星期几的日期
# Compute the date of the next Monday, Tuesday, ..., Sunday relative to a
# given date -- a handy utility function.
from pyspark.sql.functions import next_day

# Assumes an active SparkSession is bound to `spark`.
# Accepted day-of-week abbreviations:
# "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
df = spark.createDataFrame([('2015-07-27',)], ['d'])
df.select(next_day(df.d, 'Sun').alias('date')).show()
12. 本月的最后一个日期
from pyspark.sql.functions import last_day

# Assumes an active SparkSession is bound to `spark`.
df = spark.createDataFrame([('1997-02-10',)], ['d'])
# Return the last day of the month that the date in column `d` falls in.
df.select(last_day(df.d).alias('date')).show()