forked from spark-examples/pyspark-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpyspark-split-function.py
50 lines (42 loc) · 1.8 KB
/
pyspark-split-function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
spark=SparkSession.builder.appName("sparkbyexamples").getOrCreate()
data=data = [('James','','Smith','1991-04-01'),
('Michael','Rose','','2000-05-19'),
('Robert','','Williams','1978-09-05'),
('Maria','Anne','Jones','1967-12-01'),
('Jen','Mary','Brown','1980-02-17')
]
columns=["firstname","middlename","lastname","dob"]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show(truncate=False)
df1 = df.withColumn('year', split(df['dob'], '-').getItem(0)) \
.withColumn('month', split(df['dob'], '-').getItem(1)) \
.withColumn('day', split(df['dob'], '-').getItem(2))
df1.printSchema()
df1.show(truncate=False)
# Alternatively we can do like below
split_col = pyspark.sql.functions.split(df['dob'], '-')
df2 = df.withColumn('year', split_col.getItem(0)) \
.withColumn('month', split_col.getItem(1)) \
.withColumn('day', split_col.getItem(2))
df2.show(truncate=False)
# Using split() function of Column class
split_col = pyspark.sql.functions.split(df['dob'], '-')
df3 = df.select("firstname","middlename","lastname","dob", split_col.getItem(0).alias('year'),split_col.getItem(1).alias('month'),split_col.getItem(2).alias('day'))
df3.show(truncate=False)
"""
df4=spark.createDataFrame([("20-13-2012-monday",)], ['date',])
df4.select(split(df4.date,'^([\d]+-[\d]+-[\d])').alias('date'),
regexp_replace(split(df4.date,'^([\d]+-[\d]+-[\d]+)').getItem(1),'-','').alias('day')).show()
"""
df4 = spark.createDataFrame([('oneAtwoBthree',)], ['str',])
df4.select(split(df4.str, '[AB]').alias('str')).show()
df4.select(split(df4.str, '[AB]',2).alias('str')).show()
df4.select(split(df4.str, '[AB]',1).alias('str')).show()