import findspark

# findspark.init() points this interpreter at the Spark installation under
# the given directory, so it has to run before the pyspark imports below.
findspark.init('c:/spark')

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.sql.types import ArrayType, IntegerType


def return_array(column: str):
    """Return a Column expression that splits a string column on commas.

    Args:
        column: Name of the DataFrame column to split.

    Returns:
        The result of ``split(col(column), ",")``: an expression yielding
        the pieces of the column's string value between commas. Whitespace
        around the pieces is kept as-is.
    """
    return split(col(column), ",")


# Start (or reuse) the Spark session for this demo.
spark = SparkSession.builder \
    .appName('GeeksforGeeks') \
    .getOrCreate()

try:
    # Sample rows: (name, gender, comma-separated marks).
    data = [
        ("Pulkit, Dhingra", "M", "70,85"),
        ("Ritika, Pandey", "F", "85,95"),
        ("Kaif, Ali", "M", "63,72"),
        ("Asha, Deep", "F", "62,92"),
    ]
    columns = ["Name", "Gender", "Marks"]

    df = spark.createDataFrame(data, columns)
    df.show()

    # Turn every string column into an array column by splitting on ",".
    # "Marks" is exposed under the new name "Marks_Arr"; the other two
    # columns keep their original names.
    df2 = df.select(
        return_array("Name").alias("Name"),
        return_array("Gender").alias("Gender"),
        return_array("Marks").alias("Marks_Arr"),
    )
    df2.show()
finally:
    # Stop the session even when one of the steps above raises.
    spark.stop()