"""Per-section name lists ordered by sequence, using PySpark.

Builds a small demo DataFrame of (section, name, sequence) rows, registers
it as the temp view 'views', then prints each section with its names sorted
by their sequence number.
"""
import pyspark  # NOTE(review): unused here; kept in case other code relies on it.
from pyspark.sql import Row, SparkSession
# Fix: array_sort/collect_list/struct/col live in pyspark.sql.functions.
# The original wildcard `from pyspark.sql import *` does NOT export them,
# so the aggregation below raised NameError at runtime.
from pyspark.sql.functions import array_sort, col, collect_list, struct

spark = SparkSession.builder.getOrCreate()

data_df = spark.createDataFrame([
    Row(section='A', name='Ashish', sequence=1),
    Row(section='B', name='Bharti', sequence=1),
    Row(section='B', name='Charlie', sequence=2),
    Row(section='A', name='Marie', sequence=2),
    Row(section='C', name='Prabhakar', sequence=1),
    Row(section='D', name='Shrey', sequence=1),
    Row(section='C', name='Rose', sequence=2),
    Row(section='B', name='Ishita', sequence=3),
    Row(section='C', name='Samarth', sequence=3),
    Row(section='A', name='Vinayak', sequence=4),
    Row(section='A', name='Pranjal', sequence=3),
])

views_df = data_df.select('section', 'name', 'sequence')
views_df.createOrReplaceTempView('views')

# Sorting an array of (sequence, name) structs compares on the first struct
# field, so after array_sort the names are in sequence order; projecting
# `sorted_names.name` then yields just the ordered names per section.
views_df = (
    views_df
    .groupBy('section')
    .agg(
        array_sort(collect_list(struct('sequence', 'name'))).alias('sorted_names')
    )
    .select('section', col('sorted_names.name').alias('Sorted Names'))
)

# Fix: .show() returns None — the original assigned its result to views_df,
# clobbering the DataFrame. Display is now a separate statement.
views_df.show(20, False)