from pyspark.sql import SparkSession
def create_session():
    """Create (or fetch) the SparkSession for this job.

    Returns:
        SparkSession: the active session, built with the app name
        "Corona_cases_statewise.com" on first call, reused thereafter.
    """
    session = (
        SparkSession.builder
        .appName("Corona_cases_statewise.com")
        .getOrCreate()
    )
    return session
def create_RDD(sc_obj, data):
    """Distribute *data* as an RDD using the supplied SparkContext.

    Args:
        sc_obj: SparkContext used to parallelize the records.
        data: iterable of records to distribute.

    Returns:
        RDD over *data*.
    """
    # Bug fix: the original body called ``sc.parallelize(data)``, silently
    # depending on a global ``sc`` and ignoring the ``sc_obj`` parameter.
    rdd = sc_obj.parallelize(data)
    return rdd
def RDD_to_df(spark, df, schema):
    """Convert an RDD of row tuples into a DataFrame.

    Args:
        spark: active SparkSession.
        df: RDD (or other createDataFrame-compatible input) of rows.
        schema: column names (list of str) or schema object.

    Returns:
        DataFrame built from *df* with the given *schema*.
    """
    return spark.createDataFrame(df, schema)
# Driver: build statewise COVID-19 records, lift them into a DataFrame,
# and display its schema and contents.
#
# Bug fix: the original guard compared with a broken operator written as
# ``= =`` (two separate tokens), which is a syntax error; it must be ``==``.
if __name__ == "__main__":
    # (state, cases, recovered, deaths)
    input_data = [
        ("Uttar Pradesh", 122000, 89600, 12238),
        ("Maharashtra", 454000, 380000, 67985),
        ("Tamil Nadu", 115000, 102000, 13933),
        ("Karnataka", 147000, 111000, 15306),
        ("Kerala", 153000, 124000, 5259),
    ]

    # Start the session and grab its SparkContext for RDD creation.
    spark = create_session()
    sc = spark.sparkContext

    # Distribute the raw records, then convert them to a DataFrame
    # with named columns.
    rd_df = create_RDD(sc, input_data)
    schema_lst = ["State", "Cases", "Recovered", "Deaths"]
    converted_df = RDD_to_df(spark, rd_df, schema_lst)

    # Print the inferred schema, then the table itself.
    converted_df.printSchema()
    converted_df.show()