from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, when
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               LongType, StringType, FloatType)


def create_session():
    # Build (or reuse) a local SparkSession for this script.
    spk = SparkSession.builder \
        .master("local") \
        .appName("Product_mart.com") \
        .getOrCreate()
    return spk


def create_df(spark, data, schema):
    # Create a DataFrame from an in-memory list of rows and an explicit schema.
    df1 = spark.createDataFrame(data, schema)
    return df1


if __name__ == "__main__":
    spark = create_session()

    # Sample product rows: (name, product ID, rating, price).
    input_data = [("Refrigerator", 112345, 4.0, 12499),
                  ("LED TV", 114567, 4.2, 49999),
                  ("Washing Machine", 113465, 3.9, 69999),
                  ("T-shirt", 124378, 4.1, 1999),
                  ("Jeans", 126754, 3.7, 3999),
                  ("Running Shoes", 134565, 4.7, 1499),
                  ("Face Mask", 145234, 4.6, 999)]

    # Explicit schema matching the tuple layout above.
    schm = StructType([
        StructField("Product Name", StringType(), True),
        StructField("Product ID", LongType(), True),
        StructField("Rating", FloatType(), True),
        StructField("Product Price", IntegerType(), True)])

    df = create_df(spark, input_data, schm)

    # Nest the four flat columns into a single struct column named "Product".
    new_df = df.withColumn("Product",
                           struct(col("Product Name").alias("Name"),
                                  col("Product ID").alias("ID"),
                                  col("Rating").alias("Rating"),
                                  col("Product Price").alias("Price")))
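
    # Once nested, the fields can be read back with dotted paths, e.g.:
    #   new_df.select(col("Product.Name"), col("Product.Price")).show()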

    # Bucket each product by price; "Product Price" is already IntegerType,
    # so no cast is needed before the comparison.
    new_df = new_df.withColumn("Product Range",
                               when(col("Product Price") < 1000, "Low")
                               .when(col("Product Price") < 7000, "Medium")
                               .otherwise("High"))

    # Drop the flat columns now that they live inside the "Product" struct.
    new_df = new_df.drop("Product Name", "Product ID",
                         "Rating", "Product Price")

    new_df.printSchema()
    new_df.show(truncate=False)

    # Release the session once the script is done.
    spark.stop()
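
# For reference, a sketch of the expected console output. Exact column
# widths and struct rendering vary by Spark version (Spark 3.x shown):
#
# root
#  |-- Product: struct (nullable = false)
#  |    |-- Name: string (nullable = true)
#  |    |-- ID: long (nullable = true)
#  |    |-- Rating: float (nullable = true)
#  |    |-- Price: integer (nullable = true)
#  |-- Product Range: string (nullable = false)
#
# +--------------------------------------+-------------+
# |Product                               |Product Range|
# +--------------------------------------+-------------+
# |{Refrigerator, 112345, 4.0, 12499}    |High         |
# |{LED TV, 114567, 4.2, 49999}          |High         |
# |{Washing Machine, 113465, 3.9, 69999} |High         |
# |{T-shirt, 124378, 4.1, 1999}          |Medium       |
# |{Jeans, 126754, 3.7, 3999}            |Medium       |
# |{Running Shoes, 134565, 4.7, 1499}    |Medium       |
# |{Face Mask, 145234, 4.6, 999}         |Low          |
# +--------------------------------------+-------------+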