from
pyspark.sql
import
SparkSession
# Obtain (or reuse) a SparkSession for this example script.
session_builder = SparkSession.builder.appName("SparkByExamples.com")
spark = session_builder.getOrCreate()
# Employee rows, positionally:
# (emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary).
# Note emp_dept_id is kept as a string here.
emp = [
    (1, "Sagar", -1, "2018", "10", "M", 3000),
    (2, "G", 1, "2010", "20", "M", 4000),
    (3, "F", 1, "2010", "10", "M", 1000),
    (4, "G", 2, "2005", "10", "M", 2000),
    (5, "Great", 2, "2010", "40", "", -1),
    (6, "Hash", 2, "2010", "50", "", -1),
]
# Schema (column names) for the employee DataFrame; order matches the
# positions of the fields in each emp tuple above.
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]
# Materialize the employee rows as a DataFrame and display it untruncated.
print("Employee Table")
empDF = spark.createDataFrame(emp, schema=empColumns)
empDF.printSchema()
empDF.show(truncate=False)
# Department lookup rows, positionally: (dept_name, dept_id).
dept = [
    ("F", 10),
    ("M", 20),
    ("S", 30),
    ("I", 40),
]
print("Department Table")
# Schema (column names) for the department DataFrame; order matches the
# positions of the fields in each dept tuple.
deptColumns = ["dept_name", "dept_id"]
# Materialize the department rows as a DataFrame and display it untruncated.
deptDF = spark.createDataFrame(dept, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)
print("- Inconsistencies on every run-")
# Demonstration: DataFrame.randomSplit() is called WITHOUT a seed, so the
# split is re-sampled each time and the joined row count can differ between
# runs — that nondeterminism is exactly what this section illustrates.
#
# The original code repeated the split/join/count five times verbatim and
# joined split1 while referencing empDF.emp_dept_id in the condition;
# using split1's own column avoids the ambiguous column-lineage reference
# without changing the result. The five runs are collapsed into one loop.
for label in ("Ist", "IInd", "IIIrd", "IVth", "Vth"):
    # Fresh unseeded 50/50 split of the employee DataFrame each iteration.
    split1, _ = empDF.randomSplit([0.5, 0.5])
    joined = split1.join(deptDF, split1.emp_dept_id == deptDF.dept_id, "inner")
    print(f"{label} Run:", joined.count())