Merge two DataFrames with different numbers of columns in PySpark
In this article, we will discuss how to perform a union of two DataFrames with different numbers of columns in PySpark in Python.
Let’s consider the first dataframe
Here we are having 3 columns named id, name, and address.
Python3
# Build the first example dataframe: three string columns (ID, NAME, Address).
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession that drives this example.
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# Row data: one inner list per row, all values as strings.
data = [["1", "sravan", "kakumanu"],
        ["2", "ojaswi", "hyd"],
        ["3", "rohith", "delhi"],
        ["4", "sridevi", "kakumanu"],
        ["5", "bobby", "guntur"]]

# Column names, positionally matched to each row above.
columns = ['ID', 'NAME', 'Address']

dataframe1 = spark.createDataFrame(data, columns)
dataframe1.show()
|
Output:
Let’s consider second dataframe
Here we are going to create dataframe with 2 columns
Python3
# Build the second example dataframe: two columns (ID, Age) — note it has
# fewer columns than the first, which is what makes a plain union fail.
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession that drives this example.
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# Row data: string ID paired with an integer age.
data = [["1", 23],
        ["2", 21],
        ["3", 32]]

columns = ['ID', 'Age']

dataframe2 = spark.createDataFrame(data, columns)
dataframe2.show()
|
Output:
We cannot perform union operations because the columns differ, so we first have to add the missing columns. Here the first dataframe (dataframe1) has the columns ['ID', 'NAME', 'Address'] and the second dataframe (dataframe2) has the columns ['ID', 'Age'].
Now we have to add the Age column to the first dataframe, and the NAME and Address columns to the second dataframe. We can do this using the lit() function. This function is available in pyspark.sql.functions and is used to add a column with a constant value. Here we add the missing columns with the value None.
Syntax:
for column in [column for column in dataframe1.columns if column not in dataframe2.columns]:
dataframe2 = dataframe2.withColumn(column, lit(None))
where,
- dataframe1 is the first dataframe
- dataframe2 is the second dataframe
Example: Add missing columns to both the dataframes
Python3
# Align the column sets of two dataframes by adding null-valued columns,
# so that a union becomes possible.
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession


def add_missing_columns(target, reference):
    """Return *target* with a null-valued column appended for every column
    present in *reference* but absent from *target*.

    Column order follows *reference*.columns, so the result is deterministic.
    """
    for col_name in [c for c in reference.columns if c not in target.columns]:
        target = target.withColumn(col_name, lit(None))
    return target


spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# First dataframe: ID, NAME, Address (all strings).
data = [["1", "sravan", "kakumanu"],
        ["2", "ojaswi", "hyd"],
        ["3", "rohith", "delhi"],
        ["4", "sridevi", "kakumanu"],
        ["5", "bobby", "guntur"]]
columns = ['ID', 'NAME', 'Address']
dataframe1 = spark.createDataFrame(data, columns)

# Second dataframe: ID, Age.
data = [["1", 23],
        ["2", 21],
        ["3", 32]]
columns = ['ID', 'Age']
dataframe2 = spark.createDataFrame(data, columns)

# Give dataframe1 the Age column, then give dataframe2 NAME and Address.
dataframe1 = add_missing_columns(dataframe1, dataframe2)
dataframe2 = add_missing_columns(dataframe2, dataframe1)

print(dataframe1.columns)
print(dataframe2.columns)
|
Output:
['ID', 'NAME', 'Address', 'Age']
['ID', 'Age', 'NAME', 'Address']
Example 1: Using union()
Now we can perform a union by using the union() function. This function stacks the rows of the two dataframes by column position.
Syntax: dataframe1.union(dataframe2)
Example:
Python3
# Union two dataframes with different columns: first pad each dataframe
# with null columns for whatever it is missing, then call union().
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# First dataframe: ID, NAME, Address (all strings).
data = [["1", "sravan", "kakumanu"],
        ["2", "ojaswi", "hyd"],
        ["3", "rohith", "delhi"],
        ["4", "sridevi", "kakumanu"],
        ["5", "bobby", "guntur"]]
columns = ['ID', 'NAME', 'Address']
dataframe1 = spark.createDataFrame(data, columns)

# Second dataframe: ID, Age.
data = [["1", 23],
        ["2", 21],
        ["3", 32]]
columns = ['ID', 'Age']
dataframe2 = spark.createDataFrame(data, columns)

# Add dataframe2's missing columns to dataframe1 as nulls...
for column in [c for c in dataframe2.columns if c not in dataframe1.columns]:
    dataframe1 = dataframe1.withColumn(column, lit(None))

# ...and dataframe1's (now extended) missing columns to dataframe2.
for column in [c for c in dataframe1.columns if c not in dataframe2.columns]:
    dataframe2 = dataframe2.withColumn(column, lit(None))

# union() matches columns by POSITION, not by name.
dataframe1.union(dataframe2).show()
|
Output:
Example 2: Using unionAll()
Syntax: dataframe1.unionAll(dataframe2)
Python3
# Same column-padding approach, combined with unionAll().
# NOTE: since Spark 2.0, unionAll() is simply an alias of union().
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# First dataframe: ID, NAME, Address (all strings).
data = [["1", "sravan", "kakumanu"],
        ["2", "ojaswi", "hyd"],
        ["3", "rohith", "delhi"],
        ["4", "sridevi", "kakumanu"],
        ["5", "bobby", "guntur"]]
columns = ['ID', 'NAME', 'Address']
dataframe1 = spark.createDataFrame(data, columns)

# Second dataframe: ID, Age.
data = [["1", 23],
        ["2", 21],
        ["3", 32]]
columns = ['ID', 'Age']
dataframe2 = spark.createDataFrame(data, columns)

# Pad each dataframe with null columns for whatever it is missing.
for column in [c for c in dataframe2.columns if c not in dataframe1.columns]:
    dataframe1 = dataframe1.withColumn(column, lit(None))
for column in [c for c in dataframe1.columns if c not in dataframe2.columns]:
    dataframe2 = dataframe2.withColumn(column, lit(None))

# unionAll() behaves identically to union(): positional, duplicates kept.
dataframe1.unionAll(dataframe2).show()
|
Output:
Example 3: Using unionByName
We can also use unionByName(), which combines the dataframes by matching column names instead of column positions.
Syntax: dataframe1.unionByName(dataframe2)
Example:
Python3
# Same column-padding approach, combined with unionByName(), which matches
# columns by NAME rather than by position — so the two dataframes may list
# their columns in different orders.
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# First dataframe: ID, NAME, Address (all strings).
data = [["1", "sravan", "kakumanu"],
        ["2", "ojaswi", "hyd"],
        ["3", "rohith", "delhi"],
        ["4", "sridevi", "kakumanu"],
        ["5", "bobby", "guntur"]]
columns = ['ID', 'NAME', 'Address']
dataframe1 = spark.createDataFrame(data, columns)

# Second dataframe: ID, Age.
data = [["1", 23],
        ["2", 21],
        ["3", 32]]
columns = ['ID', 'Age']
dataframe2 = spark.createDataFrame(data, columns)

# Pad each dataframe with null columns for whatever it is missing.
for column in [c for c in dataframe2.columns if c not in dataframe1.columns]:
    dataframe1 = dataframe1.withColumn(column, lit(None))
for column in [c for c in dataframe1.columns if c not in dataframe2.columns]:
    dataframe2 = dataframe2.withColumn(column, lit(None))

dataframe1.unionByName(dataframe2).show()
|
Output:
Last Updated :
21 Dec, 2021
Like Article
Save Article
Share your thoughts in the comments
Please Login to comment...