Full outer join in PySpark dataframe

Last Updated: 19 Dec, 2021

In this article, we are going to see how to perform a full outer join on PySpark DataFrames in Python. A full outer join returns every row from both DataFrames; where a row has no match on the other side, the missing columns are filled with null.

Create the first DataFrame:

Python3
# importing module
import pyspark
  
# importing sparksession from pyspark.sql module
from pyspark.sql import SparkSession
  
# creating sparksession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of employee data
data = [["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 1"],
        ["3", "rohith", "company 2"],
        ["4", "sridevi", "company 1"],
        ["5", "bobby", "company 1"]]
  
# specify column names
columns = ['ID', 'NAME', 'Company']
  
# creating a dataframe from the lists of data
dataframe = spark.createDataFrame(data, columns)
  
dataframe.show()


Output:
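(expected output of dataframe.show() for the data above)

+---+-------+---------+
| ID|   NAME|  Company|
+---+-------+---------+
|  1| sravan|company 1|
|  2| ojaswi|company 1|
|  3| rohith|company 2|
|  4|sridevi|company 1|
|  5|  bobby|company 1|
+---+-------+---------+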

Create the second DataFrame:

Python3
# importing module
import pyspark
  
# importing sparksession from pyspark.sql module
from pyspark.sql import SparkSession
  
# creating sparksession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of employee data
data1 = [["1", "45000", "IT"], 
         ["2", "145000", "Manager"], 
         ["6", "45000", "HR"],
         ["5", "34000", "Sales"]]
  
# specify column names
columns = ['ID', 'salary', 'department']
  
# creating a dataframe from the lists of data
dataframe1 = spark.createDataFrame(data1, columns)
  
dataframe1.show()


Output:
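(expected output of dataframe1.show() for the data above)

+---+------+----------+
| ID|salary|department|
+---+------+----------+
|  1| 45000|        IT|
|  2|145000|   Manager|
|  6| 45000|        HR|
|  5| 34000|     Sales|
+---+------+----------+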

Method 1: Using full keyword

This joins the two PySpark DataFrames, keeping all rows and columns from both, using the "full" keyword.

Syntax: dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "full").show()

where,

  • dataframe1 is the first PySpark dataframe
  • dataframe2 is the second PySpark dataframe
  • column_name is the column on which the join is performed

Example: Python program to join two dataframes based on the ID column.

Python3
# importing module
import pyspark
  
# importing sparksession from pyspark.sql module
from pyspark.sql import SparkSession
  
# creating sparksession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of employee data
data = [["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 1"],
        ["3", "rohith", "company 2"],
        ["4", "sridevi", "company 1"], 
        ["5", "bobby", "company 1"]]
  
# specify column names
columns = ['ID', 'NAME', 'Company']
  
# creating a dataframe from the lists of data
dataframe = spark.createDataFrame(data, columns)
  
# list  of employee data
data1 = [["1", "45000", "IT"], 
         ["2", "145000", "Manager"],
         ["6", "45000", "HR"],
         ["5", "34000", "Sales"]]
  
# specify column names
columns = ['ID', 'salary', 'department']
  
# creating a dataframe from the lists of data
dataframe1 = spark.createDataFrame(data1, columns)
  
  
# join two dataframes based on
# ID column using full keyword
dataframe.join(dataframe1,
               dataframe.ID == dataframe1.ID,
               "full").show()


Output:
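(expected output; every ID from both DataFrames appears, with null filling the side that has no match; row order may vary)

+----+-------+---------+----+------+----------+
|  ID|   NAME|  Company|  ID|salary|department|
+----+-------+---------+----+------+----------+
|   1| sravan|company 1|   1| 45000|        IT|
|   2| ojaswi|company 1|   2|145000|   Manager|
|   3| rohith|company 2|null|  null|      null|
|   4|sridevi|company 1|null|  null|      null|
|   5|  bobby|company 1|   5| 34000|     Sales|
|null|   null|     null|   6| 45000|        HR|
+----+-------+---------+----+------+----------+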

Method 2: Using fullouter keyword

This joins the two PySpark DataFrames with all rows and columns using the "fullouter" keyword, which is an alias of "full".

Syntax: dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "fullouter").show()

where,

  • dataframe1 is the first PySpark dataframe
  • dataframe2 is the second PySpark dataframe
  • column_name is the column on which the join is performed

Example: Python program to join two dataframes based on the ID column.

Python3
# importing module
import pyspark
  
# importing sparksession from pyspark.sql module
from pyspark.sql import SparkSession
  
# creating sparksession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of employee data
data = [["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 1"], 
        ["3", "rohith", "company 2"],
        ["4", "sridevi", "company 1"],
        ["5", "bobby", "company 1"]]
  
# specify column names
columns = ['ID', 'NAME', 'Company']
  
# creating a dataframe from the lists of data
dataframe = spark.createDataFrame(data, columns)
  
# list  of employee data
data1 = [["1", "45000", "IT"], 
         ["2", "145000", "Manager"],
         ["6", "45000", "HR"],
         ["5", "34000", "Sales"]]
  
# specify column names
columns = ['ID', 'salary', 'department']
  
# creating a dataframe from the lists of data
dataframe1 = spark.createDataFrame(data1, columns)
  
  
# join two dataframes based on ID
# column using fullouter keyword
dataframe.join(dataframe1,
               dataframe.ID == dataframe1.ID,
               "fullouter").show()


Output:
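The result is the same as Method 1: since "fullouter" is simply an alias of "full", all six IDs appear, with null filling the unmatched side.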

Method 3: Using outer keyword

This joins the two PySpark DataFrames with all rows and columns using the "outer" keyword, which is also an alias of "full".

Syntax: dataframe1.join(dataframe2, dataframe1.column_name == dataframe2.column_name, "outer").show()

where,

  • dataframe1 is the first PySpark dataframe
  • dataframe2 is the second PySpark dataframe
  • column_name is the column on which the join is performed

Example: Python program to join two dataframes based on the ID column.

Python3
# importing module
import pyspark
  
# importing sparksession from pyspark.sql module
from pyspark.sql import SparkSession
  
# creating sparksession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()
  
# list  of employee data
data = [["1", "sravan", "company 1"],
        ["2", "ojaswi", "company 1"],
        ["3", "rohith", "company 2"],
        ["4", "sridevi", "company 1"], 
        ["5", "bobby", "company 1"]]
  
# specify column names
columns = ['ID', 'NAME', 'Company']
  
# creating a dataframe from the lists of data
dataframe = spark.createDataFrame(data, columns)
  
# list  of employee data
data1 = [["1", "45000", "IT"],
         ["2", "145000", "Manager"],
         ["6", "45000", "HR"],
         ["5", "34000", "Sales"]]
  
# specify column names
columns = ['ID', 'salary', 'department']
  
# creating a dataframe from the lists of data
dataframe1 = spark.createDataFrame(data1, columns)
  
  
# join two dataframes based on
# ID column using outer keyword
dataframe.join(dataframe1,
               dataframe.ID == dataframe1.ID,
               "outer").show()


Output:
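Again, the result matches Methods 1 and 2, since "outer" is another alias of "full".

Note that all three examples above return two ID columns, one from each DataFrame, because the join condition compares two separate columns. If both DataFrames name the key identically, passing the column name itself merges the keys into a single ID column. A minimal sketch using the same DataFrames and the on/how keyword form of join:

Python3

# passing the column name instead of an equality expression
# merges both ID columns into a single ID column in the result
dataframe.join(dataframe1, on="ID", how="full").show()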


