Remove Duplicate rows in R using Dplyr

Last Updated : 21 Jul, 2021

In this article, we are going to remove duplicate rows from a dataframe in the R programming language using the dplyr package.

Method 1: distinct()

The distinct() function removes duplicate rows from a dataframe and returns only the unique rows.

Syntax:

distinct(dataframe)

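We can also remove duplicate rows based on one or more columns/variables in the dataframe. By default, distinct() then returns only the listed columns; pass .keep_all = TRUE to keep all the other columns as well.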

Syntax:

distinct(dataframe, column1, column2, ..., column_n)

Dataset in use:

Example 1: R program to remove duplicate rows from the dataframe

R

# Attach dplyr, which provides distinct() and the %>% pipe.
library(dplyr)

# Sample data: ten records with the columns id, name and address.
# Rows 8-10 are exact copies of rows 1, 4 and 2.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# Drop every row that repeats an earlier row in all columns,
# then display what is left.
data1 %>%
  distinct() %>%
  print()

Output:

Example 2: Remove duplicate rows based on single column

R

# Load dplyr, which provides distinct().
library(dplyr)

# Create a dataframe with three columns named id, name and address.
# Rows 8-10 repeat the names already seen in rows 1, 4 and 2.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# Remove duplicate rows based on the name column.
# Without .keep_all = TRUE, distinct() would return only the name column
# and silently drop id and address; .keep_all = TRUE keeps the complete
# first row for each distinct name.
unique_by_name <- distinct(data1, name, .keep_all = TRUE)
print(unique_by_name)

Output:

Example 3: Remove duplicate rows based on multiple columns

R

# Load dplyr, which provides distinct().
library(dplyr)

# Create a dataframe with three columns named id, name and address.
# Rows 8-10 repeat the (address, name) pairs of rows 1, 4 and 2.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# Remove duplicate rows based on the combination of the address and
# name columns. .keep_all = TRUE keeps the complete first row for each
# distinct (address, name) pair; without it only those two columns
# would be returned and id would be dropped.
unique_by_address_name <- distinct(data1, address, name, .keep_all = TRUE)
print(unique_by_address_name)

Output:

Method 2: using duplicated() function

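The base R duplicated() function returns a logical vector that is TRUE for every element that repeats an earlier one. Negating it with ! and using the result as a row index keeps only the first occurrence of each value, i.e. the unique rows.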

Syntax:

dataframe[!duplicated(dataframe$column_name), ]

Here, dataframe is the input dataframe and column_name is the column whose values are checked for duplicates; every row that repeats an earlier value in that column is removed.

Example: R program to remove duplicate data based on a particular column

R

# duplicated() is part of base R, so this script does not need to load
# any package.

# Create a dataframe with three columns named id, name and address.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# duplicated() marks each value that repeats an earlier one, so indexing
# the rows with its negation keeps the first row seen for each value.

# Remove duplicate rows based on the name column.
unique_by_name <- data1[!duplicated(data1$name), ]
print(unique_by_name)
print("=====================")

# Remove duplicate rows based on the id column.
unique_by_id <- data1[!duplicated(data1$id), ]
print(unique_by_id)
print("=====================")

# Remove duplicate rows based on the address column.
unique_by_address <- data1[!duplicated(data1$address), ]
print(unique_by_address)
print("=====================")

Output:

Method 3: Using unique() function

The base R unique() function removes duplicate rows from a dataframe and returns only the unique rows.

Syntax:

unique(dataframe)

To get the unique values of a single column, pass that column, selected as dataframe$column_name, to unique(). The result is a vector of the distinct values, not a dataframe.

Syntax:

unique(dataframe$column_name)

Where, dataframe is the input dataframe and column_name is the column in the dataframe.

Example 1: R program to remove duplicates using unique() function

R

# unique() is part of base R, so this script does not need to load any
# package.

# Create a dataframe with three columns named id, name and address.
# Rows 8-10 are exact copies of rows 1, 4 and 2.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# Get the unique rows of the dataframe: a row is dropped only when it
# matches an earlier row in every column.
unique_rows <- unique(data1)
print(unique_rows)

Output:

Example 2: R program to get the unique values of particular columns

R

# unique() is part of base R, so this script does not need to load any
# package.

# Create a dataframe with three columns named id, name and address.
data1 <- data.frame(
  id = c(1, 2, 3, 4, 5, 6, 7, 1, 4, 2),
  name = c(
    "sravan", "ojaswi", "bobby", "gnanesh", "rohith",
    "pinkey", "dhanush", "sravan", "gnanesh", "ojaswi"
  ),
  address = c(
    "hyd", "hyd", "ponnur", "tenali", "vijayawada",
    "vijayawada", "guntur", "hyd", "tenali", "hyd"
  )
)

# Each call below returns a vector of the distinct values of one column,
# in order of first appearance; the other columns are not involved.

# Unique values of the id column.
unique_ids <- unique(data1$id)
print(unique_ids)

# Unique values of the name column.
unique_names <- unique(data1$name)
print(unique_names)

# Unique values of the address column.
unique_addresses <- unique(data1$address)
print(unique_addresses)