Open In App

How to do Fuzzy Matching on Pandas Dataframe Column Using Python?

Prerequisite: FuzzyWuzzy

In this tutorial, we will learn how to do fuzzy matching on the pandas DataFrame column using Python. Fuzzy matching is a process that lets us identify the matches which are not exact but find a given pattern in our target item. Fuzzy matching is the basis of search engines. That is why we get many recommendations or suggestions as we type our search query in any browser.



Functions Used

Examples

Example 1: (Basic Approach)




from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
  
dict1 = {'name': ["aparna", "pankaj"
                  "sudhir", "Geeku"]}
  
dict2 = {'name': ["aparn", "arup", "Pankaj",
                  "sudhir c", "Geek", "abc"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the
# matches later
mat1 = []
mat2 = []
  
# printing the pandas dataframes
dframe1.show()
dframe2.show()

dframe1:



dframe2:




list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 80
threshold = 80

Output:




# iterating through list1 to extract 
# it's closest match from list2
for i in list1:
    mat1.append(process.extract(i, list2, limit=2))
dframe1['matches'] = mat1
  
dframe1.show()

Output:




# iterating through the closest
# matches to filter out the
# maximum closest match
for j in dframe1['matches']:
    for k in j:
        
        if k[1] >= threshold:
            p.append(k[0])
              
    mat2.append(",".join(p))
    p = []
      
# storing the resultant matches 
# back to dframe1
dframe1['matches'] = mat2
  
dframe1.show()

Output:

Example 2:

In this example, the steps are the same as in example one. Only the difference is that there are multiple matches for a particular row item like for ‘mango’ and ‘choco’. We set the threshold=82 to increase fuzzy match accuracy.




import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
  
# creating the dictionaries
dict1 = {'name': ["mango", "coco", "choco", "peanut", "apple"]}
dict2 = {'name': ["mango fruit", "coconut", "chocolate",
                  "mangoes", "chocos", "peanuts", "appl"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the matches later
mat1 = []
mat2 = []
p = []
  
# printing the pandas dataframes
print("First dataframe:\n", dframe1, 
      "\nSecond dataframe:\n", dframe2)
  
# converting dataframe column to list
# of elements
# to do fuzzy matching
list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 82
threshold = 82
  
# iterating through list1 to extract 
# it's closest match from list2
for i in list1:
    mat1.append(process.extract(i, list2, limit=2))
dframe1['matches'] = mat1
  
# iterating through the closest matches
# to filter out the maximum closest match
for j in dframe1['matches']:
    for k in j:
        if k[1] >= threshold:
            p.append(k[0])
    mat2.append(",".join(p))
    p = []
  
  
# storing the resultant matches back to dframe1
dframe1['matches'] = mat2
print("\nDataFrame after Fuzzy matching:")
dframe1

Output:

Now we will use the process.extractOne() method to match only the closest between the two dataframes. Inside this method we will apply different fuzzy matching functions which are as follows:

Example 3: Using fuzz.ratio()




import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
  
# creating the dictionaries
dict1 = {'name': ["aparna", "pankaj", "sudhir",
                  "Geeku", "geeks for geeks"]}
  
dict2 = {'name': ["aparn", "arup", "Pankaj",
                  "for geeks geeks", "sudhir c",
                  "geeks geeks"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the matches 
# later
mat1 = []
mat2 = []
p = []
  
# printing the pandas dataframes
print("First dataframe:\n", dframe1,
      "\nSecond dataframe:\n", dframe2)
  
# converting dataframe column to 
# list of elements
# to do fuzzy matching
list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 80
threshold = 80
  
# iterating through list1 to extract 
# it's closest match from list2
for i in list1:
    mat1.append(process.extractOne(i, list2, scorer=fuzz.ratio))
dframe1['matches'] = mat1
  
# iterating through the closest matches
# to filter out the maximum closest match
for j in dframe1['matches']:
    if j[1] >= threshold:
        p.append(j[0])
    mat2.append(",".join(p))
    p = []
  
  
# storing the resultant matches back to dframe1
dframe1['matches'] = mat2
print("\nDataFrame after Fuzzy matching using fuzz.ratio():")
dframe1

Output:

Example 4: Using fuzz.partial_ratio()




import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
  
# creating the dictionaries
dict1 = {'name': ["aparna", "pankaj", "sudhir"
                  "Geeku", "geeks for geeks"]}
  
dict2 = {'name': ["aparn", "arup", "Pankaj",
                  "for geeks geeks", "sudhir c"
                  "geeks geeks"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the matches 
# later
mat1 = []
mat2 = []
p = []
  
# printing the pandas dataframes
print("First dataframe:\n", dframe1,
      "\nSecond dataframe:\n", dframe2)
  
# converting dataframe column to 
# list of elements
# to do fuzzy matching
list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 80
threshold = 80
  
# iterating through list1 to extract 
# it's closest match from list2
for i in list1:
    mat1.append(process.extractOne(
      i, list2, scorer=fuzz.partial_ratio))
dframe1['matches'] = mat1
  
# iterating through the closest matches
# to filter out the maximum closest match
for j in dframe1['matches']:
    if j[1] >= threshold:
        p.append(j[0])
    mat2.append(",".join(p))
    p = []
  
  
# storing the resultant matches back to dframe1
dframe1['matches'] = mat2
print("\nDataFrame after Fuzzy matching using fuzz.partial_ratio:")
dframe1

Output:

Example 5: Using fuzz.token_sort_ratio()




import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
  
# creating the dictionaries
dict1 = {'name': ["aparna", "pankaj", "sudhir"
                  "Geeku", "geeks for geeks"]}
  
dict2 = {'name': ["aparn", "arup", "Pankaj",
                  "for geeks geeks", "sudhir c",
                  "Geek"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the matches
# later
mat1 = []
mat2 = []
p = []
  
# printing the pandas dataframes
print("First dataframe:\n", dframe1,
      "\nSecond dataframe:\n", dframe2)
  
# converting dataframe column to 
# list of elements
# to do fuzzy matching
list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 80
threshold = 80
  
# iterating through list1 to extract
# it's closest match from list2
for i in list1:
    mat1.append(process.extractOne(
      i, list2, scorer=fuzz.token_sort_ratio))
dframe1['matches'] = mat1
  
# iterating through the closest matches
# to filter out the maximum closest match
for j in dframe1['matches']:
    if j[1] >= threshold:
        p.append(j[0])
    mat2.append(",".join(p))
    p = []
  
  
# storing the resultant matches back 
# to dframe1
dframe1['matches'] = mat2
print("\nDataFrame after Fuzzy matching using fuzz.token_sort_ratio:")
dframe1

Output:

Example 6: Using fuzz.token_set_ratio()




import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
  
# creating the dictionaries
dict1 = {'name': ["aparna", "pankaj", "Geeku"
                  "geeks for geeks"]}
  
dict2 = {'name': ["aparn", "arup", "Pankaj",
                  "geeks for for geeks"
                  "geeks for geeks", "Geek"]}
  
# converting to pandas dataframes
dframe1 = pd.DataFrame(dict1)
dframe2 = pd.DataFrame(dict2)
  
# empty lists for storing the matches
# later
mat1 = []
mat2 = []
p = []
  
# printing the pandas dataframes
print("First dataframe:\n", dframe1,
      "\nSecond dataframe:\n", dframe2)
  
# converting dataframe column 
# to list of elements
# to do fuzzy matching
list1 = dframe1['name'].tolist()
list2 = dframe2['name'].tolist()
  
# taking the threshold as 80
threshold = 80
  
# iterating through list1 to extract
# it's closest match from list2
for i in list1:
    mat1.append(process.extractOne(
      i, list2, scorer=fuzz.token_set_ratio))
dframe1['matches'] = mat1
  
# iterating through the closest matches
# to filter out the maximum closest match
for j in dframe1['matches']:
    if j[1] >= threshold:
        p.append(j[0])
    mat2.append(",".join(p))
    p = []
  
  
# storing the resultant matches back 
# to dframe1
dframe1['matches'] = mat2
print("\nDataFrame after Fuzzy matching using token_set_ratio():")
dframe1

Output:


Article Tags :