# Jaro and Jaro-Winkler similarity

Last Updated : 15 Feb, 2022

### Jaro Similarity

Jaro Similarity is a measure of similarity between two strings. Its value ranges from 0 to 1, where 1 means the strings are equal and 0 means there is no similarity between the two strings.

Examples:

Input: s1 = “CRATE”, s2 = “TRACE”;
Output: Jaro Similarity = 0.733333

Input: s1 = “DwAyNE”, s2 = “DuANE”;
Output: Jaro Similarity = 0.822222

Algorithm:
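The Jaro Similarity is calculated using the following formula:

$$sim_j = \begin{cases} 0 & \text{if } m = 0 \\ \dfrac{1}{3}\left(\dfrac{m}{|s_1|} + \dfrac{m}{|s_2|} + \dfrac{m - t}{m}\right) & \text{otherwise} \end{cases}$$
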
where:

• m is the number of matching characters
• t is the number of transpositions, i.e. half the number of matching characters that appear in a different order in the two strings
• |s1| and |s2| are the lengths of strings s1 and s2 respectively.

The characters are said to be matching if they are the same and are not further than $\left\lfloor \frac{\max(|s_1|, |s_2|)}{2} \right\rfloor - 1$ positions apart.
The number of transpositions is half the number of matching characters that appear in both strings but in a different order.
Calculation:

• Let s1=”arnab”, s2=”raanb”, so the maximum distance to which each character is matched is 1.
• It is evident that both the strings have 5 matching characters, but the order is not the same, so the number of characters that are not in order is 4, so the number of transpositions is 2.
• Therefore, Jaro similarity can be calculated as follows:
Jaro Similarity = (1/3) * {(5/5) + (5/5) + (5-2)/5 } = 0.86667

Below is the implementation of the above approach.

## C++

 // C++ implementation of above approach
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Function to calculate the
// Jaro Similarity of two strings.
//
// Returns 1.0 when the strings are equal, 0.0 when either string is
// empty or no character of s1 matches a character of s2 inside the
// allowed window, and otherwise
//   (m/|s1| + m/|s2| + (m - t)/m) / 3
// where m is the number of matches and t the number of transpositions.
double jaro_distance(const std::string& s1, const std::string& s2)
{
    // If the strings are equal
    if (s1 == s2)
        return 1.0;

    // Length of two strings
    const int len1 = static_cast<int>(s1.length());
    const int len2 = static_cast<int>(s2.length());

    // An empty string has nothing to match against
    if (len1 == 0 || len2 == 0)
        return 0.0;

    // Maximum distance upto which matching
    // is allowed (integer division already floors)
    const int max_dist = std::max(len1, len2) / 2 - 1;

    // Count of matches
    int match = 0;

    // Flags marking which characters of each string have been matched.
    // std::vector replaces the original variable-length arrays, which
    // are not standard C++ and cannot carry an initializer.
    std::vector<char> hash_s1(len1, 0);
    std::vector<char> hash_s2(len2, 0);

    // Traverse through the first string
    for (int i = 0; i < len1; i++) {
        const int window_begin = std::max(0, i - max_dist);
        const int window_end = std::min(len2, i + max_dist + 1);

        // Take the first still-unmatched equal character in the window
        for (int j = window_begin; j < window_end; j++) {
            if (s1[i] == s2[j] && !hash_s2[j]) {
                hash_s1[i] = 1;
                hash_s2[j] = 1;
                match++;
                break;
            }
        }
    }

    // If there is no match
    if (match == 0)
        return 0.0;

    // Number of transpositions
    double t = 0;

    int point = 0;

    // Walk the matched characters of both strings in order; every
    // position where they differ counts as half a transposition.
    for (int i = 0; i < len1; i++) {
        if (!hash_s1[i])
            continue;

        // Find the next matched character
        // in second string
        while (!hash_s2[point])
            point++;

        if (s1[i] != s2[point++])
            t++;
    }

    t /= 2;

    // Return the Jaro Similarity
    const double m = static_cast<double>(match);
    return (m / static_cast<double>(len1)
            + m / static_cast<double>(len2)
            + (m - t) / m)
           / 3.0;
}

// Driver code
int main()
{
    const std::string s1 = "CRATE";
    const std::string s2 = "TRACE";

    // Print jaro Similarity of two strings
    std::cout << jaro_distance(s1, s2) << '\n';

    return 0;
}

## Java

 // Java implementation of above approach
class GFG
{

    // Function to calculate the
    // Jaro Similarity of two Strings.
    //
    // Returns 1.0 when the Strings have equal contents, 0.0 when either
    // String is empty or nothing matches inside the allowed window, and
    // otherwise (m/|s1| + m/|s2| + (m - t)/m) / 3, where m is the number
    // of matches and t the number of transpositions.
    static double jaro_distance(String s1, String s2)
    {
        // If the Strings are equal. equals() compares contents;
        // == would only compare object references.
        if (s1.equals(s2))
            return 1.0;

        // Length of two Strings
        int len1 = s1.length();
        int len2 = s2.length();

        // An empty String has nothing to match against
        if (len1 == 0 || len2 == 0)
            return 0.0;

        // Maximum distance upto which matching
        // is allowed (integer division already floors)
        int max_dist = Math.max(len1, len2) / 2 - 1;

        // Count of matches
        int match = 0;

        // Flags marking which characters have been matched
        boolean[] hash_s1 = new boolean[len1];
        boolean[] hash_s2 = new boolean[len2];

        // Traverse through the first String
        for (int i = 0; i < len1; i++)
        {
            int windowBegin = Math.max(0, i - max_dist);
            int windowEnd = Math.min(len2, i + max_dist + 1);

            // Take the first still-unmatched equal character in the window
            for (int j = windowBegin; j < windowEnd; j++)
            {
                if (s1.charAt(i) == s2.charAt(j) && !hash_s2[j])
                {
                    hash_s1[i] = true;
                    hash_s2[j] = true;
                    match++;
                    break;
                }
            }
        }

        // If there is no match
        if (match == 0)
            return 0.0;

        // Number of transpositions
        double t = 0;

        int point = 0;

        // Walk the matched characters of both Strings in order; every
        // position where they differ counts as half a transposition.
        for (int i = 0; i < len1; i++)
        {
            if (!hash_s1[i])
                continue;

            // Find the next matched character
            // in second String
            while (!hash_s2[point])
                point++;

            if (s1.charAt(i) != s2.charAt(point++))
                t++;
        }

        t /= 2;

        // Return the Jaro Similarity
        return (((double)match) / ((double)len1)
                + ((double)match) / ((double)len2)
                + ((double)match - t) / ((double)match))
            / 3.0;
    }

    // Driver code
    public static void main(String[] args)
    {
        String s1 = "CRATE", s2 = "TRACE";

        // Print jaro Similarity of two Strings
        System.out.print(jaro_distance(s1, s2) + "\n");
    }
}

// This code is contributed by PrinciRaj1992

## Python3

 # Python3 implementation of above approach


# Function to calculate the
# Jaro Similarity of two strings
def jaro_distance(s1, s2):
    """Return the Jaro similarity of s1 and s2.

    The result is 1.0 when the strings are equal, 0.0 when either string
    is empty or nothing matches inside the allowed window, and otherwise
    (m/len(s1) + m/len(s2) + (m - t)/m) / 3, where m is the number of
    matches and t the number of transpositions.
    """
    # If the strings are equal
    if s1 == s2:
        return 1.0

    # Length of two strings
    len1 = len(s1)
    len2 = len(s2)

    # An empty string has nothing to match against
    if len1 == 0 or len2 == 0:
        return 0.0

    # Maximum distance upto which matching
    # is allowed
    max_dist = max(len1, len2) // 2 - 1

    # Count of matches
    match = 0

    # Flags marking which characters have been matched
    hash_s1 = [0] * len1
    hash_s2 = [0] * len2

    # Traverse through the first string
    for i in range(len1):

        # Take the first still-unmatched equal character in the window
        for j in range(max(0, i - max_dist),
                       min(len2, i + max_dist + 1)):
            if s1[i] == s2[j] and hash_s2[j] == 0:
                hash_s1[i] = 1
                hash_s2[j] = 1
                match += 1
                break

    # If there is no match
    if match == 0:
        return 0.0

    # Number of transpositions
    t = 0
    point = 0

    # Walk the matched characters of both strings in order; every
    # position where they differ counts as half a transposition.
    for i in range(len1):
        if hash_s1[i]:

            # Find the next matched character
            # in second string
            while hash_s2[point] == 0:
                point += 1

            if s1[i] != s2[point]:
                t += 1
            point += 1

    # True division: an odd mismatch count yields a fractional t,
    # as in the other implementations of this algorithm.
    t /= 2

    # Return the Jaro Similarity
    return (match / len1 + match / len2 +
            (match - t) / match) / 3.0


# Driver code
s1 = "CRATE"
s2 = "TRACE"

# Print Jaro Similarity of two strings
print(round(jaro_distance(s1, s2), 6))

# This code is contributed by mohit kumar 29

## C#

 // C# implementation of above approach
using System;

class GFG
{

    // Function to calculate the
    // Jaro Similarity of two Strings.
    //
    // Returns 1.0 when the Strings are equal, 0.0 when either String is
    // empty or nothing matches inside the allowed window, and otherwise
    // (m/|s1| + m/|s2| + (m - t)/m) / 3, where m is the number of
    // matches and t the number of transpositions.
    public static double jaro_distance(string s1, string s2)
    {
        // If the Strings are equal
        if (s1 == s2)
            return 1.0;

        // Length of two Strings
        int len1 = s1.Length;
        int len2 = s2.Length;

        // An empty String has nothing to match against
        if (len1 == 0 || len2 == 0)
            return 0.0;

        // Maximum distance upto which matching
        // is allowed (integer division already floors)
        int max_dist = Math.Max(len1, len2) / 2 - 1;

        // Count of matches
        int match = 0;

        // Flags marking which characters have been matched
        bool[] hash_s1 = new bool[len1];
        bool[] hash_s2 = new bool[len2];

        // Traverse through the first String
        for (int i = 0; i < len1; i++)
        {
            int windowBegin = Math.Max(0, i - max_dist);
            int windowEnd = Math.Min(len2, i + max_dist + 1);

            // Take the first still-unmatched equal character in the window
            for (int j = windowBegin; j < windowEnd; j++)
            {
                if (s1[i] == s2[j] && !hash_s2[j])
                {
                    hash_s1[i] = true;
                    hash_s2[j] = true;
                    match++;
                    break;
                }
            }
        }

        // If there is no match
        if (match == 0)
            return 0.0;

        // Number of transpositions
        double t = 0;

        int point = 0;

        // Walk the matched characters of both Strings in order; every
        // position where they differ counts as half a transposition.
        for (int i = 0; i < len1; i++)
        {
            if (!hash_s1[i])
                continue;

            // Find the next matched character
            // in second String
            while (!hash_s2[point])
                point++;

            if (s1[i] != s2[point++])
                t++;
        }

        t /= 2;

        // Return the Jaro Similarity
        return (((double)match) / ((double)len1)
                + ((double)match) / ((double)len2)
                + ((double)match - t) / ((double)match))
            / 3.0;
    }

    // Driver code
    public static void Main()
    {
        string s1 = "CRATE", s2 = "TRACE";

        // Print jaro Similarity of two Strings
        Console.WriteLine(jaro_distance(s1, s2));
    }
}

// This code is contributed by AnkitRai01

## Javascript

 

Output:
0.733333

Time Complexity: O(N * M), where N is the length of string s1 and M is the length of string s2.

Auxiliary Space: O(N + M)

### Jaro-Winkler Similarity

The Jaro-Winkler similarity is a string metric for measuring how close two strings are. Jaro-Winkler Similarity is very similar to Jaro Similarity; the two differ only when the strings share a common prefix. Jaro-Winkler Similarity uses a prefix scale ‘p’ that gives a higher score to strings that match from the beginning, for a common prefix up to a defined maximum length l.
Examples:

Input: s1 = “DwAyNE”, s2 = “DuANE”;
Output: Jaro-Winkler Similarity =0.84

Input: s1=”TRATE”, s2=”TRACE”;
Output: Jaro-Winkler similarity = 0.906667

Calculation:

• Jaro Winkler similarity is defined as follows
Sw = Sj + P * L * (1 – Sj)
where,
• Sj, is jaro similarity
• Sw, is jaro- winkler similarity
• P is the scaling factor (0.1 by default)
• L is the length of the matching prefix up to a maximum of 4 characters.
• Let s1=”arnab”, s2=”aranb”. The Jaro similarity of the two strings is 0.933333 (using the Jaro formula above with m = 5 matching characters and t = 1 transposition: (1/3) * {(5/5) + (5/5) + (5-1)/5}).
• The length of the matching prefix is 2 and we take the scaling factor as 0.1.
• Substituting in the formula;
Jaro-Winkler Similarity= 0.9333333 + 0.1 * 2 * (1-0.9333333) = 0.946667

Below is the implementation of the above approach.

## C++

 // C++ implementation of above approach
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Function to calculate the
// Jaro Similarity of two strings.
//
// Returns 1.0 when the strings are equal, 0.0 when either string is
// empty or no character of s1 matches a character of s2 inside the
// allowed window, and otherwise
//   (m/|s1| + m/|s2| + (m - t)/m) / 3
// where m is the number of matches and t the number of transpositions.
double jaro_distance(const std::string& s1, const std::string& s2)
{
    // If the strings are equal
    if (s1 == s2)
        return 1.0;

    // Length of two strings
    const int len1 = static_cast<int>(s1.length());
    const int len2 = static_cast<int>(s2.length());

    if (len1 == 0 || len2 == 0)
        return 0.0;

    // Maximum distance upto which matching
    // is allowed (integer division already floors)
    const int max_dist = std::max(len1, len2) / 2 - 1;

    // Count of matches
    int match = 0;

    // Flags marking which characters of each string have been matched.
    // std::vector replaces the original variable-length arrays, which
    // are not standard C++ and cannot carry an initializer.
    std::vector<char> hash_s1(len1, 0);
    std::vector<char> hash_s2(len2, 0);

    // Traverse through the first string
    for (int i = 0; i < len1; i++) {
        const int window_begin = std::max(0, i - max_dist);
        const int window_end = std::min(len2, i + max_dist + 1);

        // Take the first still-unmatched equal character in the window
        for (int j = window_begin; j < window_end; j++) {
            if (s1[i] == s2[j] && !hash_s2[j]) {
                hash_s1[i] = 1;
                hash_s2[j] = 1;
                match++;
                break;
            }
        }
    }

    // If there is no match
    if (match == 0)
        return 0.0;

    // Number of transpositions
    double t = 0;

    int point = 0;

    // Walk the matched characters of both strings in order; every
    // position where they differ counts as half a transposition.
    for (int i = 0; i < len1; i++) {
        if (!hash_s1[i])
            continue;

        // Find the next matched character
        // in second string
        while (!hash_s2[point])
            point++;

        if (s1[i] != s2[point++])
            t++;
    }

    t /= 2;

    // Return the Jaro Similarity
    const double m = static_cast<double>(match);
    return (m / static_cast<double>(len1)
            + m / static_cast<double>(len2)
            + (m - t) / m)
           / 3.0;
}

// Jaro Winkler Similarity.
//
// Starts from the Jaro similarity and, when it exceeds 0.7, adds
// 0.1 * prefix * (1 - jaro), where prefix is the length of the common
// prefix of s1 and s2 capped at 4 characters.
double jaro_Winkler(const std::string& s1, const std::string& s2)
{
    double jaro_dist = jaro_distance(s1, s2);

    // If the jaro Similarity is above a threshold
    if (jaro_dist > 0.7) {

        // Maximum of 4 characters are allowed in prefix, and the
        // prefix cannot be longer than the shorter string
        const std::size_t limit
            = std::min({ std::size_t{ 4 }, s1.length(), s2.length() });

        // Find the length of common prefix
        std::size_t prefix = 0;
        while (prefix < limit && s1[prefix] == s2[prefix])
            ++prefix;

        // Calculate jaro winkler Similarity
        jaro_dist += 0.1 * static_cast<double>(prefix) * (1 - jaro_dist);
    }

    return jaro_dist;
}

// Driver code
int main()
{
    const std::string s1 = "TRATE";
    const std::string s2 = "TRACE";

    // Print Jaro-Winkler Similarity of two strings
    std::cout << "Jaro-Winkler Similarity ="
              << jaro_Winkler(s1, s2) << '\n';

    return 0;
}

## Java

 // Java implementation of above approach
class GFG
{

    // Function to calculate the
    // Jaro Similarity of two strings.
    //
    // Returns 1.0 when the strings have equal contents, 0.0 when either
    // string is empty or nothing matches inside the allowed window, and
    // otherwise (m/|s1| + m/|s2| + (m - t)/m) / 3, where m is the number
    // of matches and t the number of transpositions.
    static double jaro_distance(String s1, String s2)
    {
        // If the strings are equal. equals() compares contents;
        // == would only compare object references.
        if (s1.equals(s2))
            return 1.0;

        // Length of two strings
        int len1 = s1.length();
        int len2 = s2.length();

        if (len1 == 0 || len2 == 0)
            return 0.0;

        // Maximum distance upto which matching
        // is allowed (integer division already floors)
        int max_dist = Math.max(len1, len2) / 2 - 1;

        // Count of matches
        int match = 0;

        // Flags marking which characters have been matched
        boolean[] hash_s1 = new boolean[len1];
        boolean[] hash_s2 = new boolean[len2];

        // Traverse through the first string
        for (int i = 0; i < len1; i++)
        {
            int windowBegin = Math.max(0, i - max_dist);
            int windowEnd = Math.min(len2, i + max_dist + 1);

            // Take the first still-unmatched equal character in the window
            for (int j = windowBegin; j < windowEnd; j++)
            {
                if (s1.charAt(i) == s2.charAt(j) && !hash_s2[j])
                {
                    hash_s1[i] = true;
                    hash_s2[j] = true;
                    match++;
                    break;
                }
            }
        }

        // If there is no match
        if (match == 0)
            return 0.0;

        // Number of transpositions
        double t = 0;

        int point = 0;

        // Walk the matched characters of both strings in order; every
        // position where they differ counts as half a transposition.
        for (int i = 0; i < len1; i++)
        {
            if (!hash_s1[i])
                continue;

            // Find the next matched character
            // in second string
            while (!hash_s2[point])
                point++;

            if (s1.charAt(i) != s2.charAt(point++))
                t++;
        }

        t /= 2;

        // Return the Jaro Similarity
        return (((double)match) / ((double)len1)
                + ((double)match) / ((double)len2)
                + ((double)match - t) / ((double)match))
            / 3.0;
    }

    // Jaro Winkler Similarity.
    //
    // Starts from the Jaro similarity and, when it exceeds 0.7, adds
    // 0.1 * prefix * (1 - jaro), where prefix is the length of the
    // common prefix of s1 and s2 capped at 4 characters.
    static double jaro_Winkler(String s1, String s2)
    {
        double jaro_dist = jaro_distance(s1, s2);

        // If the jaro Similarity is above a threshold
        if (jaro_dist > 0.7)
        {
            // Maximum of 4 characters are allowed in prefix, and the
            // prefix cannot be longer than the shorter string
            int limit = Math.min(4, Math.min(s1.length(), s2.length()));

            // Find the length of common prefix
            int prefix = 0;
            while (prefix < limit && s1.charAt(prefix) == s2.charAt(prefix))
                prefix++;

            // Calculate jaro winkler Similarity
            jaro_dist += 0.1 * prefix * (1 - jaro_dist);
        }
        return jaro_dist;
    }

    // Driver code
    public static void main(String[] args)
    {
        String s1 = "TRATE", s2 = "TRACE";

        // Print Jaro-Winkler Similarity of two strings
        System.out.println("Jaro-Winkler Similarity =" +
                            jaro_Winkler(s1, s2));
    }
}

// This code is contributed by AnkitRai01

## Python3

 # Python3 implementation of above approach


# Function to calculate the
# Jaro Similarity of two strings
def jaro_distance(s1, s2):
    """Return the Jaro similarity of s1 and s2.

    The result is 1.0 when the strings are equal, 0.0 when either string
    is empty or nothing matches inside the allowed window, and otherwise
    (m/len(s1) + m/len(s2) + (m - t)/m) / 3, where m is the number of
    matches and t the number of transpositions.
    """
    # If the strings are equal
    if s1 == s2:
        return 1.0

    # Length of two strings
    len1 = len(s1)
    len2 = len(s2)

    if len1 == 0 or len2 == 0:
        return 0.0

    # Maximum distance upto which matching
    # is allowed
    max_dist = max(len1, len2) // 2 - 1

    # Count of matches
    match = 0

    # Flags marking which characters have been matched
    hash_s1 = [0] * len1
    hash_s2 = [0] * len2

    # Traverse through the first string
    for i in range(len1):

        # Take the first still-unmatched equal character in the window
        for j in range(max(0, i - max_dist),
                       min(len2, i + max_dist + 1)):
            if s1[i] == s2[j] and hash_s2[j] == 0:
                hash_s1[i] = 1
                hash_s2[j] = 1
                match += 1
                break

    # If there is no match
    if match == 0:
        return 0.0

    # Number of transpositions
    t = 0
    point = 0

    # Walk the matched characters of both strings in order; every
    # position where they differ counts as half a transposition.
    for i in range(len1):
        if hash_s1[i]:

            # Find the next matched character
            # in second string
            while hash_s2[point] == 0:
                point += 1

            if s1[i] != s2[point]:
                t += 1
            point += 1

    t /= 2

    # Return the Jaro Similarity
    return (match / len1 + match / len2 +
            (match - t) / match) / 3.0


# Jaro Winkler Similarity
def jaro_Winkler(s1, s2):
    """Return the Jaro-Winkler similarity of s1 and s2.

    Starts from the Jaro similarity and, when it exceeds 0.7, adds
    0.1 * prefix * (1 - jaro), where prefix is the length of the common
    prefix of s1 and s2 capped at 4 characters.
    """
    jaro_dist = jaro_distance(s1, s2)

    # If the jaro Similarity is above a threshold
    if jaro_dist > 0.7:

        # Find the length of common prefix
        prefix = 0

        for i in range(min(len(s1), len(s2))):

            # If the characters match
            if s1[i] == s2[i]:
                prefix += 1

            # Else break
            else:
                break

        # Maximum of 4 characters are allowed in prefix
        prefix = min(4, prefix)

        # Calculate jaro winkler Similarity
        jaro_dist += 0.1 * prefix * (1 - jaro_dist)

    return jaro_dist


# Driver code
if __name__ == "__main__":

    s1 = "TRATE"
    s2 = "TRACE"

    # Print Jaro-Winkler Similarity of two strings
    print("Jaro-Winkler Similarity =", jaro_Winkler(s1, s2))

# This code is contributed by AnkitRai01

## C#

 // C# implementation of above approach
using System;

class GFG
{

    // Function to calculate the
    // Jaro Similarity of two strings.
    //
    // Returns 1.0 when the strings are equal, 0.0 when either string is
    // empty or nothing matches inside the allowed window, and otherwise
    // (m/|s1| + m/|s2| + (m - t)/m) / 3, where m is the number of
    // matches and t the number of transpositions.
    public static double jaro_distance(string s1, string s2)
    {
        // If the strings are equal
        if (s1 == s2)
            return 1.0;

        // Length of two strings
        int len1 = s1.Length;
        int len2 = s2.Length;

        if (len1 == 0 || len2 == 0)
            return 0.0;

        // Maximum distance upto which matching
        // is allowed (integer division already floors)
        int max_dist = Math.Max(len1, len2) / 2 - 1;

        // Count of matches
        int match = 0;

        // Flags marking which characters have been matched
        bool[] hash_s1 = new bool[len1];
        bool[] hash_s2 = new bool[len2];

        // Traverse through the first string
        for (int i = 0; i < len1; i++)
        {
            int windowBegin = Math.Max(0, i - max_dist);
            int windowEnd = Math.Min(len2, i + max_dist + 1);

            // Take the first still-unmatched equal character in the window
            for (int j = windowBegin; j < windowEnd; j++)
            {
                if (s1[i] == s2[j] && !hash_s2[j])
                {
                    hash_s1[i] = true;
                    hash_s2[j] = true;
                    match++;
                    break;
                }
            }
        }

        // If there is no match
        if (match == 0)
            return 0.0;

        // Number of transpositions
        double t = 0;

        int point = 0;

        // Walk the matched characters of both strings in order; every
        // position where they differ counts as half a transposition.
        for (int i = 0; i < len1; i++)
        {
            if (!hash_s1[i])
                continue;

            // Find the next matched character
            // in second string
            while (!hash_s2[point])
                point++;

            if (s1[i] != s2[point++])
                t++;
        }

        t /= 2;

        // Return the Jaro Similarity
        return (((double)match) / ((double)len1)
                + ((double)match) / ((double)len2)
                + ((double)match - t) / ((double)match))
            / 3.0;
    }

    // Jaro Winkler Similarity.
    //
    // Starts from the Jaro similarity and, when it exceeds 0.7, adds
    // 0.1 * prefix * (1 - jaro), where prefix is the length of the
    // common prefix of s1 and s2 capped at 4 characters.
    public static double jaro_Winkler(string s1, string s2)
    {
        double jaro_dist = jaro_distance(s1, s2);

        // If the jaro Similarity is above a threshold
        if (jaro_dist > 0.7)
        {
            // Maximum of 4 characters are allowed in prefix, and the
            // prefix cannot be longer than the shorter string
            int limit = Math.Min(4, Math.Min(s1.Length, s2.Length));

            // Find the length of common prefix
            int prefix = 0;
            while (prefix < limit && s1[prefix] == s2[prefix])
                prefix++;

            // Calculate jaro winkler Similarity
            jaro_dist += 0.1 * prefix * (1 - jaro_dist);
        }
        return jaro_dist;
    }

    // Driver code
    public static void Main()
    {
        string s1 = "TRATE", s2 = "TRACE";

        // Print Jaro-Winkler Similarity of two strings
        Console.WriteLine("Jaro-Winkler Similarity =" +
                           jaro_Winkler(s1, s2));
    }
}

// This code is contributed by AnkitRai01

## Javascript

 

Output:
Jaro-Winkler Similarity =0.906667

Time Complexity: O(N * M), where N is the length of string s1 and M is the length of string s2.
Auxiliary Space: O(N + M)

Previous
Next