Jaro Similarity
Jaro Similarity is the measure of similarity between two strings. The value of Jaro distance ranges from 0 to 1. where 1 means the strings are equal and 0 means no similarity between the two strings.
Examples:
Input: s1 = “CRATE”, s2 = “TRACE”;
Output: Jaro Similarity = 0.733333
Input: s1 = “DwAyNE”, s2 = “DuANE”;
Output: Jaro Similarity = 0.822222
Algorithm:
The Jaro Similarity is calculated using the following formula
where:
- m is the number of matching characters
- t is half the number of transpositions
- where |s1| and |s2| are the lengths of strings s1 and s2 respectively.
The characters are said to be matching if they are the same and the characters are not further than
Transpositions are half the number of matching characters in both strings but in a different order.
Calculation:
- Let s1=”arnab”, s2=”raanb”, so the maximum distance to which each character is matched is 1.
- It is evident that both the strings have 5 matching characters, but the order is not the same, so the number of characters that are not in order is 4, so the number of transpositions is 2.
- Therefore, Jaro similarity can be calculated as follows:
Jaro Similarity = (1/3) * {(5/5) + (5/5) + (5-2)/5 } = 0.86667
Below is the implementation of the above approach.
// C++ implementation of above approach #include <bits/stdc++.h> using namespace std;
// Function to calculate the // Jaro Similarity of two strings double jaro_distance(string s1, string s2)
{ // If the strings are equal
if (s1 == s2)
return 1.0;
// Length of two strings
int len1 = s1.length(),
len2 = s2.length();
// Maximum distance upto which matching
// is allowed
int max_dist = floor (max(len1, len2) / 2) - 1;
// Count of matches
int match = 0;
// Hash for matches
int hash_s1[s1.length()] = { 0 },
hash_s2[s2.length()] = { 0 };
// Traverse through the first string
for ( int i = 0; i < len1; i++) {
// Check if there is any matches
for ( int j = max(0, i - max_dist);
j < min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] && hash_s2[j] == 0) {
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
double t = 0;
int point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0; i < len1; i++)
if (hash_s1[i]) {
// Find the next matched character
// in second string
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++])
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0;
} // Driver code int main()
{ string s1 = "CRATE" , s2 = "TRACE" ;
// Print jaro Similarity of two strings
cout << jaro_distance(s1, s2) << endl;
return 0;
} |
// Java implementation of above approach class GFG
{ // Function to calculate the // Jaro Similarity of two Strings static double jaro_distance(String s1, String s2)
{ // If the Strings are equal
if (s1 == s2)
return 1.0 ;
// Length of two Strings
int len1 = s1.length(),
len2 = s2.length();
// Maximum distance upto which matching
// is allowed
int max_dist = ( int ) (Math.floor(Math.max(len1, len2) / 2 ) - 1 );
// Count of matches
int match = 0 ;
// Hash for matches
int hash_s1[] = new int [s1.length()];
int hash_s2[] = new int [s2.length()];
// Traverse through the first String
for ( int i = 0 ; i < len1; i++)
{
// Check if there is any matches
for ( int j = Math.max( 0 , i - max_dist);
j < Math.min(len2, i + max_dist + 1 ); j++)
// If there is a match
if (s1.charAt(i) == s2.charAt(j) && hash_s2[j] == 0 )
{
hash_s1[i] = 1 ;
hash_s2[j] = 1 ;
match++;
break ;
}
}
// If there is no match
if (match == 0 )
return 0.0 ;
// Number of transpositions
double t = 0 ;
int point = 0 ;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0 ; i < len1; i++)
if (hash_s1[i] == 1 )
{
// Find the next matched character
// in second String
while (hash_s2[point] == 0 )
point++;
if (s1.charAt(i) != s2.charAt(point++) )
t++;
}
t /= 2 ;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0 ;
} // Driver code public static void main(String[] args)
{ String s1 = "CRATE" , s2 = "TRACE" ;
// Print jaro Similarity of two Strings
System.out.print(jaro_distance(s1, s2) + "\n" );
} } // This code is contributed by PrinciRaj1992 |
# Python3 implementation of above approach from math import floor, ceil
# Function to calculate the # Jaro Similarity of two s def jaro_distance(s1, s2):
# If the s are equal
if (s1 = = s2):
return 1.0
# Length of two s
len1 = len (s1)
len2 = len (s2)
# Maximum distance upto which matching
# is allowed
max_dist = floor( max (len1, len2) / 2 ) - 1
# Count of matches
match = 0
# Hash for matches
hash_s1 = [ 0 ] * len (s1)
hash_s2 = [ 0 ] * len (s2)
# Traverse through the first
for i in range (len1):
# Check if there is any matches
for j in range ( max ( 0 , i - max_dist),
min (len2, i + max_dist + 1 )):
# If there is a match
if (s1[i] = = s2[j] and hash_s2[j] = = 0 ):
hash_s1[i] = 1
hash_s2[j] = 1
match + = 1
break
# If there is no match
if (match = = 0 ):
return 0.0
# Number of transpositions
t = 0
point = 0
# Count number of occurrences
# where two characters match but
# there is a third matched character
# in between the indices
for i in range (len1):
if (hash_s1[i]):
# Find the next matched character
# in second
while (hash_s2[point] = = 0 ):
point + = 1
if (s1[i] ! = s2[point]):
t + = 1
point + = 1
t = t / / 2
# Return the Jaro Similarity
return (match / len1 + match / len2 +
(match - t) / match) / 3.0
# Driver code s1 = "CRATE"
s2 = "TRACE"
# Prjaro Similarity of two s print ( round (jaro_distance(s1, s2), 6 ))
# This code is contributed by mohit kumar 29 |
// C# implementation of above approach using System;
class GFG
{ // Function to calculate the
// Jaro Similarity of two Strings
static double jaro_distance( string s1, string s2)
{
// If the Strings are equal
if (s1 == s2)
return 1.0;
// Length of two Strings
int len1 = s1.Length ;
int len2 = s2.Length;
// Maximum distance upto which matching
// is allowed
int max_dist = ( int )(Math.Floor(( double )(
(Math.Max(len1, len2) / 2) - 1)));
// Count of matches
int match = 0;
// Hash for matches
int []hash_s1 = new int [s1.Length];
int []hash_s2 = new int [s2.Length];
// Traverse through the first String
for ( int i = 0; i < len1; i++)
{
// Check if there is any matches
for ( int j = Math.Max(0, i - max_dist);
j < Math.Min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] && hash_s2[j] == 0)
{
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
double t = 0;
int point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0; i < len1; i++)
if (hash_s1[i] == 1)
{
// Find the next matched character
// in second String
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++] )
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0;
}
// Driver code
public static void Main()
{
string s1 = "CRATE" , s2 = "TRACE" ;
// Print jaro Similarity of two Strings
Console.WriteLine(jaro_distance(s1, s2));
}
} // This code is contributed by AnkitRai01 |
<script> // Javascript implementation of above approach // Function to calculate the // Jaro Similarity of two strings function jaro_distance(s1, s2)
{ // If the strings are equal
if (s1 == s2)
return 1.0;
// Length of two strings
var len1 = s1.length,
len2 = s2.length;
// Maximum distance upto which matching
// is allowed
var max_dist = Math.floor(Math.max(len1, len2) / 2) - 1;
// Count of matches
var match = 0;
// Hash for matches
var hash_s1 = Array(s1.length).fill(0);
var hash_s2 = Array(s1.length).fill(0);
// Traverse through the first string
for ( var i = 0; i < len1; i++) {
// Check if there is any matches
for ( var j = Math.max(0, i - max_dist);
j < Math.min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] && hash_s2[j] == 0) {
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
var t = 0;
var point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( var i = 0; i < len1; i++)
if (hash_s1[i]) {
// Find the next matched character
// in second string
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++])
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((match) / (len1)
+ (match) / (len2)
+ (match - t) / (match))
/ 3.0;
} // Driver code var s1 = "CRATE" , s2 = "TRACE" ;
// Print jaro Similarity of two strings document.write( jaro_distance(s1, s2).toFixed(5)); </script> |
Output:
0.733333
Time Complexity: O(N * M), where N is the length of string s1 and M is the length of string s2.
Auxiliary Space: O(N + M)
Jaro-Winkler Similarity
The Jaro-Winkler similarity is a string metric measuring edit distance between two strings. Jaro – Winkler Similarity is much similar to Jaro Similarity. They both differ when the prefix of two string match. Jaro – Winkler Similarity uses a prefix scale ‘p’ which gives a more accurate answer when the strings have a common prefix up to a defined maximum length l.
Examples:
Input: s1 = “DwAyNE”, s2 = “DuANE”;
Output: Jaro-Winkler Similarity =0.84
Input: s1=”TRATE”, s2=”TRACE”;
Output: Jaro-Winkler similarity = 0.906667
Calculation:
- Jaro Winkler similarity is defined as follows
Sw = Sj + P * L * (1 – Sj)
where,- Sj, is jaro similarity
- Sw, is jaro- winkler similarity
- P is the scaling factor (0.1 by default)
- L is the length of the matching prefix up to a maximum of 4 characters.
- Let s1=”arnab”, s2=”aranb”. The Jaro similarity of the two strings is 0.933333 (From the above calculation.)
- The length of the matching prefix is 2 and we take the scaling factor as 0.1.
- Substituting in the formula;
Jaro-Winkler Similarity= 0.9333333 + 0.1 * 2 * (1-0.9333333) = 0.946667
Below is the implementation of the above approach.
// C++ implementation of above approach #include <bits/stdc++.h> using namespace std;
// Function to calculate the // Jaro Similarity of two strings double jaro_distance(string s1, string s2)
{ // If the strings are equal
if (s1 == s2)
return 1.0;
// Length of two strings
int len1 = s1.length(),
len2 = s2.length();
if (len1 == 0 || len2 == 0)
return 0.0;
// Maximum distance upto which matching
// is allowed
int max_dist = floor (max(len1, len2) / 2) - 1;
// Count of matches
int match = 0;
// Hash for matches
int hash_s1[s1.length()] = { 0 },
hash_s2[s2.length()] = { 0 };
// Traverse through the first string
for ( int i = 0; i < len1; i++) {
// Check if there is any matches
for ( int j = max(0, i - max_dist);
j < min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] && hash_s2[j] == 0) {
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
double t = 0;
int point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0; i < len1; i++)
if (hash_s1[i]) {
// Find the next matched character
// in second string
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++])
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0;
} // Jaro Winkler Similarity double jaro_Winkler(string s1, string s2)
{ double jaro_dist = jaro_distance(s1, s2);
// If the jaro Similarity is above a threshold
if (jaro_dist > 0.7) {
// Find the length of common prefix
int prefix = 0;
for ( int i = 0;
i < min(s1.length(), s2.length()); i++) {
// If the characters match
if (s1[i] == s2[i])
prefix++;
// Else break
else
break ;
}
// Maximum of 4 characters are allowed in prefix
prefix = min(4, prefix);
// Calculate jaro winkler Similarity
jaro_dist += 0.1 * prefix * (1 - jaro_dist);
}
return jaro_dist;
} // Driver code int main()
{ string s1 = "TRATE" , s2 = "TRACE" ;
// Print Jaro-Winkler Similarity of two strings
cout << "Jaro-Winkler Similarity ="
<< jaro_Winkler(s1, s2) << endl;
return 0;
} |
// Java implementation of above approach class GFG
{ // Function to calculate the
// Jaro Similarity of two strings
static double jaro_distance(String s1, String s2)
{
// If the strings are equal
if (s1 == s2)
return 1.0 ;
// Length of two strings
int len1 = s1.length(),
len2 = s2.length();
if (len1 == 0 || len2 == 0 )
return 0.0 ;
// Maximum distance upto which matching
// is allowed
int max_dist = ( int )Math.floor(Math.max(len1, len2) / 2 ) - 1 ;
// Count of matches
int match = 0 ;
// Hash for matches
int hash_s1[] = new int [s1.length()];
int hash_s2[] = new int [s2.length()];
// Traverse through the first string
for ( int i = 0 ; i < len1; i++)
{
// Check if there is any matches
for ( int j = Math.max( 0 , i - max_dist);
j < Math.min(len2, i + max_dist + 1 ); j++)
// If there is a match
if (s1.charAt(i) == s2.charAt(j) &&
hash_s2[j] == 0 )
{
hash_s1[i] = 1 ;
hash_s2[j] = 1 ;
match++;
break ;
}
}
// If there is no match
if (match == 0 )
return 0.0 ;
// Number of transpositions
double t = 0 ;
int point = 0 ;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0 ; i < len1; i++)
if (hash_s1[i] == 1 )
{
// Find the next matched character
// in second string
while (hash_s2[point] == 0 )
point++;
if (s1.charAt(i) != s2.charAt(point++))
t++;
}
t /= 2 ;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0 ;
}
// Jaro Winkler Similarity
static double jaro_Winkler(String s1, String s2)
{
double jaro_dist = jaro_distance(s1, s2);
// If the jaro Similarity is above a threshold
if (jaro_dist > 0.7 )
{
// Find the length of common prefix
int prefix = 0 ;
for ( int i = 0 ;
i < Math.min(s1.length(), s2.length()); i++)
{
// If the characters match
if (s1.charAt(i) == s2.charAt(i))
prefix++;
// Else break
else
break ;
}
// Maximum of 4 characters are allowed in prefix
prefix = Math.min( 4 , prefix);
// Calculate jaro winkler Similarity
jaro_dist += 0.1 * prefix * ( 1 - jaro_dist);
}
return jaro_dist;
}
// Driver code
public static void main (String[] args)
{
String s1 = "TRATE" , s2 = "TRACE" ;
// Print Jaro-Winkler Similarity of two strings
System.out.println( "Jaro-Winkler Similarity =" +
jaro_Winkler(s1, s2));
}
} // This code is contributed by AnkitRai01 |
# Python3 implementation of above approach from math import floor
# Function to calculate the # Jaro Similarity of two strings def jaro_distance(s1, s2) :
# If the strings are equal
if (s1 = = s2) :
return 1.0 ;
# Length of two strings
len1 = len (s1);
len2 = len (s2);
if (len1 = = 0 or len2 = = 0 ) :
return 0.0 ;
# Maximum distance upto which matching
# is allowed
max_dist = ( max ( len (s1), len (s2)) / / 2 ) - 1 ;
# Count of matches
match = 0 ;
# Hash for matches
hash_s1 = [ 0 ] * len (s1) ;
hash_s2 = [ 0 ] * len (s2) ;
# Traverse through the first string
for i in range (len1) :
# Check if there is any matches
for j in range ( max ( 0 , i - max_dist),
min (len2, i + max_dist + 1 )) :
# If there is a match
if (s1[i] = = s2[j] and hash_s2[j] = = 0 ) :
hash_s1[i] = 1 ;
hash_s2[j] = 1 ;
match + = 1 ;
break ;
# If there is no match
if (match = = 0 ) :
return 0.0 ;
# Number of transpositions
t = 0 ;
point = 0 ;
# Count number of occurrences
# where two characters match but
# there is a third matched character
# in between the indices
for i in range (len1) :
if (hash_s1[i]) :
# Find the next matched character
# in second string
while (hash_s2[point] = = 0 ) :
point + = 1 ;
if (s1[i] ! = s2[point]) :
point + = 1 ;
t + = 1 ;
else :
point + = 1 ;
t / = 2 ;
# Return the Jaro Similarity
return ((match / len1 + match / len2 +
(match - t) / match ) / 3.0 );
# Jaro Winkler Similarity def jaro_Winkler(s1, s2) :
jaro_dist = jaro_distance(s1, s2);
# If the jaro Similarity is above a threshold
if (jaro_dist > 0.7 ) :
# Find the length of common prefix
prefix = 0 ;
for i in range ( min ( len (s1), len (s2))) :
# If the characters match
if (s1[i] = = s2[i]) :
prefix + = 1 ;
# Else break
else :
break ;
# Maximum of 4 characters are allowed in prefix
prefix = min ( 4 , prefix);
# Calculate jaro winkler Similarity
jaro_dist + = 0.1 * prefix * ( 1 - jaro_dist);
return jaro_dist;
# Driver code if __name__ = = "__main__" :
s1 = "TRATE" ; s2 = "TRACE" ;
# Print Jaro-Winkler Similarity of two strings
print ( "Jaro-Winkler Similarity =" , jaro_Winkler(s1, s2)) ;
# This code is contributed by AnkitRai01 |
// C# implementation of above approach using System;
class GFG
{ // Function to calculate the
// Jaro Similarity of two strings
static double jaro_distance( string s1, string s2)
{
// If the strings are equal
if (s1 == s2)
return 1.0;
// Length of two strings
int len1 = s1.Length,
len2 = s2.Length;
if (len1 == 0 || len2 == 0)
return 0.0;
// Maximum distance upto which matching
// is allowed
int max_dist = ( int )Math.Floor(( double )
Math.Max(len1, len2) / 2) - 1;
// Count of matches
int match = 0;
// Hash for matches
int []hash_s1 = new int [s1.Length];
int []hash_s2 = new int [s2.Length];
// Traverse through the first string
for ( int i = 0; i < len1; i++)
{
// Check if there is any matches
for ( int j = Math.Max(0, i - max_dist);
j < Math.Min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] &&
hash_s2[j] == 0)
{
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
double t = 0;
int point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for ( int i = 0; i < len1; i++)
if (hash_s1[i] == 1)
{
// Find the next matched character
// in second string
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++])
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((( double )match) / (( double )len1)
+ (( double )match) / (( double )len2)
+ (( double )match - t) / (( double )match))
/ 3.0;
}
// Jaro Winkler Similarity
static double jaro_Winkler( string s1, string s2)
{
double jaro_dist = jaro_distance(s1, s2);
// If the jaro Similarity is above a threshold
if (jaro_dist > 0.7)
{
// Find the length of common prefix
int prefix = 0;
for ( int i = 0; i < Math.Min(s1.Length,
s2.Length); i++)
{
// If the characters match
if (s1[i] == s2[i])
prefix++;
// Else break
else
break ;
}
// Maximum of 4 characters are allowed in prefix
prefix = Math.Min(4, prefix);
// Calculate jaro winkler Similarity
jaro_dist += 0.1 * prefix * (1 - jaro_dist);
}
return jaro_dist;
}
// Driver code
public static void Main ()
{
string s1 = "TRATE" , s2 = "TRACE" ;
// Print Jaro-Winkler Similarity of two strings
Console.WriteLine( "Jaro-Winkler Similarity =" +
jaro_Winkler(s1, s2));
}
} // This code is contributed by AnkitRai01 |
<script> // Javascript implementation of above approach
// Function to calculate the
// Jaro Similarity of two strings
function jaro_distance(s1, s2)
{
// If the strings are equal
if (s1 == s2)
return 1.0;
// Length of two strings
let len1 = s1.length, len2 = s2.length;
if (len1 == 0 || len2 == 0)
return 0.0;
// Maximum distance upto which matching
// is allowed
let max_dist = Math.floor(Math.max(len1, len2) / 2) - 1;
// Count of matches
let match = 0;
// Hash for matches
let hash_s1 = new Array(s1.length);
hash_s1.fill(0);
let hash_s2 = new Array(s2.length);
hash_s2.fill(0);
// Traverse through the first string
for (let i = 0; i < len1; i++)
{
// Check if there is any matches
for (let j = Math.max(0, i - max_dist);
j < Math.min(len2, i + max_dist + 1); j++)
// If there is a match
if (s1[i] == s2[j] &&
hash_s2[j] == 0)
{
hash_s1[i] = 1;
hash_s2[j] = 1;
match++;
break ;
}
}
// If there is no match
if (match == 0)
return 0.0;
// Number of transpositions
let t = 0;
let point = 0;
// Count number of occurrences
// where two characters match but
// there is a third matched character
// in between the indices
for (let i = 0; i < len1; i++)
if (hash_s1[i] == 1)
{
// Find the next matched character
// in second string
while (hash_s2[point] == 0)
point++;
if (s1[i] != s2[point++])
t++;
}
t /= 2;
// Return the Jaro Similarity
return ((match) / (len1)
+ (match) / (len2)
+ (match - t) / (match))
/ 3.0;
}
// Jaro Winkler Similarity
function jaro_Winkler(s1, s2)
{
let jaro_dist = jaro_distance(s1, s2);
// If the jaro Similarity is above a threshold
if (jaro_dist > 0.7)
{
// Find the length of common prefix
let prefix = 0;
for (let i = 0; i < Math.min(s1.length,s2.length); i++)
{
// If the characters match
if (s1[i] == s2[i])
prefix++;
// Else break
else
break ;
}
// Maximum of 4 characters are allowed in prefix
prefix = Math.min(4, prefix);
// Calculate jaro winkler Similarity
jaro_dist += 0.1 * prefix * (1 - jaro_dist);
}
return jaro_dist.toFixed(6);
}
let s1 = "TRATE" , s2 = "TRACE" ;
// Print Jaro-Winkler Similarity of two strings
document.write( "Jaro-Winkler Similarity =" +
jaro_Winkler(s1, s2));
</script> |
Output:
Jaro-Winkler Similarity =0.906667
Time Complexity: O(N * M), where N is the length of string s1 and M is the length of string s2.
Auxiliary Space: O(N + M)