Open In App

Find the Longest Common Substring using Binary search and Rolling Hash

Last Updated : 17 Dec, 2023
Improve
Improve
Like Article
Like
Save
Share
Report

Given two strings X and Y, the task is to find the length of the longest common substring. 

Examples:

Input: X = “GeeksforGeeks”, y = “GeeksQuiz” 
Output:
Explanation: The longest common substring is “Geeks” and is of length 5.

Input: X = “abcdxyz”, y = “xyzabcd” 
Output:
Explanation: The longest common substring is “abcd” and is of length 4.

Input: X = “zxabcdezy”, y = “yzabcdezx” 
Output:
Explanation: The longest common substring is “abcdez” and is of length 6.

Longest Common Substring using Dynamic Programming:

This problem can be solved using dynamic programming in O(len(X) * len(Y)), see this. In this article we are going to discuss about an efficient approach.

Longest Common Substring using Binary Search and Rolling Hash

Pre-requisites:

Observation:

If there is a common substring of length K in both the strings, then there will be common substrings of length 0, 1, …, K – 1. Hence, binary search on answer can be applied.\

Follow the below steps to implement the idea:

  • Smallest possible answer(low) = 0 and largest possible answer(high) = min(len(X), len(Y)), Range of binary search will be [0, min(len(X), len(Y))].
    • For every mid, check if there exists a common substring of length mid, if exists then update low, else update high.
    • To check the existence of a common substring of length K, Polynomial rolling hash function can be used. 
      • Iterate over all the windows of size K  in string X and string Y and get the hash
      • If there is a common hash return True, else return False.

Below is the implementation of this approach.

C++




#include <iostream>
#include <cmath>
#include <vector>
 
class ComputeHash {
private:
    std::vector<long> hash;
    std::vector<long> invMod;
    long mod;
    long p;
 
public:
    ComputeHash(std::string s, long p, long mod) {
        int n = s.length();
        this->hash.resize(n);
        this->invMod.resize(n);
        this->mod = mod;
        this->p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++) {
            char c = s[i];
            c = static_cast<char>(c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this->mod;
            this->hash[i] = hashValue;
            this->invMod[i] = static_cast<long>(std::pow(pPow, this->mod - 2)) % this->mod;
            pPow = (pPow * this->p) % this->mod;
        }
    }
 
    long getHash(int l, int r) {
        if (l == 0) {
            return this->hash[r];
        }
 
        long window = (this->hash[r] - this->hash[l - 1] + this->mod) % this->mod;
        return (window * this->invMod[l]) % this->mod;
    }
};
 
bool exists(int k, std::string X, std::string Y, ComputeHash &hashX1,
            ComputeHash &hashX2, ComputeHash &hashY1, ComputeHash &hashY2) {
    for (int i = 0; i <= X.length() - k; i++) {
        for (int j = 0; j <= Y.length() - k; j++) {
            if (X.substr(i, k) == Y.substr(j, k)) {
                return true;
            }
        }
    }
    return false;
}
 
int longestCommonSubstr(std::string X, std::string Y) {
    int n = X.length();
    int m = Y.length();
 
    long p1 = 31;
    long p2 = 37;
    long m1 = static_cast<long>(std::pow(10, 9) + 9);
    long m2 = static_cast<long>(std::pow(10, 9) + 7);
 
    // Initialize two hash objects
    // with different p1, p2, m1, m2
    // to reduce collision
    ComputeHash hashX1(X, p1, m1);
    ComputeHash hashX2(X, p2, m2);
 
    ComputeHash hashY1(Y, p1, m1);
    ComputeHash hashY2(Y, p2, m2);
 
    // Function that returns the existence
    // of a common substring of length k
    int low = 0, high = std::min(n, m);
    int answer = 0;
 
    while (low <= high) {
        int mid = (low + high) / 2;
        if (exists(mid, X, Y, hashX1, hashX2, hashY1, hashY2)) {
            answer = mid;
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }
 
    return answer;
}
 
 
 
int main() {
    std::string X = "GeeksforGeeks";
    std::string Y = "GeeksQuiz";
    std::cout << longestCommonSubstr(X, Y) << std::endl;
 
    return 0;
}


Java




import java.util.HashSet;
import java.util.Set;
 
class ComputeHash {
    private long[] hash;
    private long[] invMod;
    private long mod;
    private long p;
 
    // Generates hash in O(n(log(n)))
    public ComputeHash(String s, long p, long mod) {
        int n = s.length();
        this.hash = new long[n];
        this.invMod = new long[n];
        this.mod = mod;
        this.p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);
            c = (char) (c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this.mod;
            this.hash[i] = hashValue;
            this.invMod[i] = (long)(Math.pow(pPow, this.mod - 2) % this.mod);
            pPow = (pPow * this.p) % this.mod;
        }
    }
 
    // Return hash of a window in O(1)
    public long getHash(int l, int r) {
        if (l == 0) {
            return this.hash[r];
        }
 
        long window = (this.hash[r] - this.hash[l - 1]) % this.mod;
        return (window * this.invMod[l]) % this.mod;
    }
}
 
public class Main {
    // Function to get the longest common substring
    public static int longestCommonSubstr(String X, String Y) {
    int n = X.length();
    int m = Y.length();
 
    long p1 = 31;
    long p2 = 37;
    long m1 = (long) (Math.pow(10, 9) + 9);
    long m2 = (long) (Math.pow(10, 9) + 7);
 
    // Initialize two hash objects
    // with different p1, p2, m1, m2
    // to reduce collision
    ComputeHash hashX1 = new ComputeHash(X, p1, m1);
    ComputeHash hashX2 = new ComputeHash(X, p2, m2);
 
    ComputeHash hashY1 = new ComputeHash(Y, p1, m1);
    ComputeHash hashY2 = new ComputeHash(Y, p2, m2);
 
    // Function that returns the existence
    // of a common substring of length k
    int low = 0, high = Math.min(n, m);
    int answer = 0;
    while (low <= high) {
        int mid = (low + high) / 2;
        if (exists(mid, X, Y)) {
        answer = mid;
        low = mid + 1;
    } else {
        high = mid - 1;
    }
 
    }
    return answer;
}
    private static boolean exists(int k, String X, String Y) {
    for (int i = 0; i <= X.length() - k; i++) {
        for (int j = 0; j <= Y.length() - k; j++) {
            if (X.substring(i, i + k).equals(Y.substring(j, j + k))) {
                return true;
            }
        }
    }
    return false;
}
 
    public static void main(String[] args) {
    String X = "GeeksforGeeks";
    String Y = "GeeksQuiz";
    System.out.println(longestCommonSubstr(X, Y));
}
}


Python3




# Python code to implement the approach
 
# Function to implement rolling hash
class ComputeHash:
 
    # Generates hash in O(n(log(n)))
    def __init__(self, s, p, mod):
        n = len(s)
        self.hash = [0] * n
        self.inv_mod = [0] * n
        self.mod = mod
        self.p = p
 
        p_pow = 1
        hash_value = 0
 
        for i in range(n):
            c = ord(s[i]) - 65 + 1
            hash_value = (hash_value + c * p_pow) % self.mod
            self.hash[i] = hash_value
            self.inv_mod[i] = pow(p_pow, self.mod - 2, self.mod)
            p_pow = (p_pow * self.p) % self.mod
 
    # Return hash of a window in O(1)
    def get_hash(self, l, r):
 
        if l == 0:
            return self.hash[r]
 
        window = (self.hash[r] - self.hash[l - 1]) % self.mod
        return (window * self.inv_mod[l]) % self.mod
 
# Function to get the longest common substring
def longestCommonSubstr(X, Y, n, m):
 
    p1, p2 = 31, 37
    m1, m2 = pow(10, 9) + 9, pow(10, 9) + 7
 
    # Initialize two hash objects
    # with different p1, p2, m1, m2
    # to reduce collision
    hash_X1 = ComputeHash(X, p1, m1)
    hash_X2 = ComputeHash(X, p2, m2)
 
    hash_Y1 = ComputeHash(Y, p1, m1)
    hash_Y2 = ComputeHash(Y, p2, m2)
 
    # Function that returns the existence
    # of a common substring of length k
    def exists(k):
 
        if k == 0:
            return True
 
        st = set()
         
        # Iterate on X and get hash tuple
        # of all windows of size k
        for i in range(n - k + 1):
            h1 = hash_X1.get_hash(i, i + k - 1)
            h2 = hash_X2.get_hash(i, i + k - 1)
 
            cur_window_hash = (h1, h2)
             
            # Put the hash tuple in the set
            st.add(cur_window_hash)
 
        # Iterate on Y and get hash tuple
        # of all windows of size k
        for i in range(m - k + 1):
            h1 = hash_Y1.get_hash(i, i + k - 1)
            h2 = hash_Y2.get_hash(i, i + k - 1)
 
            cur_window_hash = (h1, h2)
             
            # If hash exists in st return True
            if cur_window_hash in st:
                return True
        return False
 
    # Binary Search on length
    answer = 0
    low, high = 0, min(n, m)
 
    while low <= high:
        mid = (low + high) // 2
 
        if exists(mid):
            answer = mid
            low = mid + 1
        else:
            high = mid - 1
 
    return answer
 
 
# Driver Code
if __name__ == '__main__':
    X = 'GeeksforGeeks'
    Y = 'GeeksQuiz'
    print(longestCommonSubstr(X, Y, len(X), len(Y)))


C#




using System;
using System.Collections.Generic;
 
class ComputeHash
{
    private List<long> hash;
    private List<long> invMod;
    private long mod;
    private long p;
 
    // Constructor to initialize hash values for the given string
    public ComputeHash(string s, long p, long mod)
    {
        int n = s.Length;
        this.hash = new List<long>(n);
        this.invMod = new List<long>(n);
        this.mod = mod;
        this.p = p;
 
        long pPow = 1;
        long hashValue = 0;
 
        for (int i = 0; i < n; i++)
        {
            char c = s[i];
            // Convert character to numeric value
            c = (char)(c - 'A' + 1);
            hashValue = (hashValue + c * pPow) % this.mod;
            this.hash.Add(hashValue);
            // Compute modular inverse
            this.invMod.Add(ModularInverse(pPow, this.mod));
            pPow = (pPow * this.p) % this.mod;
        }
    }
 
    // Helper function to compute modular inverse using extended Euclidean algorithm
    private long ModularInverse(long a, long m)
    {
        long m0 = m, t, q;
        long x0 = 0, x1 = 1;
 
        if (m == 1)
            return 0;
 
        while (a > 1)
        {
            q = a / m;
            t = m;
            m = a % m;
            a = t;
            t = x0;
            x0 = x1 - q * x0;
            x1 = t;
        }
 
        if (x1 < 0)
            x1 += m0;
 
        return x1;
    }
 
    // Function to get hash value for a substring [l, r]
    public long GetHash(int l, int r)
    {
        if (l == 0)
        {
            return this.hash[r];
        }
 
        long window = (this.hash[r] - this.hash[l - 1] + this.mod) % this.mod;
        return (window * this.invMod[l]) % this.mod;
    }
}
 
class LongestCommonSubstring
{
    // Function to check if a common substring of length k exists
    private static bool Exists(int k, string X, string Y,
                               ComputeHash hashX1, ComputeHash hashX2,
                               ComputeHash hashY1, ComputeHash hashY2)
    {
        for (int i = 0; i <= X.Length - k; i++)
        {
            for (int j = 0; j <= Y.Length - k; j++)
            {
                if (X.Substring(i, k) == Y.Substring(j, k))
                {
                    return true;
                }
            }
        }
        return false;
    }
 
    // Function to find the length of the longest common substring of X and Y
    private static int LongestCommonSubstr(string X, string Y)
    {
        int n = X.Length;
        int m = Y.Length;
 
        long p1 = 31;
        long p2 = 37;
        long m1 = (long)Math.Pow(10, 9) + 9;
        long m2 = (long)Math.Pow(10, 9) + 7;
 
        // Initialize two hash objects with different p1, p2, m1, m2 to reduce collision
        ComputeHash hashX1 = new ComputeHash(X, p1, m1);
        ComputeHash hashX2 = new ComputeHash(X, p2, m2);
 
        ComputeHash hashY1 = new ComputeHash(Y, p1, m1);
        ComputeHash hashY2 = new ComputeHash(Y, p2, m2);
 
        // Binary search to find the length of the longest common substring
        int low = 0, high = Math.Min(n, m);
        int answer = 0;
 
        while (low <= high)
        {
            int mid = (low + high) / 2;
            if (Exists(mid, X, Y, hashX1, hashX2, hashY1, hashY2))
            {
                answer = mid;
                low = mid + 1;
            }
            else
            {
                high = mid - 1;
            }
        }
 
        return answer;
    }
 
    static void Main()
    {
        string X = "GeeksforGeeks";
        string Y = "GeeksQuiz";
        Console.WriteLine(LongestCommonSubstr(X, Y));
 
        // Output: 5
    }
}


Javascript




// javascript code to implement the approach
class ComputeHash {
  constructor(mod) {
    this.mod = mod;
  }
  compute(s) {
    let hashValue = 0n;
    let pPow = 1n;
    for (let i = 0; i < s.length; i++) {
      let c = BigInt(s.charCodeAt(i) - "A".charCodeAt(0) + 1);
      hashValue = (hashValue + c * pPow) % this.mod;
      pPow = pPow * 26n % this.mod;
    }
    return hashValue;
  }
}
 
function longestCommonSubstr(s, t) {
  const mod = BigInt(10**9+7);
  const p1 = new ComputeHash(mod);
  const p2 = new ComputeHash(mod);
  let left = 0, right = Math.min(s.length, t.length);
  while (left < right) {
    const mid = Math.floor((left + right + 1) / 2);
    let found = false;
    const set = new Set();
    for (let i = 0; i + mid <= s.length; i++) {
      const hashValue = p1.compute(s.substring(i, i + mid));
      set.add(hashValue);
    }
    for (let i = 0; i + mid <= t.length; i++) {
      const hashValue = p2.compute(t.substring(i, i + mid));
      if (set.has(hashValue)) {
        found = true;
        break;
      }
    }
    if (found) {
      left = mid;
    } else {
      right = mid - 1;
    }
  }
  return left;
}
 
console.log(longestCommonSubstr("ABABAB", "BABABA"));  // expected output: 3
 
//code is implemented by chetanbargal


Output

5




Time Complexity: O(n * log(m1)) + O(m * log((m1)) + O((n + m) * log(min(n, m)))

  1. Generating hash object takes O(n*log(m1)), where n is the length of string and m1 = pow(10, 9) + 7.
  2. Binary search takes O(log(min(n, m))), where n, m are the lengths of both strings.
  3. Hash of a window takes O(1) time.
  4. Exist function takes O(n + m) time.

Auxiliary Space: O(n + m)



Like Article
Suggest improvement
Previous
Next
Share your thoughts in the comments

Similar Reads