Given a string of length n consisting of lowercase alphabet characters, we need to count the total number of distinct substrings of this string.
Examples:
Input : str = “ababa” Output : 10 The total number of distinct substrings is 10 (the empty string is counted): "", "a", "b", "ab", "ba", "aba", "bab", "abab", "baba" and "ababa"
We have discussed a Suffix Trie based solution in below post :
Count of distinct substrings of a string using Suffix Trie
We can solve this problem using suffix array and longest common prefix concept. A suffix array is a sorted array of all suffixes of a given string.
For string “ababa” suffixes are : “ababa”, “baba”, “aba”, “ba”, “a”. After taking these suffixes in sorted form we get our suffix array as [4, 2, 0, 3, 1]
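Then we calculate the LCP array using Kasai’s algorithm. Entry i of the LCP array is the length of the longest common prefix of the suffixes at positions i and i+1 of the suffix array; the last entry is 0 because the last suffix has no successor. For string “ababa”, the LCP array is [1, 3, 0, 2, 0]
After constructing both arrays, we calculate the total number of distinct substrings by keeping this fact in mind : every substring is a prefix of some suffix, so if we look through the prefixes of each suffix of a string, we cover all substrings of that string.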
We will explain the procedure for above example,
String = “ababa”
Suffixes in sorted order : “a”, “aba”, “ababa”, “ba”, “baba”
Initialize the distinct substring count with the length of the first suffix: Count = length(“a”) = 1. Substrings taken into consideration : “a”
Now we consider each consecutive pair of suffixes. lcp("a", "aba") = "a". The prefixes of “aba” that are no longer than this common prefix were already counted with the previous suffix, so only the longer prefixes are new. Here those are the prefixes ending at 'b' and at the last 'a', so 2 is added to Count.
Count += length(“aba”) - lcp(“a”, “aba”), so Count = 3. Substrings taken into consideration : “aba”, “ab”
Similarly for the remaining pairs:
Count += length(“ababa”) - lcp(“aba”, “ababa”), so Count = 5. Substrings taken into consideration : “ababa”, “abab”
Count += length(“ba”) - lcp(“ababa”, “ba”), so Count = 7. Substrings taken into consideration : “ba”, “b”
Count += length(“baba”) - lcp(“ba”, “baba”), so Count = 9. Substrings taken into consideration : “baba”, “bab”
We finally add 1 for the empty string. Count = 10
Implementation:
// C++ code to count total distinct substrings
// of a string
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Structure to store information of a suffix
struct suffix
{
    int index;   // To store original index
    int rank[2]; // To store ranks and next rank pair
};

// A comparison function used by sort() to compare
// two suffixes. Compares the two rank pairs and returns
// true if the first pair is strictly smaller
bool cmp(const suffix& a, const suffix& b)
{
    if (a.rank[0] != b.rank[0])
        return a.rank[0] < b.rank[0];
    return a.rank[1] < b.rank[1];
}

// This is the main function that takes a string
// 'txt' of size n as an argument, builds and returns
// the suffix array for the given string.
// n must be equal to txt.length(); for n <= 0 an empty
// array is returned.
// Every doubling round below ends with a comparison sort,
// so the construction costs O(n log n) per round over
// about log n rounds.
std::vector<int> buildSuffixArray(const std::string& txt, int n)
{
    if (n <= 0)
        return {};

    // A structure to store suffixes and their indexes
    std::vector<suffix> suffixes(n);

    // Store suffixes and their indexes in an array
    // of structures. The structure is needed to sort
    // the suffixes alphabetically and maintain their
    // old indexes while sorting.
    // The rank of a character is its unsigned byte value,
    // so every real character ranks above the -1 used for
    // "past the end of the string".
    for (int i = 0; i < n; i++)
    {
        suffixes[i].index = i;
        suffixes[i].rank[0] =
            static_cast<int>(static_cast<unsigned char>(txt[i]));
        suffixes[i].rank[1] = (i + 1 < n)
            ? static_cast<int>(static_cast<unsigned char>(txt[i + 1]))
            : -1;
    }

    // Sort the suffixes using the comparison function
    // defined above.
    std::sort(suffixes.begin(), suffixes.end(), cmp);

    // At this point, all suffixes are sorted according
    // to first 2 characters. Let us sort suffixes
    // according to first 4 characters, then first
    // 8 and so on

    // This array is needed to get the index in suffixes[]
    // from original index. This mapping is needed to get
    // next suffix.
    std::vector<int> ind(n);

    for (int k = 4; k < 2 * n; k = k * 2)
    {
        // Assigning rank and index values to first suffix
        int rank = 0;
        int prev_rank = suffixes[0].rank[0];
        suffixes[0].rank[0] = rank;
        ind[suffixes[0].index] = 0;

        // Assigning rank to suffixes
        for (int i = 1; i < n; i++)
        {
            // If first rank and next rank are same as
            // that of previous suffix in array, assign
            // the same new rank to this suffix, otherwise
            // increment rank and assign. prev_rank keeps the
            // old first rank because rank[0] is overwritten.
            const bool same_as_previous =
                suffixes[i].rank[0] == prev_rank &&
                suffixes[i].rank[1] == suffixes[i - 1].rank[1];
            prev_rank = suffixes[i].rank[0];
            if (!same_as_previous)
                ++rank;
            suffixes[i].rank[0] = rank;
            ind[suffixes[i].index] = i;
        }

        // Assign next rank to every suffix
        for (int i = 0; i < n; i++)
        {
            const int nextindex = suffixes[i].index + k / 2;
            suffixes[i].rank[1] = (nextindex < n)
                ? suffixes[ind[nextindex]].rank[0]
                : -1;
        }

        // Sort the suffixes according to first k characters
        std::sort(suffixes.begin(), suffixes.end(), cmp);
    }

    // Store indexes of all sorted suffixes in the suffix
    // array
    std::vector<int> suffixArr;
    suffixArr.reserve(suffixes.size());
    for (const suffix& s : suffixes)
        suffixArr.push_back(s.index);

    // Return the suffix array
    return suffixArr;
}

/* To construct and return LCP. lcp[i] is the length of the
   longest common prefix of the suffixes at positions i and
   i+1 of suffixArr; the last entry stays 0. */
std::vector<int> kasai(const std::string& txt,
                       const std::vector<int>& suffixArr)
{
    const int n = static_cast<int>(suffixArr.size());

    // To store LCP array
    std::vector<int> lcp(n, 0);

    // An auxiliary array to store inverse of suffix array
    // elements. For example if suffixArr[0] is 5, the
    // invSuff[5] would store 0. This is used to get next
    // suffix string from suffix array.
    std::vector<int> invSuff(n, 0);

    // Fill values in invSuff[]
    for (int i = 0; i < n; i++)
        invSuff[suffixArr[i]] = i;

    // Initialize length of previous LCP
    int k = 0;

    // Process all suffixes one by one starting from
    // first suffix in txt[]
    for (int i = 0; i < n; i++)
    {
        /* If the current suffix is at n-1, then we don't
           have next substring to consider. So lcp is not
           defined for this substring, we put zero. */
        if (invSuff[i] == n - 1)
        {
            k = 0;
            continue;
        }

        /* j contains index of the next substring to
           be considered to compare with the present
           substring, i.e., next string in suffix array */
        const int j = suffixArr[invSuff[i] + 1];

        // Directly start matching from k'th index as
        // at-least k-1 characters will match
        while (i + k < n && j + k < n && txt[i + k] == txt[j + k])
            k++;

        lcp[invSuff[i]] = k; // lcp for the present suffix.

        // Deleting the starting character from the string.
        if (k > 0)
            k--;
    }

    // return the constructed lcp array
    return lcp;
}

// method to return count of total distinct substrings,
// including the empty substring. The count is accumulated
// in an int, so the text must be short enough for
// n*(n+1)/2 + 1 to fit in an int.
int countDistinctSubstring(const std::string& txt)
{
    const int n = static_cast<int>(txt.length());

    // An empty text has no suffixes; its only substring
    // is the empty string.
    if (n == 0)
        return 1;

    // calculating suffix array and lcp array
    const std::vector<int> suffixArr = buildSuffixArray(txt, n);
    const std::vector<int> lcp = kasai(txt, suffixArr);

    // n - suffixArr[i] will be the length of suffix
    // at ith position in suffix array. Initializing
    // count with length of first suffix of sorted
    // suffixes
    int result = n - suffixArr[0];

    for (int i = 1; i < n; i++)
        // subtract lcp from the length of suffix
        result += (n - suffixArr[i]) - lcp[i - 1];

    result++; // For empty string
    return result;
}

// Driver code to test above methods. Define
// COUNT_DISTINCT_SUBSTRINGS_NO_MAIN to reuse the functions
// above from a translation unit that has its own main().
#ifndef COUNT_DISTINCT_SUBSTRINGS_NO_MAIN
int main()
{
    const std::string txt = "ababa";
    std::cout << countDistinctSubstring(txt);
    return 0;
}
#endif
/*package whatever //do not write package name here */ import java.util.*;
class Suffix implements Comparable<Suffix> {
int index;
int [] rank = new int [ 2 ];
public int compareTo(Suffix s)
{
if (rank[ 0 ] == s.rank[ 0 ]) {
return Integer.compare(rank[ 1 ], s.rank[ 1 ]);
}
else {
return Integer.compare(rank[ 0 ], s.rank[ 0 ]);
}
}
} class Main {
static int [] buildSuffixArray(String txt, int n)
{
Suffix[] suffixes = new Suffix[n];
for ( int i = 0 ; i < n; i++) {
suffixes[i] = new Suffix();
suffixes[i].index = i;
suffixes[i].rank[ 0 ] = txt.charAt(i) - 'a' ;
suffixes[i].rank[ 1 ]
= (i + 1 ) < n ? txt.charAt(i + 1 ) - 'a'
: - 1 ;
}
// Sort the suffixes
Arrays.sort(suffixes);
int [] ind = new int [n];
for ( int k = 4 ; k < 2 * n; k = k * 2 ) {
// Assigning rank and index values to first
// suffix
int rank = 0 ;
int prevRank = suffixes[ 0 ].rank[ 0 ];
suffixes[ 0 ].rank[ 0 ] = rank;
ind[suffixes[ 0 ].index] = 0 ;
for ( int i = 1 ; i < n; i++) {
// If first rank and next ranks are same as
// that of previous suffix in array, assign
// the same new rank to this suffix
if (suffixes[i].rank[ 0 ] == prevRank
&& suffixes[i].rank[ 1 ]
== suffixes[i - 1 ].rank[ 1 ]) {
prevRank = suffixes[i].rank[ 0 ];
suffixes[i].rank[ 0 ] = rank;
}
else { // Otherwise increment rank and
// assign
prevRank = suffixes[i].rank[ 0 ];
suffixes[i].rank[ 0 ] = ++rank;
}
ind[suffixes[i].index] = i;
}
for ( int i = 0 ; i < n; i++) {
int nextIndex = suffixes[i].index + k / 2 ;
suffixes[i].rank[ 1 ]
= nextIndex < n
? suffixes[ind[nextIndex]].rank[ 0 ]
: - 1 ;
}
Arrays.sort(suffixes);
}
// Store indexes of all sorted suffixes in the
// suffix array
int [] suffixArr = new int [n];
for ( int i = 0 ; i < n; i++) {
suffixArr[i] = suffixes[i].index;
}
return suffixArr;
}
static int [] Const_LCP(String txt, int [] suffixArr)
{
int n = suffixArr.length;
int [] lcp = new int [n];
int [] invSuff = new int [n];
for ( int i = 0 ; i < n; i++) {
invSuff[suffixArr[i]] = i;
}
int k = 0 ;
for ( int i = 0 ; i < n; i++) {
if (invSuff[i] == n - 1 ) {
k = 0 ;
continue ;
}
int j = suffixArr[invSuff[i] + 1 ];
while (i + k < n && j + k < n
&& txt.charAt(i + k)
== txt.charAt(j + k)) {
k++;
}
lcp[invSuff[i]] = k;
if (k > 0 ) {
k--;
}
}
return lcp;
}
static int cnt_Dist_Substr(String txt)
{
int n = txt.length();
// calculating suffix array and lcp array
int [] suffixArr = buildSuffixArray(txt, n);
int [] lcp = Const_LCP(txt, suffixArr);
// suffixes
int result = n - suffixArr[ 0 ];
for ( int i = 1 ; i < lcp.length; i++) {
// subtract lcp from the length of suffix
result += (n - suffixArr[i]) - lcp[i - 1 ];
}
result++; // For empty string
return result;
}
public static void main(String[] args)
{
String txt = "ababa" ;
System.out.println(cnt_Dist_Substr(txt));
}
} // This code is contributed by Jay |
# Python code to count total distinct substrings
# of a string


# This is the main function that takes a string
# 'txt' of size n as an argument, builds and returns
# the suffix array for the given string.
# n must be equal to len(txt).
def build_suffix_array(txt, n):
    # Structure to store information of a suffix
    class Suffix:
        def __init__(self, index, rank):
            self.index = index  # To store original index
            self.rank = rank  # To store ranks and next rank pair

    # Store suffixes and their indexes in an array
    # of structures. The structure is needed to sort
    # the suffixes alphabetically and maintain their
    # old indexes while sorting.
    # The rank of a character is its code point, so every
    # real character ranks above the -1 used for "past the
    # end of the string".
    suffixes = [
        Suffix(i, [ord(txt[i]), ord(txt[i + 1]) if i + 1 < n else -1])
        for i in range(n)
    ]

    # Sort the suffixes by their rank pair (lists compare
    # element by element).
    suffixes.sort(key=lambda x: x.rank)

    # At this point, all suffixes are sorted according
    # to first 2 characters. Let us sort suffixes
    # according to first 4 characters, then first
    # 8 and so on

    # This array is needed to get the
    # index in suffixes[] from original
    # index. This mapping is needed to get
    # next suffix.
    ind = [0] * n

    k = 4
    while k < 2 * n:
        # Assigning rank and index values to first suffix
        rank, prev_rank = 0, suffixes[0].rank[0]
        suffixes[0].rank[0] = rank
        ind[suffixes[0].index] = 0

        # Assigning rank to suffixes
        for i in range(1, n):
            # If first rank and next rank are same as
            # that of previous suffix in array, assign
            # the same new rank to this suffix, otherwise
            # increment rank and assign. prev_rank keeps the
            # old first rank because rank[0] is overwritten.
            same_as_previous = (
                suffixes[i].rank[0] == prev_rank
                and suffixes[i].rank[1] == suffixes[i - 1].rank[1]
            )
            prev_rank = suffixes[i].rank[0]
            if not same_as_previous:
                rank += 1
            suffixes[i].rank[0] = rank
            ind[suffixes[i].index] = i

        # Assign next rank to every suffix
        for i in range(n):
            nextindex = suffixes[i].index + k // 2
            suffixes[i].rank[1] = (
                suffixes[ind[nextindex]].rank[0] if nextindex < n else -1
            )

        # Sort the suffixes according to first k characters
        suffixes.sort(key=lambda x: x.rank)
        k *= 2

    # Return the indexes of all sorted suffixes as the
    # suffix array
    return [suffix.index for suffix in suffixes]


# To construct and return LCP. lcp[i] is the length of the
# longest common prefix of the suffixes at positions i and
# i + 1 of suffixArr; the last entry stays 0.
def kasai(txt, suffixArr):
    n = len(suffixArr)

    # To store LCP array
    lcp = [0] * n

    # An auxiliary array to store inverse of suffix array
    # elements. For example if suffixArr[0] is 5, the
    # invSuff[5] would store 0. This is used to get next
    # suffix string from suffix array.
    invSuff = [0] * n

    # Fill values in invSuff[]
    for i in range(n):
        invSuff[suffixArr[i]] = i

    # Initialize length of previous LCP
    k = 0

    # Process all suffixes one by one starting from
    # first suffix in txt[]
    for i in range(n):
        # If the current suffix is at n-1, then we don't
        # have next substring to consider. So lcp is not
        # defined for this substring, we put zero
        if invSuff[i] == n - 1:
            k = 0
            continue

        # j contains index of the next substring to
        # be considered to compare with the present
        # substring, i.e., next string in suffix array
        j = suffixArr[invSuff[i] + 1]

        # Directly start matching from k'th index as
        # at-least k-1 characters will match
        while i + k < n and j + k < n and txt[i + k] == txt[j + k]:
            k += 1

        lcp[invSuff[i]] = k  # lcp for the present suffix.

        # Deleting the starting character from the string.
        if k > 0:
            k -= 1

    # return the constructed lcp array
    return lcp


# method to return count of total distinct substrings,
# including the empty substring
def count_distinct_substring(txt):
    n = len(txt)

    # An empty text has no suffixes; its only substring
    # is the empty string.
    if n == 0:
        return 1

    # calculating suffix array and lcp array
    suffixArr = build_suffix_array(txt, n)
    lcp = kasai(txt, suffixArr)

    # n - suffixArr[i] will be the length of suffix
    # at ith position in suffix array. Initializing
    # count with length of first suffix of sorted
    # suffixes
    result = n - suffixArr[0]

    for i in range(1, len(lcp)):
        # subtract lcp from the length of suffix
        result += (n - suffixArr[i]) - lcp[i - 1]

    result += 1  # For empty string
    return result


# Driver code to test above methods
if __name__ == "__main__":
    txt = "ababa"
    print(count_distinct_substring(txt))

# This code is contributed by Aman Kumar
// C# code to count total distinct substrings
// of a string
using System;
using System.Linq;

// Information kept for one suffix while the suffix array is built
class Suffix : IComparable<Suffix>
{
    public int index; // To store original index
    public int[] rank = new int[2]; // To store ranks and next rank pair

    // Orders suffixes by first rank, breaking ties on the next rank
    public int CompareTo(Suffix s)
    {
        if (rank[0] != s.rank[0])
        {
            return rank[0].CompareTo(s.rank[0]);
        }
        return rank[1].CompareTo(s.rank[1]);
    }
}

class Program
{
    // Takes a string 'txt' of size n, builds and returns the
    // suffix array for it. n must be equal to txt.Length;
    // for n <= 0 an empty array is returned.
    static int[] buildSuffixArray(string txt, int n)
    {
        if (n <= 0)
        {
            return new int[0];
        }

        // The rank of a character is its char value, so every
        // real character ranks above the -1 used for "past the
        // end of the string".
        Suffix[] suffixes = new Suffix[n];
        for (int i = 0; i < n; i++)
        {
            suffixes[i] = new Suffix();
            suffixes[i].index = i;
            suffixes[i].rank[0] = txt[i];
            suffixes[i].rank[1] = (i + 1 < n) ? txt[i + 1] : -1;
        }

        // Sort the suffixes by their first 2 characters
        Array.Sort(suffixes);

        // Maps an original index to its position in suffixes[];
        // needed to look up the rank of the next half
        int[] ind = new int[n];

        // Sort by the first 4 characters, then 8 and so on
        for (int k = 4; k < 2 * n; k = k * 2)
        {
            // Assigning rank and index values to first
            // suffix
            int rank = 0;
            int prevRank = suffixes[0].rank[0];
            suffixes[0].rank[0] = rank;
            ind[suffixes[0].index] = 0;

            for (int i = 1; i < n; i++)
            {
                // If first rank and next rank are same as
                // that of previous suffix in array, assign
                // the same new rank to this suffix, otherwise
                // increment rank and assign. prevRank keeps the
                // old first rank because rank[0] is overwritten.
                bool sameAsPrevious =
                    suffixes[i].rank[0] == prevRank
                    && suffixes[i].rank[1] == suffixes[i - 1].rank[1];
                prevRank = suffixes[i].rank[0];
                if (!sameAsPrevious)
                {
                    rank++;
                }
                suffixes[i].rank[0] = rank;
                ind[suffixes[i].index] = i;
            }

            // Assign next rank to every suffix
            for (int i = 0; i < n; i++)
            {
                int nextIndex = suffixes[i].index + k / 2;
                suffixes[i].rank[1] = (nextIndex < n)
                    ? suffixes[ind[nextIndex]].rank[0]
                    : -1;
            }

            // Sort the suffixes according to first k characters
            Array.Sort(suffixes);
        }

        // Store indexes of all sorted suffixes in the
        // suffix array
        int[] suffixArr = new int[n];
        for (int i = 0; i < n; i++)
        {
            suffixArr[i] = suffixes[i].index;
        }
        return suffixArr;
    }

    // Builds the LCP array with Kasai's algorithm. lcp[i] is the
    // length of the longest common prefix of the suffixes at
    // positions i and i+1 of suffixArr; the last entry stays 0.
    static int[] Const_LCP(string txt, int[] suffixArr)
    {
        int n = suffixArr.Length;
        int[] lcp = new int[n];

        // Inverse of the suffix array: invSuff[p] is the position
        // in suffixArr of the suffix starting at p
        int[] invSuff = new int[n];
        for (int i = 0; i < n; i++)
        {
            invSuff[suffixArr[i]] = i;
        }

        // Length of the previous LCP
        int k = 0;
        for (int i = 0; i < n; i++)
        {
            // The last suffix in sorted order has no successor
            if (invSuff[i] == n - 1)
            {
                k = 0;
                continue;
            }

            // Start of the next suffix in sorted order
            int j = suffixArr[invSuff[i] + 1];

            // Directly start matching from k'th index as
            // at-least k-1 characters will match
            while (i + k < n && j + k < n
                   && txt[i + k] == txt[j + k])
            {
                k++;
            }
            lcp[invSuff[i]] = k;

            // Deleting the starting character from the string
            if (k > 0)
            {
                k--;
            }
        }
        return lcp;
    }

    // Returns the number of distinct substrings of txt,
    // including the empty substring. The count is accumulated in
    // an int, so n*(n+1)/2 + 1 must fit in an int.
    public static int cnt_Dist_Substr(string txt)
    {
        int n = txt.Length;

        // An empty text has no suffixes; its only substring is
        // the empty string.
        if (n == 0)
        {
            return 1;
        }

        // calculating suffix array and lcp array
        int[] suffixArr = buildSuffixArray(txt, n);
        int[] lcp = Const_LCP(txt, suffixArr);

        // n - suffixArr[i] is the length of the suffix at
        // position i; start with the first sorted suffix
        int result = n - suffixArr[0];
        for (int i = 1; i < lcp.Length; i++)
        {
            // subtract lcp from the length of suffix
            result += (n - suffixArr[i]) - lcp[i - 1];
        }
        result++; // For empty string
        return result;
    }

    static void Main()
    {
        string txt = "ababa";
        Console.WriteLine(cnt_Dist_Substr(txt));
    }
}
// The code is contributed by Arushi Goel.
// Javascript code to count total distinct substrings
// of a string

// This is the main function that takes a string
// 'txt' of size n as an argument, builds and returns
// the suffix array for the given string.
// n must be equal to txt.length; for n <= 0 an empty
// array is returned.
function buildSuffixArray(txt, n) {
  // Structure to store information of a suffix
  class Suffix {
    constructor() {
      this.index = 0; // To store original index
      this.rank = [0, 0]; // To store ranks and next rank pair
    }
  }

  // A comparison function used by sort() to compare
  // two suffixes. Compares two rank pairs; the result is
  // negative if the first pair is smaller
  function cmp(a, b) {
    return a.rank[0] !== b.rank[0]
      ? a.rank[0] - b.rank[0]
      : a.rank[1] - b.rank[1];
  }

  if (n <= 0) return [];

  // A structure to store suffixes and their indexes
  let suffixes = new Array(n);

  // Store suffixes and their indexes in an array
  // of structures. The structure is needed to sort
  // the suffixes alphabetically and maintain their
  // old indexes while sorting.
  // The rank of a character is its char code, so every
  // real character ranks above the -1 used for "past the
  // end of the string".
  for (let i = 0; i < n; i++) {
    suffixes[i] = new Suffix();
    suffixes[i].index = i;
    suffixes[i].rank[0] = txt.charCodeAt(i);
    suffixes[i].rank[1] = i + 1 < n ? txt.charCodeAt(i + 1) : -1;
  }

  // Sort the suffixes using the comparison function
  // defined above.
  suffixes.sort(cmp);

  // At this point, all suffixes are sorted according
  // to first 2 characters. Let us sort suffixes
  // according to first 4 characters, then first
  // 8 and so on

  // This array is needed to get the
  // index in suffixes[] from original
  // index. This mapping is needed to get
  // next suffix.
  let ind = new Array(n);

  for (let k = 4; k < 2 * n; k *= 2) {
    // Assigning rank and index values to first suffix
    let rank = 0;
    let prev_rank = suffixes[0].rank[0];
    suffixes[0].rank[0] = rank;
    ind[suffixes[0].index] = 0;

    // Assigning rank to suffixes
    for (let i = 1; i < n; i++) {
      // If first rank and next rank are same as
      // that of previous suffix in array, assign
      // the same new rank to this suffix, otherwise
      // increment rank and assign. prev_rank keeps the
      // old first rank because rank[0] is overwritten.
      const sameAsPrevious =
        suffixes[i].rank[0] === prev_rank &&
        suffixes[i].rank[1] === suffixes[i - 1].rank[1];
      prev_rank = suffixes[i].rank[0];
      if (!sameAsPrevious) rank++;
      suffixes[i].rank[0] = rank;
      ind[suffixes[i].index] = i;
    }

    // Assign next rank to every suffix
    for (let i = 0; i < n; i++) {
      let nextindex = suffixes[i].index + k / 2;
      suffixes[i].rank[1] =
        nextindex < n ? suffixes[ind[nextindex]].rank[0] : -1;
    }

    // Sort the suffixes according to first k characters
    suffixes.sort(cmp);
  }

  // Store indexes of all sorted suffixes in the suffix
  // array
  let suffixArr = new Array(n);
  for (let i = 0; i < n; i++) suffixArr[i] = suffixes[i].index;

  // Return the suffix array
  return suffixArr;
}

/* To construct and return LCP. lcp[i] is the length of the
   longest common prefix of the suffixes at positions i and
   i+1 of suffixArr; the last entry stays 0. */
function kasai(txt, suffixArr) {
  let n = suffixArr.length;

  // To store LCP array
  let lcp = new Array(n).fill(0);

  // An auxiliary array to store inverse of suffix array
  // elements. For example if suffixArr[0] is 5, the
  // invSuff[5] would store 0. This is used to get next
  // suffix string from suffix array.
  let invSuff = new Array(n).fill(0);

  // Fill values in invSuff[]
  for (let i = 0; i < n; i++) invSuff[suffixArr[i]] = i;

  // Initialize length of previous LCP
  let k = 0;

  // Process all suffixes one by one starting from
  // first suffix in txt[]
  for (let i = 0; i < n; i++) {
    /* If the current suffix is at n-1, then we don't
       have next substring to consider. So lcp is not
       defined for this substring, we put zero. */
    if (invSuff[i] == n - 1) {
      k = 0;
      continue;
    }

    /* j contains index of the next substring to
       be considered to compare with the present
       substring, i.e., next string in suffix array */
    let j = suffixArr[invSuff[i] + 1];

    // Directly start matching from k'th index as
    // at-least k-1 characters will match
    while (i + k < n && j + k < n && txt[i + k] === txt[j + k]) k++;

    lcp[invSuff[i]] = k; // lcp for the present suffix.

    // Deleting the starting character from the string.
    if (k > 0) k--;
  }

  // return the constructed lcp array
  return lcp;
}

// method to return count of total distinct substrings,
// including the empty substring
function countDistinctSubstring(txt) {
  let n = txt.length;

  // An empty text has no suffixes; its only substring
  // is the empty string.
  if (n === 0) return 1;

  // calculating suffix array and lcp array
  let suffixArr = buildSuffixArray(txt, n);
  let lcp = kasai(txt, suffixArr);

  // n - suffixArr[i] will be the length of suffix
  // at ith position in suffix array. Initializing
  // count with length of first suffix of sorted
  // suffixes
  let result = n - suffixArr[0];

  for (let i = 1; i < lcp.length; i++)
    // subtract lcp from the length of suffix
    result += (n - suffixArr[i]) - lcp[i - 1];

  result++; // For empty string
  return result;
}

// Driver code to test above methods
let txt = "ababa";
console.log(countDistinctSubstring(txt));

// This code is contributed by Utkarsh Kumar.
Output: 10
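Time Complexity : O(n log² n), where n is the length of string. The suffix array is built in about log n doubling rounds, each of which runs an O(n log n) comparison sort; Kasai’s algorithm and the final summation are O(n).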
Auxiliary Space : O(n), where n is the length of string.
This article is contributed by Utkarsh Trivedi