Pre-requisite: Suffix Array. What are k-mers? The term k-mers typically refers to all the possible substrings of length k that are contained in a string. Counting all the k-mers in DNA/RNA sequencing reads is the preliminary step of many bioinformatics applications. What is a Suffix Array? A suffix array is a sorted array of all suffixes of a string. It is a data structure used, among others, in full text indices, data compression algorithms. More information can be found here. Problem: We are given a string str and an integer k. We have to find all pairs (substr, i) such that substr is a length – k substring of str that occurs exactly i times.
Steps involved in the approach:
Let’s take the word “banana$” as an example.
Step 1: Compute the suffix array of the given text.
6 $ 5 a$ 3 ana$ 1 anana$ 0 banana$ 4 na$ 2 nana$
Step 2: Iterate through the suffix array keeping “curr_count”.
- If the length of current suffix is less than k, then skip the iteration. That is, if k = 2, then iteration would be skipped when current suffix is $.
- If the current suffix begins with the same length – k substring as the previous suffix, then increment curr_count. For example, during fourth iteration current suffix “anana$” starts with same substring of length k “an” as previous suffix “ana$” started with. So, we will increment curr_count in this case.
- If condition 2 is not satisfied, then if length of previous suffix is equal to k, then that it is a valid pair and we will output it along with its current count, otherwise, we will skip that iteration.
curr_count Valid Pair 6 $ 1 5 a$ 1 3 ana$ 1 (a$, 1) 1 anana$ 1 0 banana$ 2 (an, 2) 4 na$ 1 (ba, 1) 2 nana$ 1 (na, 2)
Examples:
Input : banana$ // Input text Output : (a$, 1) // k- mers (an, 2) (ba, 1) (na, 2) Input : geeksforgeeks Output : (ee, 2) (ek, 2) (fo, 1) (ge, 2) (ks, 2) (or, 1) (sf, 1)
The following is the C code for approach explained above:
// C++ program to solve K-mers counting problem #include <bits/stdc++.h> using namespace std;
// Structure to store data of a rotation struct rotation {
int index;
char * suffix;
}; // Compares the rotations and // sorts the rotations alphabetically int cmpfunc( const void * x, const void * y)
{ struct rotation* rx = ( struct rotation*)x;
struct rotation* ry = ( struct rotation*)y;
return strcmp (rx->suffix, ry->suffix);
} // Takes input_text and its length as arguments // and returns the corresponding suffix array char ** computeSuffixArray( char * input_text, int len_text)
{ int i;
// Array of structures to store rotations
// and their indexes
struct rotation suff[len_text];
// Structure is needed to maintain old
// indexes of rotations after sorting them
for (i = 0; i < len_text; i++) {
suff[i].index = i;
suff[i].suffix = (input_text + i);
}
// Sorts rotations using comparison function
// defined above
qsort (suff, len_text, sizeof ( struct rotation), cmpfunc);
// Stores the suffixes of sorted rotations
char ** suffix_arr
= ( char **) malloc (len_text * sizeof ( char *));
for (i = 0; i < len_text; i++) {
suffix_arr[i]
= ( char *) malloc ((len_text + 1) * sizeof ( char ));
strcpy (suffix_arr[i], suff[i].suffix);
}
// Returns the computed suffix array
return suffix_arr;
} // Takes suffix array, its size and valid length as // arguments and outputs the valid pairs of k - mers void findValidPairs( char ** suffix_arr, int n, int k)
{ int curr_count = 1, i;
char * prev_suff = ( char *) malloc (n * sizeof ( char ));
// Iterates over the suffix array,
// keeping a current count
for (i = 0; i < n; i++) {
// Skipping the current suffix
// if it has length < valid length
if ( strlen (suffix_arr[i]) < k) {
if (i != 0 && strlen (prev_suff) == k) {
cout << "(" << prev_suff << ", "
<< curr_count << ")" << endl;
curr_count = 1;
}
strcpy (prev_suff, suffix_arr[i]);
continue ;
}
// Incrementing the curr_count if first
// k chars of prev_suff and current suffix
// are same
if (!( memcmp (prev_suff, suffix_arr[i], k))) {
curr_count++;
}
else {
// Pair is valid when i!=0 (as there is
// no prev_suff for i = 0) and when strlen
// of prev_suff is k
if (i != 0 && strlen (prev_suff) == k) {
cout << "(" << prev_suff << ", "
<< curr_count << ")" << endl;
curr_count = 1;
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
}
else {
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
continue ;
}
}
// Modifying prev_suff[i] to current suffix
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
}
// Printing the last valid pair
cout << "(" << prev_suff << ", " << curr_count << ")"
<< endl;
} // Driver program to test functions above int main()
{ char input_text[] = "geeksforgeeks" ;
int k = 2;
int len_text = strlen (input_text);
// Computes the suffix array of our text
cout << "Input Text: " << input_text << endl;
char ** suffix_arr
= computeSuffixArray(input_text, len_text);
// Finds and outputs all valid pairs
cout << "k-mers: " << endl;
findValidPairs(suffix_arr, len_text, k);
return 0;
} |
// C program to solve K-mers counting problem #include <stdio.h> #include <stdlib.h> #include <string.h> // Structure to store data of a rotation struct rotation {
int index;
char * suffix;
}; // Compares the rotations and // sorts the rotations alphabetically int cmpfunc( const void * x, const void * y)
{ struct rotation* rx = ( struct rotation*)x;
struct rotation* ry = ( struct rotation*)y;
return strcmp (rx->suffix, ry->suffix);
} // Takes input_text and its length as arguments // and returns the corresponding suffix array char ** computeSuffixArray( char * input_text,
int len_text)
{ int i;
// Array of structures to store rotations
// and their indexes
struct rotation suff[len_text];
// Structure is needed to maintain old
// indexes of rotations after sorting them
for (i = 0; i < len_text; i++) {
suff[i].index = i;
suff[i].suffix = (input_text + i);
}
// Sorts rotations using comparison function
// defined above
qsort (suff, len_text, sizeof ( struct rotation), cmpfunc);
// Stores the suffixes of sorted rotations
char ** suffix_arr =
( char **) malloc (len_text * sizeof ( char *));
for (i = 0; i < len_text; i++) {
suffix_arr[i] =
( char *) malloc ((len_text + 1) * sizeof ( char ));
strcpy (suffix_arr[i], suff[i].suffix);
}
// Returns the computed suffix array
return suffix_arr;
} // Takes suffix array, its size and valid length as // arguments and outputs the valid pairs of k - mers void findValidPairs( char ** suffix_arr, int n, int k)
{ int curr_count = 1, i;
char * prev_suff = ( char *) malloc (n * sizeof ( char ));
// Iterates over the suffix array,
// keeping a current count
for (i = 0; i < n; i++) {
// Skipping the current suffix
// if it has length < valid length
if ( strlen (suffix_arr[i]) < k) {
if (i != 0 && strlen (prev_suff) == k) {
printf ( "(%s, %d)\n" , prev_suff, curr_count);
curr_count = 1;}
strcpy (prev_suff, suffix_arr[i]);
continue ;
}
// Incrementing the curr_count if first
// k chars of prev_suff and current suffix
// are same
if (!( memcmp (prev_suff, suffix_arr[i], k))) {
curr_count++;
}
else {
// Pair is valid when i!=0 (as there is
// no prev_suff for i = 0) and when strlen
// of prev_suff is k
if (i != 0 && strlen (prev_suff) == k) {
printf ( "(%s, %d)\n" , prev_suff, curr_count);
curr_count = 1;
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
}
else {
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
continue ;
}
}
// Modifying prev_suff[i] to current suffix
memcpy (prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0' ;
}
// Printing the last valid pair
printf ( "(%s, %d)\n" , prev_suff, curr_count);
} // Driver program to test functions above int main()
{ char input_text[] = "geeksforgeeks" ;
int k = 2;
int len_text = strlen (input_text);
// Computes the suffix array of our text
printf ( "Input Text: %s\n" , input_text);
char ** suffix_arr =
computeSuffixArray(input_text, len_text);
// Finds and outputs all valid pairs
printf ( "k-mers: \n" );
findValidPairs(suffix_arr, len_text, k);
return 0;
} |
Output:
Input Text: banana$ k-mers: (a$, 1) (an, 2) (ba, 1) (na, 2)
Time Complexity: O(s*len_text*log(len_text)), assuming s is the length of the longest suffix.