Thursday, November 27, 2014

Suffix Array

A suffix array is a sorted array of all the suffixes of a given string. It does the same thing which suffix tree does.
Advantage of suffix array over suffix trees include improved space requirements, simpler linear time construction algorithms and improved cache locality.

Building Suffix Array - Method 1: Make an array of all suffixes and then sort the array.

Implementation:

struct Suffix
{
        int index;
        char* suff;
};

bool compareSuffixes(const Suffix& s1, const Suffix& s2)
{
        return std::strcmp(s1.suff, s2.suff) < 0 ? true : false;
}

int* buildSuffixArray(char* str, int len)
{
        Suffix* suffixes = new Suffix[len];
        for(int i = 0; i < len; ++i)
        {
                suffixes[i].index = i;
                suffixes[i].suff = str + i;
        }
        std::sort(suffixes, suffixes + len, compareSuffixes);
        int *suffixArr = new int[len];
        for(int i = 0; i < len; ++i)
                suffixArr[i] = suffixes[i].index;
        return suffixArr;

}

Time complexity: O(n^2logn)

Building Suffix Array - Method 2: The idea is to use the fact that strings that are to be sorted are suffixes of a single string. The algorithm is mainly based on maintaining the order of the string’s suffixes sorted by their 2^k long prefixes.

Implementation:

struct Suffix
{
        int index;
        int rank[2];
};

bool compareSuffix(const Suffix& s1, const Suffix& s2)
{
        return s1.rank[0] == s2.rank[0] ? (s1.rank[1] < s2.rank[1] ? true : false) : (s1.rank[0] < s2.rank[0] ? true : false);
}

int* buildSuffixArray(char* text, int len)
{
        Suffix* suffixes = new Suffix[len];
        for(int i = 0; i < len; ++i)
        {
                suffixes[i].index = i;
                suffixes[i].rank[0] = text[i] - 'a';
                suffixes[i].rank[1] = ((i+1) < len) ? (text[i+1] - 'a') : -1;
        }

        std::sort(suffixes, suffixes + len, compareSuffix);

        int* index = new int[len];

        for(int k = 4; k < 2*len; k = k*2)
        {
                int rank = 0, prev_rank = suffixes[0].rank[0];
                suffixes[0].rank[0] = rank;
                index[suffixes[0].index] = 0;

                for(int i = 1; i < len; ++i)
                {
                        if(suffixes[i].rank[0] == prev_rank && suffixes[i].rank[1] == suffixes[i-1].rank[1])
                        {
                                prev_rank = suffixes[i].rank[0];
                                suffixes[i].rank[0] = rank;
                        }
                        else
                        {
                                prev_rank = suffixes[i].rank[0];
                                suffixes[i].rank[0] = ++rank;
                        }
                        index[suffixes[i].index] = i;
                }
                for(int i = 0; i < len; ++i)
                {      
                        int nextIndex = suffixes[i].index + k/2;
                        suffixes[i].rank[1] = (nextIndex < len) ? suffixes[index[nextIndex]].rank[0]: -1;
                }

                std::sort(suffixes, suffixes + len, compareSuffix);

        }

        int *suffixArr = new int[len];
        for(int i = 0; i < len; ++i)
                suffixArr[i] = suffixes[i].index;
        return suffixArr;

}


Searching a Pattern: Now we have a sorted array of all the suffixes, we can use binary search to search pattern in the text.

int search(char *text, char *pattern, int *suffArr)
{
        int len = std::strlen(text);
        int patrnLen = std::strlen(pattern);
        int left = 0, right = len - 1;
        while(left <= right)
        {
                int mid = left + (right - left) / 2;
                int result = std::strncmp(pattern, text + suffArr[mid], patrnLen);
                if(result == 0)
                        return suffArr[mid];
                if(result < 0)
                        right = mid - 1;
                else
                        left = mid + 1;
        }
        return -1;

}

Time complexity: O(mlogn)

________________________________________________

No comments:

Post a Comment