Effective implementation of algorithms (Master Thesis)
Effective and error-free implementation of algorithms
src/strings/suffix_array_myers/manber_myers.h
Go to the documentation of this file.
00001 #ifndef H_SUFFIX_ARRAY_MAMBER_MEYERS_OPT
00002 #define H_SUFFIX_ARRAY_MAMBER_MEYERS_OPT
00003 
00010 #include "utils/preconditions/preconditions.h"
00011 #include <vector>
00012 #include <algorithm>
00013 #include "debug/ppdebug.h"
00014 
00015 namespace strings {
00016 namespace suffix_array {
00017 
00031 class ManberMyers {
00032  private:
00038   template<typename _Iterator>
00039   static void sortByFirstCharacter(
00040       _Iterator first, _Iterator last, int sigma, std::vector<int>* out) {
00041     for (_Iterator it = first; it != last; ++it) {
00042       Preconditions::checkRange((int)(*it), sigma);
00043     }
00044 
00045     // Note: we NEED count sort here, std::sort is awfully slow
00046     // when sorting reasonable small alphabet
00047     // TODO(ppershing): benchmark it.
00048     vector<int> csort(sigma, 0);
00049 
00050     for (_Iterator it = first; it != last; ++it) {
00051       csort[*it]++;
00052     }
00053 
00054     for (int i = 1; i < sigma; i++) {
00055       csort[i] += csort[i - 1]; // make prefix sums
00056     }
00057 
00058     int n = last - first;
00059     out->resize(n);
00060     int i = 0;
00061     for (_Iterator it = first; it != last; ++it) {
00062       (*out)[--csort[*it]] = i++;
00063     }
00064     assert(i == n);
00065   }
00066 
00067  public:
00068 
00090   template<typename _Iterator>
00091    static void buildSuffixArray(_Iterator first,
00092        _Iterator last, int sigma,
00093        std::vector<int>* out) {
00095      std::vector<int>& pos = *out;
00097      int n = last - first;
00098 
00105      std::vector<bool> bucket_h(n);
00107      std::vector<bool> bucket_2h(n, false);
00108 
00115      std::vector<int> rank(n);
00116      std::vector<int> count(n);
00117 
00118 
00119      // first stage
00120      sortByFirstCharacter(first, last, sigma, out);
00121 
00122      // find the initial buckets
00123      bucket_h[0] = false;
00124      for (int i = 1; i < n; i++) {
00125        bucket_h[i] = (*(first + pos[i]) != *(first + pos[i - 1]));
00126      }
00127 
00128      // second stage, sort by 2, 4, 8, 16, ... characters
00129      for (int h = 1; h < n; h*=2) {
00130        // Now we have sorted suffixes according to first h characters
00131        // let's do it for first 2h
00132 
00133        int bucket_start = 0;
00134        int bucket_count = 0;
00135 
00136        // for each =H bucket [l,r]
00137        // clear representative count values
00138        // and reset ranks to the left of the bucket
00139        // (we will use left of the bucket as representative
00140        // for counting)
00141        count[0] = 0;
00142        for (int i = 0; i < n; i++) {
00143          if (bucket_h[i]) {
00144             bucket_start = i;
00145             count[i] = 0;
00146             bucket_count ++;
00147          }
00148          rank[pos[i]] = bucket_start;
00149        }
00150 
00151        if (bucket_count == n) {
00152           return; // we are lucky and don't need another iteration
00153        }
00154 
00155        // indexing is ok, h < n
00156        // TODO(ppershing): why these?
00157        count[rank[n - h]]++;
00158        bucket_2h[rank[n - h]] = true;
00159 
00160        bucket_start = 0;
00161        // Go through all buckets, one at the time
00162        while (bucket_start < n) {
00163          // find the end of the bucket
00164          int bucket_end = bucket_start + 1;
00165          while (bucket_end < n && !bucket_h[bucket_end]) {
00166            bucket_end++;
00167          }
00168 
00169          for (int c = bucket_start; c < bucket_end; c++) {
00170            // select (pos[c] - h) \union [0, n)
00171            int d = pos[c] - h;
00172            if (d >= 0) {
00173              assert(d < n);
00174              int head = rank[d];
00175              rank[d] += count[head];
00176              count[head]++;
00177              bucket_2h[rank[d]] = true;
00178            }
00179          }
00180          for (int c = bucket_start; c < bucket_end; c++) {
00181            // select (pos[c] - h) \union [0, n)
00182            int d = pos[c] - h;
00183            if (d >= 0 && bucket_2h[rank[d]]) {
00184              for (int f = rank[d] + 1; !bucket_h[f] && bucket_2h[f]; f++) {
00185               bucket_2h[f] = false;
00186              }
00187            }
00188          }  
00189 
00190          bucket_start = bucket_end;
00191        }
00192 
00193        for (int i = 0; i < n; ++i) {
00194          pos[rank[i]] = i;
00195          if (bucket_2h[i] && !bucket_h[i]) {
00196            // set(i, h + min_height(rank[pos[i-1]+h], rank[pos[i]+h]))
00197            bucket_h[i] = true;
00198          }
00199        }
00200      }
00201    }
00202 };
00203 
00204 } // namespace suffix_array
00205 } // namespace strings
00206 
00207 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines