Effective implementation of algorithms (Master Thesis)
Effective and error-free implementation of algorithms
src/strings/suffix_array_lcp_manzini/lcp_manzini.h
Go to the documentation of this file.
00001 #ifndef H_STRINGS_SUFFIX_ARRAY_LCP_MANZINI
00002 #define H_STRINGS_SUFFIX_ARRAY_LCP_MANZINI
00003 
00010 #include "utils/branch_predict/branch_predict.h"
00011 #include "utils/preconditions/preconditions.h"
00012 #include "utils/macros/evil_constructors.h"
00013 #include "debug/ppdebug.h"
00014 
00015 namespace strings {
00016 namespace suffix_array {
00017 
00018 class LCPManzini {
00019 
00020 template<typename _Iterator>
00021 static void compute_counts(_Iterator first, _Iterator last, int alphabet_size,
00022     std::vector<int>* out)
00023 {
00024   // Check preconditions
00025   Preconditions::check(alphabet_size > 0);
00026   _Iterator it = first;
00027   while (it != last) {
00028     Preconditions::checkRange((int)*it, alphabet_size);
00029     ++it;
00030   }
00031   // Do the real work
00032   out->clear();
00033   out->resize(alphabet_size + 1); // Note: +1 is mandatory!
00034   it = first;
00035   while (it != last) {
00036     (*out)[1 + (*it)]++;
00037     ++it;
00038   }
00039 
00040   for (int i = 1; i <= alphabet_size; i++) {
00041     (*out)[i] += (*out)[i - 1];
00042   }
00043   out->pop_back(); // remove last element, which is not needed
00044   // and we stored it only for convenience
00045 
00046 }
00047 
00061 // Pseudocode taken from [ltLCP]
00062 // Procedure Sa2RankNext(rank next)
00063 // 1.  j = count[t[n]]++;
00064 // 2.  rank_next[j]=0;
00065 // 3.  for(i=1;i<=n;i++) {
00066 // 4.    if(sa[i]==1)
00067 // 5.      eos_pos = i;
00068 // 6.    else {
00069 // 7.      c = t[sa[i]-1];
00070 // 8.      j = count[c]++;
00071 // 9.      rank_next[j]=i;
00072 // 10.   }
00073 // 11. }
00074 // 12. return eos_pos;
00075 template<typename _Iterator, typename _SAIterator>
00076 static int compute_rank_next(_Iterator seq_first, _Iterator seq_last,
00077     _SAIterator sa_first, _SAIterator sa_last, int alphabet_size,
00078     std::vector<int> *rank_next)
00079 {
00080   Preconditions::check((seq_last - seq_first) == (sa_last - sa_first),
00081       "Sequence and suffix array lengths does not match");
00082   Preconditions::check(seq_last - seq_first > 0,
00083       "Can't compute rank_next on empty sequence");
00084 
00085   std::vector<int> count;
00086   // Warning: this step is not in the pseudocode, but
00087   // if you read paper carefully, you will know that
00088   // count should be initialized
00089   compute_counts(seq_first, seq_last, alphabet_size, &count);
00090   int n = seq_last - seq_first;
00091   int j = count[*(seq_last - 1)]++;
00092   int eos_pos = - 1; // undefined
00093   rank_next->clear();
00094   rank_next->resize(seq_last - seq_first);
00095 
00096   (*rank_next)[j] = -1; // marker
00097   for (int i = 0; i < n; i++) {
00098     if (UNLIKELY(*(sa_first + i) == 0)) {
00099       eos_pos = i;
00100     } else {
00101       int tmp = *(sa_first + i) - 1;
00102       j = count[*(seq_first + tmp)]++;
00103       (*rank_next)[j] = i;
00104     }
00105   }
00106   assert(eos_pos != -1);
00107   return eos_pos;
00108 }
00109 
00110 // Procedure Lcp9
00111 // 1.  k = Sa2RankNext(lcp);
00112 // 2.  h=0;
00113 // 3.  for(i=1;i<=n;i++) {
00114 // 4.    nextk = lcp[k];
00115 // 5.    if(k==1) lcp[k]=-1;
00116 // 6.    else {
00117 // 7.      j = sa[k-1];
00118 // 8.      while(i+h<=n && j+h<=n && t[i+h]==t[j+h])
00119 // 9.        h++;
00120 // 10.     lcp[k] = h;
00121 // 11.   }
00122 // 12.   if(h>0) h--;
00123 // 13.   k=nextk;
00124 // 14. }
00125 public:
00126 template<typename _Iterator, typename _SAIterator>
00127 static void getHeightArray(_Iterator seq_first, _Iterator seq_last,
00128     _SAIterator sa_first, _SAIterator sa_last, int alphabet_size,
00129     std::vector<int> *out)
00130 {
00131   Preconditions::check((seq_last - seq_first) == (sa_last - sa_first));
00132 
00133   std::vector<int>& lcp(*out); // shorthand
00134   int current = compute_rank_next(seq_first, seq_last,
00135       sa_first, sa_last, alphabet_size, out);
00136   int h = 0;
00137   int n = seq_last - seq_first;
00138   for (int i = 0; i < n; i++) {
00139     assert(current >= 0); // current is not a marker
00140     assert(i == *(sa_first + current)); // current == rank[i]
00141     int next = lcp[current];
00142     // skip the rank[i]==0
00143     if (current != 0) {
00144       int j = *(sa_first + (current - 1));
00145       _Iterator ita = seq_first + (i + h);
00146       _Iterator itb = seq_first + (j + h);
00147       while ((ita != seq_last) && (itb != seq_last) &&
00148              (*ita == *itb)) {
00149         ++ita;
00150         ++itb;
00151         ++h;
00152       }
00153       lcp[current] = h;
00154       if (h > 0) {
00155         h--;
00156       }
00157     }
00158     // rank[i+1] = rankNext[rank[i]]
00159     current = next;
00160   }
00161   assert(current == -1); // marker
00162 }
00163 
00164 private:
00165   DISALLOW_EVIL_CONSTRUCTORS(LCPManzini);
00166 };
00167 
00168 } // namespace suffix_array
00169 } // namespace strings
00170 
00171 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines