Effective implementation of algorithms (Master Thesis)
Effective and error-free implementation of algorithms
|
00001 #ifndef H_SUFFIX_ARRAY_MAMBER_MEYERS_OPT 00002 #define H_SUFFIX_ARRAY_MAMBER_MEYERS_OPT 00003 00010 #include "utils/preconditions/preconditions.h" 00011 #include <vector> 00012 #include <algorithm> 00013 #include "debug/ppdebug.h" 00014 00015 namespace strings { 00016 namespace suffix_array { 00017 00031 class ManberMyers { 00032 private: 00038 template<typename _Iterator> 00039 static void sortByFirstCharacter( 00040 _Iterator first, _Iterator last, int sigma, std::vector<int>* out) { 00041 for (_Iterator it = first; it != last; ++it) { 00042 Preconditions::checkRange((int)(*it), sigma); 00043 } 00044 00045 // Note: we NEED count sort here, std::sort is awfully slow 00046 // when sorting reasonable small alphabet 00047 // TODO(ppershing): benchmark it. 00048 vector<int> csort(sigma, 0); 00049 00050 for (_Iterator it = first; it != last; ++it) { 00051 csort[*it]++; 00052 } 00053 00054 for (int i = 1; i < sigma; i++) { 00055 csort[i] += csort[i - 1]; // make prefix sums 00056 } 00057 00058 int n = last - first; 00059 out->resize(n); 00060 int i = 0; 00061 for (_Iterator it = first; it != last; ++it) { 00062 (*out)[--csort[*it]] = i++; 00063 } 00064 assert(i == n); 00065 } 00066 00067 public: 00068 00090 template<typename _Iterator> 00091 static void buildSuffixArray(_Iterator first, 00092 _Iterator last, int sigma, 00093 std::vector<int>* out) { 00095 std::vector<int>& pos = *out; 00097 int n = last - first; 00098 00105 std::vector<bool> bucket_h(n); 00107 std::vector<bool> bucket_2h(n, false); 00108 00115 std::vector<int> rank(n); 00116 std::vector<int> count(n); 00117 00118 00119 // first stage 00120 sortByFirstCharacter(first, last, sigma, out); 00121 00122 // find the initial buckets 00123 bucket_h[0] = false; 00124 for (int i = 1; i < n; i++) { 00125 bucket_h[i] = (*(first + pos[i]) != *(first + pos[i - 1])); 00126 } 00127 00128 // second stage, sort by 2, 4, 8, 16, ... characters 00129 for (int h = 1; h < n; h*=2) { 00130 // Now we have sorted suffixes according to first h characters 00131 // let's do it for first 2h 00132 00133 int bucket_start = 0; 00134 int bucket_count = 0; 00135 00136 // for each =H bucket [l,r] 00137 // clear representative count values 00138 // and reset ranks to the left of the bucket 00139 // (we will use left of the bucket as representative 00140 // for counting) 00141 count[0] = 0; 00142 for (int i = 0; i < n; i++) { 00143 if (bucket_h[i]) { 00144 bucket_start = i; 00145 count[i] = 0; 00146 bucket_count ++; 00147 } 00148 rank[pos[i]] = bucket_start; 00149 } 00150 00151 if (bucket_count == n) { 00152 return; // we are lucky and don't need another iteration 00153 } 00154 00155 // indexing is ok, h < n 00156 // TODO(ppershing): why these? 00157 count[rank[n - h]]++; 00158 bucket_2h[rank[n - h]] = true; 00159 00160 bucket_start = 0; 00161 // Go through all buckets, one at the time 00162 while (bucket_start < n) { 00163 // find the end of the bucket 00164 int bucket_end = bucket_start + 1; 00165 while (bucket_end < n && !bucket_h[bucket_end]) { 00166 bucket_end++; 00167 } 00168 00169 for (int c = bucket_start; c < bucket_end; c++) { 00170 // select (pos[c] - h) \union [0, n) 00171 int d = pos[c] - h; 00172 if (d >= 0) { 00173 assert(d < n); 00174 int head = rank[d]; 00175 rank[d] += count[head]; 00176 count[head]++; 00177 bucket_2h[rank[d]] = true; 00178 } 00179 } 00180 for (int c = bucket_start; c < bucket_end; c++) { 00181 // select (pos[c] - h) \union [0, n) 00182 int d = pos[c] - h; 00183 if (d >= 0 && bucket_2h[rank[d]]) { 00184 for (int f = rank[d] + 1; !bucket_h[f] && bucket_2h[f]; f++) { 00185 bucket_2h[f] = false; 00186 } 00187 } 00188 } 00189 00190 bucket_start = bucket_end; 00191 } 00192 00193 for (int i = 0; i < n; ++i) { 00194 pos[rank[i]] = i; 00195 if (bucket_2h[i] && !bucket_h[i]) { 00196 // set(i, h + min_height(rank[pos[i-1]+h], rank[pos[i]+h])) 00197 bucket_h[i] = true; 00198 } 00199 } 00200 } 00201 } 00202 }; 00203 00204 } // namespace suffix_array 00205 } // namespace strings 00206 00207 #endif