|
1 | 1 | #include <Rcpp/Lightest> |
2 | 2 | #include <memory> // for make_unique |
3 | 3 | #include <stdexcept> /* for errors */ |
| 4 | +#include <string> /* for string (hash key) */ |
| 5 | +#include <unordered_set> /* for unordered_set */ |
4 | 6 | #include "../inst/include/TreeTools/assert.h" /* for ASSERT */ |
5 | 7 | #include "../inst/include/TreeTools.h" |
6 | 8 |
|
@@ -197,44 +199,30 @@ LogicalVector duplicated_splits(const RawMatrix splits, |
197 | 199 | } |
198 | 200 | } |
199 | 201 |
|
| 202 | + // Hash-based deduplication: average O(n) set insertions instead of O(n^2) pairwise comparisons |
200 | 203 | LogicalVector ret(n_split); |
| 204 | + std::unordered_set<std::string> seen; |
| 205 | + seen.reserve(n_split * 2); |
| 206 | + std::string key(check_bins, '\0'); |
| 207 | + |
201 | 208 | if (fromLast[0]) { |
202 | | - for (intx it = n_split - 1; it--; ) { |
203 | | - const intx i = it + 1; // nothing to duplicate split(0, _) |
204 | | - if (ret[i]) { |
205 | | - continue; |
| 209 | + // Scan from end; first seen (from end) is kept, earlier dupes are marked |
| 210 | + for (intx i = n_split; i--; ) { |
| 211 | + for (intx b = 0; b < check_bins; ++b) { |
| 212 | + key[b] = static_cast<char>(compare(i, b)); |
206 | 213 | } |
207 | | - for (intx j = i; j--; ) { |
208 | | - // Rcout << " check split " << i << " (" << uintx(compare(i, 0)) << |
209 | | - // ") vs " << j << " (" << uintx(compare(j, 0)) << "): "; |
210 | | - for(intx bin = 0; compare(i, bin) == compare(j, bin); ) { |
211 | | - // Rcout << " [bin " << bin << "] "; |
212 | | - ++bin; |
213 | | - if (bin == check_bins) { |
214 | | - // Rcout << "Duplicate!"; |
215 | | - ret[j] = true; |
216 | | - break; |
217 | | - } |
218 | | - } |
219 | | - // Rcout << "\n"; |
220 | | - |
| 214 | + if (!seen.insert(key).second) { |
| 215 | + ret[i] = true; |
221 | 216 | } |
222 | 217 | } |
223 | 218 | } else { |
224 | | - for (intx i = 0; i != n_split - 1; ++i) { |
225 | | - if (ret[i]) { |
226 | | - continue; |
| 219 | + // Scan from start; first seen is kept, later dupes are marked |
| 220 | + for (intx i = 0; i < n_split; ++i) { |
| 221 | + for (intx b = 0; b < check_bins; ++b) { |
| 222 | + key[b] = static_cast<char>(compare(i, b)); |
227 | 223 | } |
228 | | - for (intx j = i + 1; j != n_split; ++j) { |
229 | | - |
230 | | - for(intx bin = 0; compare(i, bin) == compare(j, bin); ) { |
231 | | - ++bin; |
232 | | - if (bin == check_bins) { |
233 | | - ret[j] = true; |
234 | | - break; |
235 | | - } |
236 | | - } |
237 | | - |
| 224 | + if (!seen.insert(key).second) { |
| 225 | + ret[i] = true; |
238 | 226 | } |
239 | 227 | } |
240 | 228 | } |
|
0 commit comments