diff --git a/mfind/aho/ahoCorasick.h b/mfind/aho/ahoCorasick.h index 131ff7e..18a1e3f 100644 --- a/mfind/aho/ahoCorasick.h +++ b/mfind/aho/ahoCorasick.h @@ -17,6 +17,7 @@ class AhoCorasick { Node* root; std::vector insertedKeywords; + size_t maxPatternLength = 0; public: AhoCorasick() { @@ -43,12 +44,17 @@ class AhoCorasick { } node->output.push_back(keyword); insertedKeywords.push_back(keyword); + maxPatternLength = std::max(maxPatternLength, keyword.size()); } const std::vector& getKeywords() const { return insertedKeywords; } + size_t getMaxPatternLength() const { + return maxPatternLength; + } + // Failure links void build() { std::queue q; diff --git a/mfind/helpers/chunkScanner.h b/mfind/helpers/chunkScanner.h new file mode 100644 index 0000000..a463fde --- /dev/null +++ b/mfind/helpers/chunkScanner.h @@ -0,0 +1,45 @@ +#ifndef CHUNK_SCANNER_H +#define CHUNK_SCANNER_H + +#include +#include +#include +#include +#include +#include +#include "../aho/ahoCorasick.h" +#include "formatHelper.h" + +class ChunkScanner { +public: + static void scan(const std::string& chunk, + const AhoCorasick& ac, + std::function callback, + size_t baseOffset = 0, + size_t threadCount = std::thread::hardware_concurrency(), + size_t contextSize = 30) { + + size_t chunkLength = chunk.size(); + size_t sliceSize = std::max(chunkLength / threadCount, 4096); + std::vector> futures; + + for (size_t i = 0; i < chunkLength; i += sliceSize) { + size_t start = i; + size_t end = std::min(chunkLength, i + sliceSize + ac.getMaxPatternLength()); + + futures.push_back(std::async(std::launch::async, [&, start, end] { + std::string slice = chunk.substr(start, end - start); + auto matches = ac.search(slice); + for (const auto& [localPos, word] : matches) { + size_t absolute = baseOffset + start + localPos; + std::string context = extractContext(chunk, localPos + start, word.length(), contextSize); + callback(absolute, word, context); + } + })); + } + + for (auto& f : futures) f.get(); + } +}; + +#endif diff --git a/mfind/helpers/searchHelper.h b/mfind/helpers/searchHelper.h index e9d40ca..1711795 100644 --- a/mfind/helpers/searchHelper.h +++ b/mfind/helpers/searchHelper.h @@ -3,22 +3,25 @@ #include #include -#include -#include +#include #include "../aho/ahoCorasick.h" -#include "configHelper.h" +#include "formatHelper.h" +#include "chunkScanner.h" inline void searchFile(const std::string& filename, const AhoCorasick& ac, - std::function onMatch) { + std::function callback, + size_t chunkSize = 8192) { std::ifstream file(filename, std::ios::binary); - if (!file.is_open()) return; + if (!file.is_open()) { + std::cerr << "Error: Could not open file " << filename << std::endl; + return; + } - const size_t overlap = 100; - const size_t chunkSize = 8192; + size_t overlap = ac.getMaxPatternLength() - 1; std::vector buffer(chunkSize + overlap); - size_t offset = 0; + while (!file.eof()) { if (offset != 0) { std::copy(buffer.end() - overlap, buffer.end(), buffer.begin()); @@ -28,21 +31,12 @@ inline void searchFile(const std::string& filename, std::streamsize bytesRead = file.gcount(); if (bytesRead == 0) break; - size_t totalSize = (offset == 0 ? bytesRead : bytesRead + overlap); - std::string chunkText(buffer.data(), totalSize); + size_t total = (offset == 0 ? bytesRead : bytesRead + overlap); + std::string chunk(buffer.data(), total); - auto matches = ac.search(chunkText); - for (const auto& [pos, word] : matches) { - size_t absPos = offset + pos; - - // Get context - size_t ctxSize = 30; - size_t ctxStart = (pos > ctxSize) ? (pos - ctxSize) : 0; - size_t ctxEnd = std::min(pos + word.length() + ctxSize, chunkText.length()); - std::string context = chunkText.substr(ctxStart, ctxEnd - ctxStart); - - onMatch(filename, absPos, word, context); - } + ChunkScanner::scan(chunk, ac, [&](size_t pos, const std::string& word, const std::string& context) { + callback(filename, pos, word, context); + }, offset); offset += bytesRead; } diff --git a/mfind/mfind b/mfind/mfind index 7063015..1d0fc9d 100755 Binary files a/mfind/mfind and b/mfind/mfind differ diff --git a/mfind/mfind.cpp b/mfind/mfind.cpp index 5cd8ec8..8317635 100644 --- a/mfind/mfind.cpp +++ b/mfind/mfind.cpp @@ -1,15 +1,17 @@ #include +#include #include -#include #include +#include #include +#include +#include "../aho/ahoCorasick.h" +#include "../helpers/searchHelper.h" #include "../helpers/formatHelper.h" #include "../helpers/configHelper.h" -#include "ahoCorasick.h" #include "walkers/walker.h" #include "queues/fileQueue.h" #include "workers/workerPool.h" -#include "../helpers/searchHelper.h" int main(int argc, char* argv[]) { if (argc < 3) { @@ -44,6 +46,6 @@ int main(int argc, char* argv[]) { walker.join(); fileQueue.setFinished(); pool.join(); - + return 0; }