Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions mfind/aho/ahoCorasick.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class AhoCorasick {

Node* root;
std::vector<std::string> insertedKeywords;
size_t maxPatternLength = 0;

public:
AhoCorasick() {
Expand All @@ -43,12 +44,17 @@ class AhoCorasick {
}
node->output.push_back(keyword);
insertedKeywords.push_back(keyword);
maxPatternLength = std::max(maxPatternLength, keyword.size());
}

// Read-only access to insertedKeywords, the list that receives a copy of
// every keyword pushed by the insertion code above.
// The reference points into this object, so it is only usable while the
// AhoCorasick instance is alive.
const std::vector<std::string>& getKeywords() const {
return insertedKeywords;
}

// Returns maxPatternLength: the largest keyword.size() seen so far, kept up
// to date with std::max on each insertion.
// The member is initialised to 0, so this returns 0 until a keyword has been
// inserted; because the type is unsigned, callers must check for 0 before
// computing getMaxPatternLength() - 1.
size_t getMaxPatternLength() const {
return maxPatternLength;
}

// Failure links
void build() {
std::queue<Node*> q;
Expand Down
45 changes: 45 additions & 0 deletions mfind/helpers/chunkScanner.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef CHUNK_SCANNER_H
#define CHUNK_SCANNER_H

#include <string>
#include <functional>
#include <vector>
#include <thread>
#include <future>
#include <algorithm>
#include "../aho/ahoCorasick.h"
#include "formatHelper.h"

// Splits an in-memory chunk into slices and searches them concurrently with
// an AhoCorasick automaton.
class ChunkScanner {
public:
    // Searches `chunk` with `ac` and reports each match through `callback`.
    //
    // chunk        text to search.
    // ac           automaton used for searching; only search() and
    //              getMaxPatternLength() are called on it.
    // callback     invoked as callback(absolutePosition, word, context), where
    //              absolutePosition = baseOffset + position of the match in
    //              `chunk`. It is called from the worker tasks, so calls may
    //              run concurrently and arrive in any order; the callback
    //              must tolerate that.
    // baseOffset   added to every reported position.
    // threadCount  desired number of slices. 0 is treated as 1, because
    //              std::thread::hardware_concurrency() is allowed to return 0.
    // contextSize  forwarded to extractContext().
    //
    // Blocks until every slice has been processed. An exception thrown by a
    // worker (including one thrown by `callback`) is rethrown here by
    // future::get().
    static void scan(const std::string& chunk,
                     const AhoCorasick& ac,
                     std::function<void(size_t, const std::string&, const std::string&)> callback,
                     size_t baseOffset = 0,
                     size_t threadCount = std::thread::hardware_concurrency(),
                     size_t contextSize = 30) {
        const size_t chunkLength = chunk.size();
        if (chunkLength == 0) return;

        // Guard the division below: hardware_concurrency() may legitimately be 0.
        const size_t workers = std::max<size_t>(threadCount, 1);
        // Never go below 4096 bytes per slice so small chunks do not spawn a
        // task per handful of bytes.
        const size_t sliceSize = std::max<size_t>(chunkLength / workers, 4096);
        // Each slice reads this many extra bytes past its own region so that a
        // match straddling the boundary is still visible to the slice that owns
        // its first byte.
        const size_t lookahead = ac.getMaxPatternLength();

        std::vector<std::future<void>> futures;
        futures.reserve((chunkLength + sliceSize - 1) / sliceSize);

        for (size_t start = 0; start < chunkLength; start += sliceSize) {
            const size_t end = std::min(chunkLength, start + sliceSize + lookahead);

            // Capturing by reference is safe: every future is joined (or, on an
            // exception, blocks in its destructor) before scan() returns.
            futures.push_back(std::async(std::launch::async, [&, start, end, sliceSize] {
                const std::string slice = chunk.substr(start, end - start);
                const auto matches = ac.search(slice);
                for (const auto& [localPos, word] : matches) {
                    // A slice owns only matches that begin inside its first
                    // sliceSize bytes. Matches beginning in the lookahead
                    // region are also found by the next slice, so reporting
                    // them here would emit duplicates.
                    // NOTE(review): assumes search() reports the start offset
                    // of each match, as the extractContext call implies.
                    if (static_cast<size_t>(localPos) >= sliceSize) continue;

                    const size_t inChunk = start + localPos;
                    const std::string context =
                        extractContext(chunk, inChunk, word.length(), contextSize);
                    callback(baseOffset + inChunk, word, context);
                }
            }));
        }

        // Wait for all slices; get() also propagates any worker exception.
        for (auto& f : futures) f.get();
    }
};

#endif
38 changes: 16 additions & 22 deletions mfind/helpers/searchHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,25 @@

#include <fstream>
#include <string>
#include <vector>
#include <iostream>
#include <functional>
#include "../aho/ahoCorasick.h"
#include "configHelper.h"
#include "formatHelper.h"
#include "chunkScanner.h"

inline void searchFile(const std::string& filename,
const AhoCorasick& ac,
std::function<void(const std::string&, size_t, const std::string&, const std::string&)> onMatch) {
std::function<void(const std::string&, size_t, const std::string&, const std::string&)> callback,
size_t chunkSize = 8192) {
std::ifstream file(filename, std::ios::binary);
if (!file.is_open()) return;
if (!file.is_open()) {
std::cerr << "Error: Could not open file " << filename << std::endl;
return;
}

const size_t overlap = 100;
const size_t chunkSize = 8192;
size_t overlap = ac.getMaxPatternLength() - 1;
std::vector<char> buffer(chunkSize + overlap);

size_t offset = 0;

while (!file.eof()) {
if (offset != 0) {
std::copy(buffer.end() - overlap, buffer.end(), buffer.begin());
Expand All @@ -28,21 +31,12 @@ inline void searchFile(const std::string& filename,
std::streamsize bytesRead = file.gcount();
if (bytesRead == 0) break;

size_t totalSize = (offset == 0 ? bytesRead : bytesRead + overlap);
std::string chunkText(buffer.data(), totalSize);
size_t total = (offset == 0 ? bytesRead : bytesRead + overlap);
std::string chunk(buffer.data(), total);

auto matches = ac.search(chunkText);
for (const auto& [pos, word] : matches) {
size_t absPos = offset + pos;

// Get context
size_t ctxSize = 30;
size_t ctxStart = (pos > ctxSize) ? (pos - ctxSize) : 0;
size_t ctxEnd = std::min(pos + word.length() + ctxSize, chunkText.length());
std::string context = chunkText.substr(ctxStart, ctxEnd - ctxStart);

onMatch(filename, absPos, word, context);
}
ChunkScanner::scan(chunk, ac, [&](size_t pos, const std::string& word, const std::string& context) {
callback(filename, pos, word, context);
}, offset);

offset += bytesRead;
}
Expand Down
Binary file modified mfind/mfind
Binary file not shown.
10 changes: 6 additions & 4 deletions mfind/mfind.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
#include <iostream>
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>
#include <string>
#include <unordered_map>
#include <algorithm>
#include "../aho/ahoCorasick.h"
#include "../helpers/searchHelper.h"
#include "../helpers/formatHelper.h"
#include "../helpers/configHelper.h"
#include "ahoCorasick.h"
#include "walkers/walker.h"
#include "queues/fileQueue.h"
#include "workers/workerPool.h"
#include "../helpers/searchHelper.h"

int main(int argc, char* argv[]) {
if (argc < 3) {
Expand Down Expand Up @@ -44,6 +46,6 @@ int main(int argc, char* argv[]) {
walker.join();
fileQueue.setFinished();
pool.join();

return 0;
}