Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@
*.app

# Other
helpers/*.txt
find/*.txt
find/data/*
multi-find/data/*
.vscode/
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,26 @@ lil side project for implementing:
## setup
- clone
- run with
- `clang++ main.cpp -o findr`
- `clang++ find.cpp -o find`
- `./find`

- for std::filesystem
- `clang++ -std=c++17 -stdlib=libc++ find.cpp -o find`
- `./find`

## deep dive

keyword search
- [X] "boyer-moore" on file specified to find instances where keywords exists
- [ ] read file into chunks
- [ ] deploy worker threads and run boyer moore in parallel across the file
- [ ] replace feature (tbd)
- [X] read file into chunks
- [X] recursively detect presence of keyword across a whole directory
- [ ] "replace feature"
- [ ] deploy worker threads and run boyer moore in parallel across the file (tbd)

multi keyword search
- "aho-corasick" to determine existence of multiple keywords
- replace tbd
- for large files we load as chunks, figure out the in b/w
multi-keyword search
- [ ] "aho-corasick" to determine existence of multiple keywords
- [ ] read file into chunks
- [ ] "replace feature"

file search
- "fuzzy-search" over entire dir
- [ ] "fuzzy-search" over entire dir
8 changes: 4 additions & 4 deletions find/boyerMoore.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class BoyerMoore {
preprocessGoodSuffixTable();
}

int find(const std::string& text) {
int find(const std::string& text) const {
int m = pattern.length();
int n = text.length();
if (m == 0 || m > n) return -1;
Expand All @@ -87,7 +87,7 @@ class BoyerMoore {
if (j < 0) {
return s; // Match found
} else {
int badCharShift = std::max(1, j - badCharTable[text[s + j]]);
int badCharShift = std::max(1, j - badCharTable.at(text[s + j]));
int goodSuffixShift = goodSuffixTable[j];
s += std::max(badCharShift, goodSuffixShift);
}
Expand All @@ -96,7 +96,7 @@ class BoyerMoore {
return -1;
}

std::vector<size_t> findAll(const std::string& text) {
std::vector<size_t> findAll(const std::string& text) const {
std::vector<size_t> positions;
int m = pattern.length();
int n = text.length();
Expand All @@ -113,7 +113,7 @@ class BoyerMoore {
positions.push_back(s);
s += goodSuffixTable[0]; // Shift after match
} else {
int badCharShift = std::max(1, j - badCharTable[text[s + j]]);
int badCharShift = std::max(1, j - badCharTable.at(text[s + j]));
int goodSuffixShift = goodSuffixTable[j];
s += std::max(badCharShift, goodSuffixShift);
}
Expand Down
Binary file modified find/find
Binary file not shown.
26 changes: 20 additions & 6 deletions find/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
#include <string>
#include <vector>
#include <fstream>
#include <filesystem>
#include "boyerMoore.h"

void searchFile(const std::string& filename, const std::string& searchPattern, size_t chunkSize = 8192) {
void searchFile(const std::string& filename, const BoyerMoore& bm, const std::string& searchPattern, size_t chunkSize = 8192) {
std::ifstream file(filename, std::ios::binary);
if (!file.is_open()) {
std::cerr << "Error: Could not open file " << filename << std::endl;
return;
}

BoyerMoore bm(searchPattern);
const size_t overlap = searchPattern.size() - 1;
std::vector<char> buffer(chunkSize + overlap);

Expand Down Expand Up @@ -70,15 +70,29 @@ void searchFile(const std::string& filename, const std::string& searchPattern, s

int main(int argc, char* argv[]) {
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <filename> <search_pattern>" << std::endl;
std::cerr << "Usage: " << argv[0] << " <path> <search_pattern>" << std::endl;
return 1;
}

std::string filename = argv[1];
std::string path = argv[1];
std::string pattern = argv[2];

std::cout << "**Searching for '" << pattern << "' in " << filename << " using chunked read...**\n" << std::endl;
searchFile(filename, pattern);
const BoyerMoore bm(pattern);

if (std::filesystem::is_regular_file(path)) {
std::cout << "**Searching for '" << pattern << "' in file: " << path << "**\n\n";
searchFile(path, bm, pattern);
} else if (std::filesystem::is_directory(path)) {
for (const auto& entry : std::filesystem::recursive_directory_iterator(path)) {
if (entry.is_regular_file()) {
std::cout << "**Searching for '" << pattern << "' in file: " << entry.path() << "**\n\n";
searchFile(entry.path().string(), bm, pattern);
}
}
} else {
std::cerr << "Error: Path is not a file or directory.\n";
return 1;
}

return 0;
}
22 changes: 0 additions & 22 deletions find/sample.txt

This file was deleted.

116 changes: 116 additions & 0 deletions multi-find/ahoCorasick.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#ifndef AHOCORASICK_H
#define AHOCORASICK_H

#include <unordered_map>
#include <vector>
#include <queue>
#include <string>
#include <functional>

/// Multi-keyword matcher built on an Aho-Corasick automaton.
///
/// Usage: call addKeyword() for every keyword, then build() once, then
/// search() as often as needed. build() may be called again after adding
/// more keywords; it recomputes all failure links and outputs from scratch.
///
/// The automaton owns its trie nodes through raw pointers, so the class is
/// non-copyable (a copy would free the same nodes twice).
class AhoCorasick {
private:
    struct Node {
        std::unordered_map<char, Node*> children;
        Node* fail = nullptr;
        std::string word;                 // keyword ending exactly at this node; empty if none
        std::vector<std::string> output;  // every keyword that ends at this position in the text
    };

    Node* root;
    std::vector<std::string> insertedKeywords;

    // Reset a node's output to just the keyword (if any) that ends on it.
    static void resetOutput(Node* node) {
        node->output.clear();
        if (!node->word.empty()) {
            node->output.push_back(node->word);
        }
    }

public:
    AhoCorasick() : root(new Node()) {
        root->fail = root;
    }

    // The trie is owned via raw pointers: forbid copies to prevent double frees.
    AhoCorasick(const AhoCorasick&) = delete;
    AhoCorasick& operator=(const AhoCorasick&) = delete;

    ~AhoCorasick() {
        // Iterative teardown: trie depth equals the longest keyword length,
        // so an explicit stack avoids deep recursion on long keywords.
        std::vector<Node*> pending;
        pending.push_back(root);
        while (!pending.empty()) {
            Node* node = pending.back();
            pending.pop_back();
            for (const auto& entry : node->children) {
                pending.push_back(entry.second);
            }
            delete node;
        }
    }

    /// Add a single keyword into the trie.
    /// Empty keywords and keywords that were already added are ignored.
    void addKeyword(const std::string& keyword) {
        if (keyword.empty()) {
            return;  // an empty keyword would "match" at every position
        }

        Node* node = root;
        for (char ch : keyword) {
            auto it = node->children.find(ch);
            if (it == node->children.end()) {
                Node* created = new Node();
                // Point at the root until build() computes the real link, so
                // search() never follows a null failure pointer.
                created->fail = root;
                it = node->children.emplace(ch, created).first;
            }
            node = it->second;
        }

        if (!node->word.empty()) {
            return;  // duplicate keyword: already registered on this node
        }
        node->word = keyword;
        node->output.push_back(keyword);
        insertedKeywords.push_back(keyword);
    }

    /// Expose inserted keywords in insertion order (e.g., for color mapping).
    const std::vector<std::string>& getKeywords() const {
        return insertedKeywords;
    }

    /// Build failure links and merged outputs for the whole trie (BFS order).
    /// Safe to call more than once; each call recomputes everything.
    void build() {
        std::queue<Node*> q;
        root->fail = root;

        // Depth-1 nodes always fail back to the root.
        for (auto& entry : root->children) {
            Node* node = entry.second;
            node->fail = root;
            resetOutput(node);
            q.push(node);
        }

        // BFS guarantees every shallower node (and therefore every failure
        // target) is finalized before the nodes that depend on it.
        while (!q.empty()) {
            Node* current = q.front();
            q.pop();

            for (auto& [ch, child] : current->children) {
                Node* fallback = current->fail;
                while (fallback != root && !fallback->children.count(ch)) {
                    fallback = fallback->fail;
                }

                const auto hit = fallback->children.find(ch);
                if (hit != fallback->children.end() && hit->second != child) {
                    child->fail = hit->second;
                } else {
                    child->fail = root;
                }

                // Own keyword first, then everything reachable via the fail link.
                resetOutput(child);
                child->output.insert(child->output.end(),
                                     child->fail->output.begin(),
                                     child->fail->output.end());

                q.push(child);
            }
        }
    }

    /// Search for matches in the given text.
    /// @return (start position, matched keyword) pairs, ordered by the
    ///         position at which each match ends.
    /// Results are complete only if build() was called after the last addKeyword().
    std::vector<std::pair<size_t, std::string>> search(const std::string& text) const {
        std::vector<std::pair<size_t, std::string>> results;
        const Node* node = root;

        for (size_t i = 0; i < text.size(); ++i) {
            const char ch = text[i];

            // Follow failure links until some state has a transition on ch.
            auto next = node->children.find(ch);
            while (next == node->children.end() && node != root) {
                node = node->fail;
                next = node->children.find(ch);
            }
            if (next != node->children.end()) {
                node = next->second;
            }

            for (const std::string& match : node->output) {
                results.emplace_back(i + 1 - match.size(), match);
            }
        }

        return results;
    }
};

#endif // AHOCORASICK_H
Binary file added multi-find/mfind
Binary file not shown.
111 changes: 111 additions & 0 deletions multi-find/mfind.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#include <iostream>
#include <filesystem>
#include <fstream>
#include <vector>
#include <string>
#include <unordered_map>
#include <algorithm>
#include "ahoCorasick.h"

// ANSI escape sequences used to highlight matched keywords.
// Keywords are assigned colors from this palette in order, wrapping around
// when there are more keywords than colors. Read-only, hence const.
const std::vector<std::string> colors = {
    "\033[31m", // Red
    "\033[32m", // Green
    "\033[33m", // Yellow
    "\033[34m", // Blue
    "\033[35m", // Magenta
    "\033[36m", // Cyan
};

// ANSI escape sequence that restores the terminal's default text attributes.
const std::string resetColor = "\033[0m";

void searchFile(const std::string& filename, const AhoCorasick& ac,
const std::unordered_map<std::string, std::string>& colorMap,
size_t chunkSize = 8192) {

std::ifstream file(filename, std::ios::binary);
if (!file.is_open()) {
std::cerr << "Error: Could not open file " << filename << std::endl;
return;
}

const size_t overlap = 100;
std::vector<char> buffer(chunkSize + overlap);
size_t offset = 0;

while (!file.eof()) {
if (offset != 0) {
std::copy(buffer.end() - overlap, buffer.end(), buffer.begin());
}

file.read(buffer.data() + (offset != 0 ? overlap : 0), chunkSize);
std::streamsize bytesRead = file.gcount();
if (bytesRead == 0) break;

size_t totalSize = (offset == 0 ? bytesRead : bytesRead + overlap);
std::string chunkText(buffer.data(), totalSize);

auto matches = ac.search(chunkText);
for (const auto& [pos, word] : matches) {
size_t absolutePos = offset + pos;

// Context window
size_t contextSize = 30;
size_t start = (pos > contextSize) ? (pos - contextSize) : 0;
size_t end = std::min(pos + word.length() + contextSize, chunkText.length());
std::string context = chunkText.substr(start, end - start);

// Color match
auto it = colorMap.find(word);
std::string color = (it != colorMap.end()) ? it->second : "\033[37m";

// Output format: filename | byte offset | colored word | context
std::cout << filename
<< " | byte " << absolutePos
<< " | " << color << word << resetColor
<< " | ..." << context << "..." << std::endl;
}

offset += bytesRead;
}

file.close();
}

/// Entry point: mfind <path> <keyword1> [keyword2] ...
///
/// Builds an Aho-Corasick automaton from the keywords, assigns each keyword a
/// highlight color, then searches either the single file at <path> or, when
/// <path> is a directory, every regular file below it.
///
/// @return 0 on success; 1 on bad usage, an invalid path, or a filesystem error.
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <path> <keyword1> [keyword2] ...\n";
        return 1;
    }

    const std::string path = argv[1];
    AhoCorasick ac;

    // Add keywords to Aho-Corasick; an empty argument cannot be searched for.
    for (int i = 2; i < argc; ++i) {
        if (argv[i][0] != '\0') {
            ac.addKeyword(argv[i]);
        }
    }
    ac.build();

    // Assign colors to each keyword, cycling through the palette.
    std::unordered_map<std::string, std::string> colorMap;
    size_t colorIndex = 0;
    for (const auto& word : ac.getKeywords()) {
        colorMap[word] = colors[colorIndex % colors.size()];
        ++colorIndex;
    }

    namespace fs = std::filesystem;
    try {
        if (fs::is_regular_file(path)) {
            searchFile(path, ac, colorMap);
        } else if (fs::is_directory(path)) {
            // Skip unreadable directories instead of aborting the whole walk.
            const auto options = fs::directory_options::skip_permission_denied;
            for (const auto& entry : fs::recursive_directory_iterator(path, options)) {
                if (entry.is_regular_file()) {
                    searchFile(entry.path().string(), ac, colorMap);
                }
            }
        } else {
            std::cerr << "Invalid path.\n";
            return 1;
        }
    } catch (const fs::filesystem_error& e) {
        // Filesystem queries and directory iteration report failures by throwing.
        std::cerr << "Error: " << e.what() << '\n';
        return 1;
    }

    return 0;
}