diff --git a/.gitignore b/.gitignore index ec2c50e..2c9c809 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ *.app # Other -helpers/*.txt \ No newline at end of file +find/*.txt +find/data/* +multi-find/data/* +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index 3971c32..82a09a8 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,26 @@ lil side project for implementing: ## setup - clone - run with - - `clang++ main.cpp -o findr` + - `clang++ find.cpp -o find` + - `./findr` + +- for std::filesystem + - `clang++ -std=c++17 -stdlib=libc++ find.cpp -o find` - `./findr` ## deep dive keyword search - [X] "boyer-moore" on file specified to find instances where keywords exists -- [ ] read file into chunks -- [ ] deploy worker threads and run boyer moore in parallel across the file -- [ ] replace feature (tbd) +- [X] read file into chunks +- [X] recursively detect presence of keyword across a whole directory +- [ ] "replace feature" +- [ ] deploy worker threads and run boyer moore in parallel across the file (tbd) -multi keyword search -- "aho-corasick" to determine existence of multiple keywords -- replace tbd -- for large files we load as chunks, figure out the in b/w +multi-keyword search +- [ ] "aho-corasick" to determine existence of multiple keywords +- [ ] read file into chunks +- [ ] "replace feature" file search -- "fuzzy-search" over entire dir +- [ ] "fuzzy-search" over entire dir diff --git a/find/boyerMoore.h b/find/boyerMoore.h index 107bae5..8f4df17 100644 --- a/find/boyerMoore.h +++ b/find/boyerMoore.h @@ -72,7 +72,7 @@ class BoyerMoore { preprocessGoodSuffixTable(); } - int find(const std::string& text) { + int find(const std::string& text) const { int m = pattern.length(); int n = text.length(); if (m == 0 || m > n) return -1; @@ -87,7 +87,7 @@ class BoyerMoore { if (j < 0) { return s; // Match found } else { - int badCharShift = std::max(1, j - badCharTable[text[s + j]]); + int badCharShift = std::max(1, j - badCharTable.at(text[s + j])); int goodSuffixShift = goodSuffixTable[j]; s += std::max(badCharShift, goodSuffixShift); } @@ -96,7 +96,7 @@ class BoyerMoore { return -1; } - std::vector findAll(const std::string& text) { + std::vector findAll(const std::string& text) const { std::vector positions; int m = pattern.length(); int n = text.length(); @@ -113,7 +113,7 @@ class BoyerMoore { positions.push_back(s); s += goodSuffixTable[0]; // Shift after match } else { - int badCharShift = std::max(1, j - badCharTable[text[s + j]]); + int badCharShift = std::max(1, j - badCharTable.at(text[s + j])); int goodSuffixShift = goodSuffixTable[j]; s += std::max(badCharShift, goodSuffixShift); } diff --git a/find/find b/find/find index 59bb8d2..38a598d 100755 Binary files a/find/find and b/find/find differ diff --git a/find/find.cpp b/find/find.cpp index fc97c5e..8ffcaa0 100644 --- a/find/find.cpp +++ b/find/find.cpp @@ -2,16 +2,16 @@ #include #include #include +#include #include "boyerMoore.h" -void searchFile(const std::string& filename, const std::string& searchPattern, size_t chunkSize = 8192) { +void searchFile(const std::string& filename, const BoyerMoore& bm, const std::string& searchPattern, size_t chunkSize = 8192) { std::ifstream file(filename, std::ios::binary); if (!file.is_open()) { std::cerr << "Error: Could not open file " << filename << std::endl; return; } - BoyerMoore bm(searchPattern); const size_t overlap = searchPattern.size() - 1; std::vector buffer(chunkSize + overlap); @@ -70,15 +70,29 @@ void searchFile(const std::string& filename, const std::string& searchPattern, s int main(int argc, char* argv[]) { if (argc != 3) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; + std::cerr << "Usage: " << argv[0] << " " << std::endl; return 1; } - std::string filename = argv[1]; + std::string path = argv[1]; std::string pattern = argv[2]; - std::cout << "**Searching for '" << pattern << "' in " << filename << " using chunked read...**\n" << std::endl; - searchFile(filename, pattern); + const BoyerMoore bm(pattern); + + if (std::filesystem::is_regular_file(path)) { + std::cout << "**Searching for '" << pattern << "' in file: " << path << "**\n\n"; + searchFile(path, bm, pattern); + } else if (std::filesystem::is_directory(path)) { + for (const auto& entry : std::filesystem::recursive_directory_iterator(path)) { + if (entry.is_regular_file()) { + std::cout << "**Searching for '" << pattern << "' in file: " << entry.path() << "**\n\n"; + searchFile(entry.path().string(), bm, pattern); + } + } + } else { + std::cerr << "Error: Path is not a file or directory.\n"; + return 1; + } return 0; } diff --git a/find/sample.txt b/find/sample.txt deleted file mode 100644 index b6bc543..0000000 --- a/find/sample.txt +++ /dev/null @@ -1,22 +0,0 @@ -The Forgotten Library -Alice discovered the old library on a rainy Tuesday afternoon. The weathered oak door creaked as she pushed it open, revealing towering bookshelves that seemed to stretch endlessly toward the ceiling. Dust particles danced in the beams of light that filtered through stained glass windows. -"Hello?" she called, her voice echoing through the cavernous space. No answer came. -The library appeared abandoned, yet strangely well-preserved. Books of all sizes lined the shelves – leather-bound tomes with gold lettering, paperbacks with cracked spines, and ancient scrolls carefully stored in glass cases. Alice ran her fingers along the spines, leaving trails in the dust. -She selected a book at random, a midnight-blue volume with silver constellations embossed on its cover. The moment she opened it, the library seemed to shift around her. The light changed, becoming warmer, more golden. Had the rain stopped? -Alice became so absorbed in the book that she didn't notice the elderly librarian who had silently appeared beside her. -"That one's special," he said, causing Alice to nearly drop the book in surprise. -"I'm sorry," she stammered. "I didn't think anyone was here." -The librarian smiled kindly. "The library is always here for those who truly seek knowledge. Most people walk past without ever seeing the door." -Alice looked back toward the entrance, suddenly uncertain how she had found this place. "What do you mean? This building is huge. How could anyone miss it?" -"Perception," the librarian said, "is a curious thing. People often see only what they expect to see." -He gestured toward the book in her hands. "That volume contains constellations that no longer exist in our night sky. Maps to stars that have burned out centuries ago." -Alice opened the book again, marveling at the intricate star maps and handwritten notes in the margins. "Who wrote these notes?" -"Travelers," the librarian replied. "Like yourself." -"I'm not a traveler," Alice said. "I'm just... lost." -The librarian's eyes twinkled with amusement. "Isn't that where all great journeys begin? Being a little lost?" -Alice spent hours in the library that day, moving from book to book. Each volume seemed to contain impossible knowledge – histories of civilizations that never existed, scientific principles that defied known physics, maps to places that couldn't possibly be real. -When she finally decided to leave, the rain had indeed stopped. The setting sun cast long shadows on the cobblestone street. -"You'll come back?" the librarian asked, standing in the doorway. -Alice nodded, certain that she would return to the library. But as she walked home, something strange happened. When she turned back to look at the building, it was gone. In its place stood an ordinary brick wall between a coffee shop and a boutique clothing store. -She blinked in confusion, wondering if she had somehow taken a wrong turn. The book she had borrowed from the library – the one with constellations – was still tucked under her arm, solid and real. -Alice smiled. She would find the library again. After all, as the librarian had said, perception is a curious thing. Sometimes you just need to learn how to see. diff --git a/multi-find/ahoCorasick.h b/multi-find/ahoCorasick.h new file mode 100644 index 0000000..36d4068 --- /dev/null +++ b/multi-find/ahoCorasick.h @@ -0,0 +1,116 @@ +#ifndef AHOCORASICK_H +#define AHOCORASICK_H + +#include +#include +#include +#include +#include + +class AhoCorasick { +private: + struct Node { + std::unordered_map children; + Node* fail = nullptr; + std::vector output; // matched keywords at this node + }; + + Node* root; + std::vector insertedKeywords; + +public: + AhoCorasick() { + root = new Node(); + } + + ~AhoCorasick() { + std::function cleanup = [&](Node* node) { + for (auto& [_, child] : node->children) { + cleanup(child); + } + delete node; + }; + cleanup(root); + } + + // Add a single keyword into the trie + void addKeyword(const std::string& keyword) { + Node* node = root; + for (char ch : keyword) { + if (!node->children.count(ch)) { + node->children[ch] = new Node(); + } + node = node->children[ch]; + } + node->output.push_back(keyword); + insertedKeywords.push_back(keyword); + } + + // Expose inserted keywords (e.g., for color mapping) + const std::vector& getKeywords() const { + return insertedKeywords; + } + + // Build failure links for the trie + void build() { + std::queue q; + root->fail = root; + + // Set level 1 fail links to root + for (auto& [ch, node] : root->children) { + node->fail = root; + q.push(node); + } + + // BFS through the trie + while (!q.empty()) { + Node* current = q.front(); q.pop(); + + for (auto& [ch, child] : current->children) { + Node* fallback = current->fail; + while (fallback != root && !fallback->children.count(ch)) { + fallback = fallback->fail; + } + + if (fallback->children.count(ch) && fallback->children[ch] != child) { + child->fail = fallback->children[ch]; + } else { + child->fail = root; + } + + // Merge output from fail link + for (const std::string& kw : child->fail->output) { + child->output.push_back(kw); + } + + q.push(child); + } + } + } + + // Search for matches in the given text, return (position, matched word) pairs + std::vector> search(const std::string& text) const { + std::vector> results; + Node* node = root; + + for (size_t i = 0; i < text.size(); ++i) { + char ch = text[i]; + + while (node != root && !node->children.count(ch)) { + node = node->fail; + } + + if (node->children.count(ch)) { + node = node->children[ch]; + } + + for (const std::string& match : node->output) { + results.emplace_back(i + 1 - match.size(), match); + } + } + + return results; + } +}; + +#endif // AHOCORASICK_H diff --git a/multi-find/mfind b/multi-find/mfind new file mode 100755 index 0000000..80a6cc2 Binary files /dev/null and b/multi-find/mfind differ diff --git a/multi-find/mfind.cpp b/multi-find/mfind.cpp new file mode 100644 index 0000000..36967f8 --- /dev/null +++ b/multi-find/mfind.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ahoCorasick.h" + +// ANSI colors +std::vector colors = { + "\033[31m", // Red + "\033[32m", // Green + "\033[33m", // Yellow + "\033[34m", // Blue + "\033[35m", // Magenta + "\033[36m", // Cyan +}; + +std::string resetColor = "\033[0m"; + +void searchFile(const std::string& filename, const AhoCorasick& ac, + const std::unordered_map& colorMap, + size_t chunkSize = 8192) { + + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Could not open file " << filename << std::endl; + return; + } + + const size_t overlap = 100; + std::vector buffer(chunkSize + overlap); + size_t offset = 0; + + while (!file.eof()) { + if (offset != 0) { + std::copy(buffer.end() - overlap, buffer.end(), buffer.begin()); + } + + file.read(buffer.data() + (offset != 0 ? overlap : 0), chunkSize); + std::streamsize bytesRead = file.gcount(); + if (bytesRead == 0) break; + + size_t totalSize = (offset == 0 ? bytesRead : bytesRead + overlap); + std::string chunkText(buffer.data(), totalSize); + + auto matches = ac.search(chunkText); + for (const auto& [pos, word] : matches) { + size_t absolutePos = offset + pos; + + // Context window + size_t contextSize = 30; + size_t start = (pos > contextSize) ? (pos - contextSize) : 0; + size_t end = std::min(pos + word.length() + contextSize, chunkText.length()); + std::string context = chunkText.substr(start, end - start); + + // Color match + auto it = colorMap.find(word); + std::string color = (it != colorMap.end()) ? it->second : "\033[37m"; + + // Output format: filename | byte offset | colored word | context + std::cout << filename + << " | byte " << absolutePos + << " | " << color << word << resetColor + << " | ..." << context << "..." << std::endl; + } + + offset += bytesRead; + } + + file.close(); +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " [keyword2] ...\n"; + return 1; + } + + std::string path = argv[1]; + AhoCorasick ac; + + // Add keywords to Aho-Corasick + for (int i = 2; i < argc; ++i) { + ac.addKeyword(argv[i]); + } + ac.build(); + + // Assign colors to each keyword + std::unordered_map colorMap; + int colorIndex = 0; + for (const auto& word : ac.getKeywords()) { + colorMap[word] = colors[colorIndex++ % colors.size()]; + } + + if (std::filesystem::is_regular_file(path)) { + searchFile(path, ac, colorMap); + } else if (std::filesystem::is_directory(path)) { + for (const auto& entry : std::filesystem::recursive_directory_iterator(path)) { + if (entry.is_regular_file()) { + searchFile(entry.path().string(), ac, colorMap); + } + } + } else { + std::cerr << "Invalid path.\n"; + return 1; + } + + return 0; +}