aj1126 · aj1126 · Jun 14, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,32 @@
+name: Node.js CI Pipeline
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+permissions:
+  contents: read  # least privilege: the job only reads the repository
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        node-version: [20.x, 22.x]  # one job per listed Node.js release line
+
+    steps:
+    - uses: actions/checkout@v4  # v3 targets the retired Node 16 action runtime
+
+    - name: Use Node.js ${{ matrix.node-version }}
+      uses: actions/setup-node@v4
+      with:
+        node-version: ${{ matrix.node-version }}
+        cache: 'npm'  # reuse the npm download cache between runs
+
+    - name: Clean Install and Test
+      run: |
+        npm ci
+        npm test
+        npm run docs:check
diff --git a/.gitignore b/.gitignore
@@ -32,6 +32,7 @@ Thumbs.db
 .env.local
 copilot-chat-history.json
 *.traineddata
+.analytics_cache.json
 
 # =========================
 # Bot Specific: Data & Media

diff --git a/.husky/pre-commit b/.husky/pre-commit
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+
+npm run docs:generate || exit 1  # regenerate docs; abort the commit on failure
+git add docs/ || exit 1  # stage docs/ changes; NOTE(review): README.md also has GENERATED sections — confirm whether it must be staged too
+npm test || exit 1  # abort the commit if the test suite fails
+npm run docs:check || exit 1  # abort the commit if the docs check fails
diff --git a/README.md b/README.md
@@ -80,14 +80,17 @@ The current Node ingestion pipeline only analyzes text-oriented files.
 | `.csv` | Ingested by the active Node pipeline |
 | `.log` | Ingested by the active Node pipeline |
 | `.pdf` | Ingested by the active Node pipeline |
+| `.png` | Ingested by the active Node pipeline |
+| `.jpg` | Ingested by the active Node pipeline |
+| `.jpeg` | Ingested by the active Node pipeline |
 <!-- GENERATED:supported-file-types:END -->
 
 ## Repository Layout
 
 <!-- GENERATED:repo-layout:START -->
 - `src/index.js` — Node CLI entry point.
 - `src/pipeline.js` — Pipeline coordinator that assembles all analytics tiers.
-- `src/ingestion/file-ingestion.js` — Read-only recursive file ingestion for supported text files.
+- `src/ingestion/file-ingestion.js` — Read-only recursive file ingestion for supported files.
 - `src/analytics/` — Descriptive, diagnostic, predictive, and prescriptive analytics modules.
 - `test/pipeline.test.js` — Node test coverage for core pipeline behavior.
 - `docs/architecture.md` — Hand-authored architecture overview for current and planned system design.
@@ -125,6 +128,99 @@ The bot must never modify, move, or delete ingested source files. Ingestion is r
 - When adding analytics, classify behavior under one of the four analytics tiers.
 - Update [docs/architecture.md](docs/architecture.md) when implementation changes affect current-vs-planned system boundaries.
 
+
+<br>
+
+
+
+## ⚙️ Installation & Setup
+
+**Prerequisites:** Ensure you have [Node.js](https://nodejs.org/) installed (version 18, 20, or 22+ recommended).
+
+1. **Clone the repository:**
+```bash
+git clone https://github.com/aj1126/uap_analyticsbot.git
+cd uap_analyticsbot
+
+```
+
+
+2. **Install dependencies:**
+This project installs as a standard Node.js CLI package, so there are no extra native build steps required for the current worker-thread ingestion flow. Simply run:
+```bash
+npm install
+
+```
+
+
+3. **Verify the installation:**
+Run the local test suite to ensure the multithreaded worker pool and caching engine are functioning correctly on your machine:
+```bash
+npm test
+
+```
+
+
+*(If all tests pass green, you are ready to start analyzing documents!)*
+
+
+---
+
+<br>
+
+
+
+
+## Usage
+
+
+To run the AnalyticsBot, pass the target directory containing your supported files (see the Supported File Types table above) as the first argument:
+
+```bash
+node src/index.js ./my_folder/
+
+```
+
+By default, this will parse the documents and output a formatted JSON report directly to your console.
+
+### 👀 Watch Mode
+
+Keep the pipeline running in the background. It will automatically re-analyze the documents and recompute the analytics report whenever you add, edit, or delete a file in the target directory:
+
+```bash
+node src/index.js ./my_folder/ --watch
+
+```
+
+### 🖨️ Report Generation
+
+Instead of dumping JSON directly to the console, you can generate formatted report files that are automatically saved to the `/data_exports/` directory:
+
+```bash
+node src/index.js ./my_folder/ --format=md
+
+```
+
+*(Supports `md` for Markdown or `csv` for spreadsheet datasets).*
+
+
+---
+<br>
+
+### 🚀 Advanced Usage
+
+The v1.2.0 AnalyticsBot engine supports multithreaded ingestion (Node.js worker threads) and result caching. You can control these via CLI arguments:
+
+* `node src/index.js ./my_folder --workers=4` : Manually set the number of Node.js worker threads (defaults to max CPU cores).
+* `node src/index.js ./my_folder --clear-cache` : Bypasses the `.analytics_cache.json` file and forces a fresh read of all documents.
+* `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file.
+
+<br>
+<br>
+<br>
+
+
+
 ## 🚀 Planned Technical Optimizations
 
 ### 1. Performance & Infrastructure

diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md
@@ -56,14 +56,16 @@ npm start -- "C:\Path\To\Folder" > analytics_report.json
 
 ## Supported File Types
 
-Currently, the ingestion engine natively parses the following text-based extensions:
+Currently, the ingestion engine natively parses the following extensions:
 * `.txt`
 * `.md`
 * `.json`
 * `.csv`
 * `.log`
-
-*(Note: Binary and multimedia extraction, such as PDF parsing and Image OCR, are tracked for a future development stage).*
+* `.pdf`
+* `.png`
+* `.jpg`
+* `.jpeg`
 
 ## Testing & Validation
 

diff --git a/docs/architecture.md b/docs/architecture.md
@@ -4,26 +4,31 @@
 
 The repository currently ships a Node.js CLI-centered analytics flow:
 
-1. **CLI Orchestrator (`src/index.js`)** resolves the source directory and writes the final report to stdout.
-2. **Read-Only Ingestion (`src/ingestion/file-ingestion.js`)** recursively scans supported text files, streams file content, and extracts words, dates, locations, and filesystem metadata.
+1. **CLI Orchestrator (`src/index.js`)** resolves the source directory, supports watch mode, and routes report output to stdout or export files.
+2. **Read-Only Ingestion (`src/ingestion/file-ingestion.js`)** recursively scans supported files, dispatches parsing work to Node.js worker threads, memoizes compatible results in `.analytics_cache.json`, and extracts words, dates, locations, and filesystem metadata.
 3. **Analytics Pipeline (`src/pipeline.js`)** builds the descriptive, diagnostic, predictive, and prescriptive tiers from the ingested file set.
-4. **Output Layer** returns a single structured JSON report for the requested directory.
+4. **Output Layer** returns structured JSON or saves Markdown / CSV exports for the requested directory.
+
+### v1.2.0 Pipeline Architecture
+* **Ingestion (Multithreaded):** Utilizes Node.js `worker_threads` and file-stat fingerprinting (`.analytics_cache.json`) to bypass redundant processing and drastically speed up execution.
+* **Semantic Analytics:** Employs a TF-IDF weighting engine to filter generic stop-words and a cosine-similarity engine to automatically cluster related UAP documents based on the similarity of their term vectors.
 
 ## Current Runtime Boundaries
 
 Implemented today:
 
 - recursive read-only ingestion for `.txt`, `.md`, `.json`, `.csv`, and `.log`
+- multithreaded parsing with fingerprint-based cache reuse for compatible ingestions
 - tokenization plus lightweight date/location extraction
 - descriptive, diagnostic, predictive, and prescriptive analytics modules
-- JSON report delivery through the Node CLI
+- JSON, Markdown, and CSV report delivery through the Node CLI
+- directory watch mode that re-runs the pipeline after file changes
 
 Not yet implemented in the active system:
 
 - binary or multimedia extraction
 - Named Entity Recognition (NER)
-- dashboard or alternate export formats
-- background scheduling or directory watching
+- dashboard or background scheduling
 
 ## Planned Expansion
 

diff --git a/docs/docs-source.json b/docs/docs-source.json
@@ -26,7 +26,7 @@
       "description": "Auto-generate CHANGELOG.md, bump the semantic version, and create a Git release tag based on conventional commit history."
     }
   ],
-  "supportedFileTypes": [".txt", ".md", ".json", ".csv", ".log", ".pdf"],
+  "supportedFileTypes": [".txt", ".md", ".json", ".csv", ".log", ".pdf", ".png", ".jpg", ".jpeg"],
   "repoLayout": [
     {
       "path": "src/index.js",
@@ -38,7 +38,7 @@
     },
     {
       "path": "src/ingestion/file-ingestion.js",
-      "description": "Read-only recursive file ingestion for supported text files."
+      "description": "Read-only recursive file ingestion for supported files."
     },
     {
       "path": "src/analytics/",

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -6,9 +6,10 @@
   "main": "src/index.js",
   "scripts": {
     "start": "node src/index.js",
-    "test": "node --test",
+    "test": "node --test --experimental-test-coverage",
     "docs:generate": "node scripts/generate-docs.js",
     "docs:check": "node scripts/generate-docs.js --check && node scripts/validate-docs.js",
+    "prepare": "husky",
     "release": "commit-and-tag-version",
     "postrelease": "git push --follow-tags && gh release create v%npm_package_version% --notes-file CHANGELOG.md --title \"Release v%npm_package_version%\""
   },
@@ -29,6 +30,7 @@
     "tesseract.js": "^7.0.0"
   },
   "devDependencies": {
-    "commit-and-tag-version": "^12.7.3"
+    "commit-and-tag-version": "^12.7.3",
+    "husky": "^9.1.7"
   }
 }
diff --git a/src/analytics/descriptive.js b/src/analytics/descriptive.js
@@ -1,34 +1,43 @@
-function countBy(items) {
-    return items.reduce((counts, item) => {
-        counts[item] = (counts[item] ?? 0) + 1;
-        return counts;
-    }, {});
-}
-
 /**
  * Lists the entries of a record ordered from the largest value to the smallest.
  * Entries whose values do not differ are ordered by key via `localeCompare`.
  *
  * @param {Object} record - Object whose own enumerable entries are ranked.
  * @returns {Array} The `[key, value]` pairs in ranked order.
  */
 function sortEntriesDescending(record) {
     const rankedEntries = Object.entries(record);
     rankedEntries.sort(([keyA, valueA], [keyB, valueB]) => {
         const valueGap = valueB - valueA;
         // Truthiness check on purpose: a zero (or NaN) gap falls back to key order.
         return valueGap ? valueGap : keyA.localeCompare(keyB);
     });
     return rankedEntries;
 }
 
 function buildDescriptiveAnalytics(files) {
-    const allWords = files.flatMap((file) => file.words);
-    const allDates = files.flatMap((file) => file.dates);
-    const allLocations = files.flatMap((file) => file.locations);
+    const allDates = files.flatMap((file) => file.dates || []);
+    const allLocations = files.flatMap((file) => file.locations || []);
+
+    const globalWordFrequency = {};
+    const glossarySet = new Set();
 
-    const wordFrequency = countBy(allWords);
+    // Iterate through files using the new memory-efficient object format
+    files.forEach((file) => {
+        if (file.wordFrequency) {
+            for (const [word, count] of Object.entries(file.wordFrequency)) {
+                globalWordFrequency[word] = (globalWordFrequency[word] || 0) + count;
+                glossarySet.add(word);
+            }
+        } else if (file.words) { 
+            // Backwards compatibility layer
+            for (const word of file.words) {
+                globalWordFrequency[word] = (globalWordFrequency[word] || 0) + 1;
+                glossarySet.add(word);
+            }
+        }
+    });
 
     return {
         fileCount: files.length,
-        glossary: [...new Set(allWords)].sort(),
-        wordFrequency,
-        topWords: sortEntriesDescending(wordFrequency).slice(0, 10).map(([word, count]) => ({ word, count })),
+        glossary: [...glossarySet].sort(),
+        wordFrequency: globalWordFrequency,
+        topWords: sortEntriesDescending(globalWordFrequency).slice(0, 10).map(([word, count]) => ({ word, count })),
         dates: [...new Set(allDates)].sort(),
         locations: [...new Set(allLocations)].sort(),
         files: files.map((file) => ({
             path: file.relativePath,
-            extension: file.extension, // <-- FIX: Added extension propagation
+            extension: file.extension, 
             size: file.size,
             modifiedAt: file.modifiedAt,
-            wordCount: file.words.length,
+            wordCount: file.totalWords || (file.words ? file.words.length : 0),
             dates: file.dates,
             locations: file.locations,
             metadata: file.metadata || {}