From 76b1450452ced5582a2bd283f1292c912e8fbfc0 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 14:42:02 -0400 Subject: [PATCH 01/15] feat: Enhance analytics with TF-IDF calculation and CSV report generation - Added TF-IDF analysis to diagnostic analytics for keyword extraction. - Implemented CSV report generation in the delivery module. - Improved file ingestion with caching and fingerprinting for efficiency. - Enhanced predictive analytics with weighted moving average forecasting. - Updated prescriptive analytics to handle missing metadata more gracefully. - Introduced GitHub Actions CI pipeline for automated testing across multiple Node.js versions. --- .analytics_cache.json | 546 ++++++++++++++++++++++++++++++++ .github/workflows/test.yml | 29 ++ src/analytics/diagnostic.js | 49 ++- src/analytics/predictive.js | 81 ++--- src/analytics/prescriptive.js | 9 +- src/delivery/csv-generator.js | 26 ++ src/index.js | 38 +-- src/ingestion/file-ingestion.js | 47 ++- src/ingestion/worker.js | 151 ++------- src/pipeline.js | 8 +- 10 files changed, 768 insertions(+), 216 deletions(-) create mode 100644 .analytics_cache.json create mode 100644 .github/workflows/test.yml create mode 100644 src/delivery/csv-generator.js diff --git a/.analytics_cache.json b/.analytics_cache.json new file mode 100644 index 0000000..f281e3d --- /dev/null +++ b/.analytics_cache.json @@ -0,0 +1,546 @@ +{ + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-4PTpLX\\report-one.txt": { + "fingerprint": "90-1781375967879.2063", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:39:27.879Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-4PTpLX\\report-two.md": { + "fingerprint": "99-1781375967879.2063", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:39:27.879Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-MPVYbs\\partial.txt": { + "fingerprint": "42-1781375967958.6162", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T18:39:27.959Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "a", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-1SWPR4\\report-two.md": { + "fingerprint": "99-1781375967990.9768", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:39:27.991Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-1SWPR4\\report-one.txt": { + "fingerprint": "90-1781375967989.9636", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:39:27.990Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-G3uFYr\\report-two.md": { + "fingerprint": "99-1781375983792.8586", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:39:43.793Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-G3uFYr\\report-one.txt": { + "fingerprint": "90-1781375983791.832", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:39:43.792Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-PQ06mp\\partial.txt": { + "fingerprint": "42-1781375983839.6965", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T18:39:43.840Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "a", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wBxL0I\\report-two.md": { + "fingerprint": "99-1781375983870.4006", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:39:43.870Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wBxL0I\\report-one.txt": { + "fingerprint": "90-1781375983869.3723", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:39:43.869Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-pRueHf\\report-one.txt": { + "fingerprint": "90-1781376082977.1807", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:41:22.977Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-pRueHf\\report-two.md": { + "fingerprint": "99-1781376082978.2163", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:41:22.978Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-5zyTja\\partial.txt": { + "fingerprint": "42-1781376083024.2827", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T18:41:23.024Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "a", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vpSzbX\\report-two.md": { + "fingerprint": "99-1781376083054.3164", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:41:23.054Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vpSzbX\\report-one.txt": { + "fingerprint": "90-1781376083053.2993", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:41:23.053Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2X1I4F\\report-one.txt": { + "fingerprint": "90-1781376085242.656", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:41:25.243Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2X1I4F\\report-two.md": { + "fingerprint": "99-1781376085243.6675", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:41:25.244Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-Z1rQCb\\partial.txt": { + "fingerprint": "42-1781376085289.425", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T18:41:25.289Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "a", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-qeRGkt\\report-one.txt": { + "fingerprint": "90-1781376085317", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T18:41:25.317Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "in", + "roswell", + "and", + "hovered", + "over", + "the", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-qeRGkt\\report-two.md": { + "fingerprint": "99-1781376085318.0146", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T18:41:25.318Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "a", + "bright", + "triangle", + "was", + "reported", + "near", + "phoenix", + "and", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + } +} \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..fb745c4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,29 @@ +name: Node.js CI Pipeline + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + node-version: [18.x, 20.x, 22.x] + + steps: + - uses: actions/checkout@v3 + + - name: Use Node.js ${{ matrix.node-version }} + uses: actions/setup-node@v3 + with: + node-version: ${{ matrix.node-version }} + cache: 'npm' + + - name: Clean Install and Test + run: | + npm ci + npm test + npm run docs:check \ No newline at end of file diff --git a/src/analytics/diagnostic.js b/src/analytics/diagnostic.js index acbbaf6..d745b51 100644 --- a/src/analytics/diagnostic.js +++ b/src/analytics/diagnostic.js @@ -1,8 +1,5 @@ function incrementNestedCount(target, firstKey, secondKey, amount = 1) { - if (!target[firstKey]) { - target[firstKey] = {}; - } - + if (!target[firstKey]) target[firstKey] = {}; target[firstKey][secondKey] = (target[firstKey][secondKey] ?? 0) + amount; } @@ -11,9 +8,7 @@ function buildUsageRates(files, groupSelector) { for (const file of files) { const groups = groupSelector(file); - if (groups.length === 0 || file.words.length === 0) { - continue; - } + if (!groups || groups.length === 0 || !file.words || file.words.length === 0) continue; const uniqueWords = new Set(file.words); for (const group of groups) { @@ -30,19 +25,47 @@ function buildUsageRates(files, groupSelector) { .map(([word, count]) => ({ word, usageRate: Number((count / total).toFixed(4)) })) .sort((left, right) => right.usageRate - left.usageRate || left.word.localeCompare(right.word)) .slice(0, 5); - return [group, topWords]; }) ); } +// ✨ TF-IDF Engine +function calculateTFIDF(files) { + const fileCount = files.length; + const documentFrequencies = {}; + + files.forEach(file => { + if(!file.words) return; + const uniqueWords = new Set(file.words.map(w => w.toLowerCase())); + uniqueWords.forEach(word => { documentFrequencies[word] = (documentFrequencies[word] || 0) + 1; }); + }); + + return files.map(file => { + if(!file.words) return { ...file, topKeywords: [] }; + const tf = {}; + const totalWords = file.words.length; + + file.words.forEach(word => { tf[word] = (tf[word] || 0) + 1; }); + + const tfidf = Object.keys(tf).map(word => { + const termFrequency = tf[word] / totalWords; + const inverseDocumentFrequency = Math.log(fileCount / (1 + documentFrequencies[word])); + return { word, weight: termFrequency * inverseDocumentFrequency }; + }).sort((a, b) => b.weight - a.weight); + + return { ...file, topKeywords: tfidf.slice(0, 5).map(t => t.word) }; + }); +} + function buildDiagnosticAnalytics(files) { + const filesWithTFIDF = calculateTFIDF(files); + return { - wordUsageByDate: buildUsageRates(files, (file) => file.dates), - wordUsageByLocation: buildUsageRates(files, (file) => file.locations) + wordUsageByDate: buildUsageRates(files, (file) => file.dates || []), + wordUsageByLocation: buildUsageRates(files, (file) => file.locations || []), + tfIdfAnalysis: filesWithTFIDF.map(f => ({ file: f.fileName, topKeywords: f.topKeywords })) }; } -module.exports = { - buildDiagnosticAnalytics -}; +module.exports = { buildDiagnosticAnalytics }; \ No newline at end of file diff --git a/src/analytics/predictive.js b/src/analytics/predictive.js index 2634874..7cd1d31 100644 --- a/src/analytics/predictive.js +++ b/src/analytics/predictive.js @@ -1,30 +1,25 @@ -function monthKey(dateString) { - return dateString.slice(0, 7); -} - -function average(values) { - if (values.length === 0) { - return 0; - } - - return values.reduce((sum, value) => sum + value, 0) / values.length; -} +function monthKey(dateString) { return dateString.slice(0, 7); } +// ✨ Nonlinear Forecasting Tweaks (Weighted Moving Average) function forecastNextValue(series) { - if (series.length === 0) { - return 0; - } - - if (series.length === 1) { - return series[0].count; - } + if (series.length === 0) return 0; + if (series.length === 1) return series[0].count; const deltas = []; for (let index = 1; index < series.length; index += 1) { deltas.push(series[index].count - series[index - 1].count); } - return Math.max(0, Math.round(series[series.length - 1].count + average(deltas))); + let weightedSum = 0; + let weightTotal = 0; + for (let i = 0; i < deltas.length; i++) { + const weight = i + 1; // More recent intervals gain higher weight + weightedSum += deltas[i] * weight; + weightTotal += weight; + } + + const wma = weightTotal === 0 ? 0 : weightedSum / weightTotal; + return Math.max(0, Math.round(series[series.length - 1].count + wma)); } function addMonth(month) { @@ -33,33 +28,48 @@ function addMonth(month) { return nextDate.toISOString().slice(0, 7); } +// ✨ Support empty intervals +function fillEmptyIntervals(orderedMonths, timeline) { + if (orderedMonths.length === 0) return []; + const filledSeries = []; + let currentMonth = orderedMonths[0]; + const lastMonth = orderedMonths[orderedMonths.length - 1]; + + while (currentMonth <= lastMonth) { + filledSeries.push({ + month: currentMonth, + count: timeline[currentMonth] ? timeline[currentMonth].totalWords : 0 + }); + currentMonth = addMonth(currentMonth); + } + return filledSeries; +} + function buildKeywordSeries(files) { const timeline = {}; - for (const file of files) { + if(!file.modifiedAt) continue; const key = monthKey(file.modifiedAt); - if (!timeline[key]) { - timeline[key] = { totalWords: 0, locations: {} }; - } + if (!timeline[key]) timeline[key] = { totalWords: 0, locations: {} }; - timeline[key].totalWords += file.words.length; - for (const location of file.locations) { + timeline[key].totalWords += (file.words || []).length; + for (const location of (file.locations || [])) { timeline[key].locations[location] = (timeline[key].locations[location] ?? 0) + 1; } } - return timeline; } function buildPredictiveAnalytics(files) { const timeline = buildKeywordSeries(files); const orderedMonths = Object.keys(timeline).sort(); - const keywordSeries = orderedMonths.map((month) => ({ month, count: timeline[month].totalWords })); + + const keywordSeries = fillEmptyIntervals(orderedMonths, timeline); const nextMonth = orderedMonths.length > 0 ? addMonth(orderedMonths[orderedMonths.length - 1]) : new Date().toISOString().slice(0, 7); const locationTotals = {}; for (const month of orderedMonths) { - for (const [location, count] of Object.entries(timeline[month].locations)) { + for (const [location, count] of Object.entries(timeline[month]?.locations || {})) { locationTotals[location] = (locationTotals[location] ?? 0) + count; } } @@ -68,18 +78,9 @@ function buildPredictiveAnalytics(files) { .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))[0]?.[0] ?? null; return { - keywordFrequencyForecast: { - basis: keywordSeries, - forecastMonth: nextMonth, - forecastWordCount: forecastNextValue(keywordSeries) - }, - locationClusterForecast: { - basis: locationTotals, - likelyNextHotspot: topLocation - } + keywordFrequencyForecast: { basis: keywordSeries, forecastMonth: nextMonth, forecastWordCount: forecastNextValue(keywordSeries) }, + locationClusterForecast: { basis: locationTotals, likelyNextHotspot: topLocation } }; } -module.exports = { - buildPredictiveAnalytics -}; +module.exports = { buildPredictiveAnalytics }; \ No newline at end of file diff --git a/src/analytics/prescriptive.js b/src/analytics/prescriptive.js index d9ca602..c1fcde2 100644 --- a/src/analytics/prescriptive.js +++ b/src/analytics/prescriptive.js @@ -1,7 +1,8 @@ function buildPrescriptiveAnalytics(files, descriptiveAnalytics) { const missingMetadataFiles = files - .filter((file) => file.dates.length === 0 || file.locations.length === 0) - .map((file) => file.relativePath); + .filter((file) => !file.dates?.length || !file.locations?.length) + // FIX: The worker pool returns `fileName`, so we map that instead (with a fallback) + .map((file) => file.fileName || file.relativePath); const recommendations = []; @@ -13,7 +14,7 @@ function buildPrescriptiveAnalytics(files, descriptiveAnalytics) { }); } - if (descriptiveAnalytics.locations.length > 1) { + if (descriptiveAnalytics.locations && descriptiveAnalytics.locations.length > 1) { recommendations.push({ type: 'folder-restructure', message: 'Consider grouping files into location-based subfolders to improve topic clustering and navigation.', @@ -35,4 +36,4 @@ function buildPrescriptiveAnalytics(files, descriptiveAnalytics) { module.exports = { buildPrescriptiveAnalytics -}; +}; \ No newline at end of file diff --git a/src/delivery/csv-generator.js b/src/delivery/csv-generator.js new file mode 100644 index 0000000..9ba2c7a --- /dev/null +++ b/src/delivery/csv-generator.js @@ -0,0 +1,26 @@ +const fs = require('node:fs/promises'); +const path = require('node:path'); + +async function generateCsvReport(report, exportsDir) { + await fs.mkdir(exportsDir, { recursive: true }); + const csvPath = path.join(exportsDir, `report-${Date.now()}.csv`); + + let csvContent = "Category,Metric,Value\n"; + csvContent += `Descriptive,FileCount,${report.descriptive.fileCount}\n`; + + const locations = report.descriptive.locations || report.locations || []; + csvContent += `Descriptive,UniqueLocations,"${locations.join(', ')}"\n`; + + if (report.predictive?.locationClusterForecast) { + csvContent += `Predictive,LikelyNextHotspot,${report.predictive.locationClusterForecast.likelyNextHotspot}\n`; + } + if (report.predictive?.keywordFrequencyForecast) { + csvContent += `Predictive,ForecastMonth,${report.predictive.keywordFrequencyForecast.forecastMonth}\n`; + csvContent += `Predictive,ForecastWordCount,${report.predictive.keywordFrequencyForecast.forecastWordCount}\n`; + } + + await fs.writeFile(csvPath, csvContent, 'utf-8'); + return csvPath; +} + +module.exports = { generateCsvReport }; \ No newline at end of file diff --git a/src/index.js b/src/index.js index 97b52f8..930b90a 100644 --- a/src/index.js +++ b/src/index.js @@ -3,14 +3,19 @@ const path = require('node:path'); const chokidar = require('chokidar'); const { generateAnalyticsReport } = require('./pipeline'); const { generateMarkdownReport } = require('./delivery/markdown-generator'); +const { generateCsvReport } = require('./delivery/csv-generator'); -async function runPipeline(sourceDirectory, format) { +async function runPipeline(sourceDirectory, format, options) { try { - const report = await generateAnalyticsReport(sourceDirectory); + const report = await generateAnalyticsReport(sourceDirectory, options); + const exportsDir = path.join(process.cwd(), 'data_exports'); + if (format === 'md' || format === 'markdown') { - const exportsDir = path.join(process.cwd(), 'data_exports'); const savedPath = await generateMarkdownReport(report, exportsDir); process.stdout.write(`✅ Markdown report successfully generated at:\n${savedPath}\n`); + } else if (format === 'csv') { + const savedPath = await generateCsvReport(report, exportsDir); + process.stdout.write(`✅ CSV report successfully generated at:\n${savedPath}\n`); } else { process.stdout.write(`${JSON.stringify(report, null, 2)}\n`); } @@ -22,42 +27,37 @@ async function runPipeline(sourceDirectory, format) { async function main() { const args = process.argv.slice(2); - // Parse flags const formatFlag = args.find(arg => arg.startsWith('--format=')); const format = formatFlag ? formatFlag.split('=')[1].toLowerCase() : 'json'; const isWatchMode = args.includes('--watch'); + const clearCache = args.includes('--clear-cache'); + + const workersFlag = args.find(arg => arg.startsWith('--workers=')); + const workers = workersFlag ? parseInt(workersFlag.split('=')[1], 10) : undefined; - // Parse target directory const sourceArg = args.find(arg => !arg.startsWith('--')); const sourceDirectory = sourceArg ? path.resolve(sourceArg) : process.cwd(); + const options = { clearCache, workers }; + if (isWatchMode) { process.stdout.write(`👀 Watching directory for changes: ${sourceDirectory}\n`); - - // Initialize OS Event Listener const watcher = chokidar.watch(sourceDirectory, { ignored: [/(^|[\/\\])\../, /node_modules/, /data_exports/], - persistent: true, - ignoreInitial: false + persistent: true, ignoreInitial: false }); - // Debounce logic to prevent CPU spikes on bulk file operations let timeout; const triggerPipeline = () => { clearTimeout(timeout); timeout = setTimeout(() => { process.stdout.write(`\n🔄 File system event detected. Recalculating analytics...\n`); - runPipeline(sourceDirectory, format); - }, 500); // 500ms buffer + runPipeline(sourceDirectory, format, options); + }, 500); }; - - // Bind events - watcher - .on('add', triggerPipeline) - .on('change', triggerPipeline) - .on('unlink', triggerPipeline); + watcher.on('add', triggerPipeline).on('change', triggerPipeline).on('unlink', triggerPipeline); } else { - await runPipeline(sourceDirectory, format); + await runPipeline(sourceDirectory, format, options); } } diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index afa1617..73d4b63 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -16,18 +16,35 @@ async function* walkFiles(rootDirectory) { } } -async function ingestDirectory(rootDirectory) { +async function ingestDirectory(rootDirectory, options = {}) { const sourceDirectory = path.resolve(rootDirectory); const files = []; const pathsToProcess = []; + // State Caching (Memoization) + const cachePath = path.join(process.cwd(), '.analytics_cache.json'); + let cache = {}; + if (!options.clearCache) { + try { + const cacheData = await fsp.readFile(cachePath, 'utf-8'); + cache = JSON.parse(cacheData); + } catch (err) { + cache = {}; + } + } + for await (const filePath of walkFiles(sourceDirectory)) { - pathsToProcess.push(filePath); + const stats = await fsp.stat(filePath); + const fingerprint = `${stats.size}-${stats.mtimeMs}`; // Size + Modified Time + + if (cache[filePath] && cache[filePath].fingerprint === fingerprint) { + files.push(cache[filePath].data); // Short-circuit bypass + } else { + pathsToProcess.push({ filePath, fingerprint }); + } } - // FIX 1: Cap workers to the number of files. - // Prevents spawning 15 massive threads to process 1 tiny test file. - const maxCores = Math.max(1, os.cpus().length - 1); + const maxCores = options.workers || Math.max(1, os.cpus().length - 1); const numWorkers = Math.min(pathsToProcess.length, maxCores); if (numWorkers === 0) { @@ -46,6 +63,7 @@ async function ingestDirectory(rootDirectory) { worker.on("message", (msg) => { if (msg.success && msg.result) { files.push(msg.result); + cache[msg.filePath] = { fingerprint: msg.fingerprint, data: msg.result }; } else if (!msg.success) { process.stderr.write(`\n⚠️ File failed (${msg.filePath}): ${msg.error}\n`); } @@ -54,19 +72,16 @@ async function ingestDirectory(rootDirectory) { worker.on("error", (err) => { process.stderr.write(`\n⚠️ Fatal Worker Crash: ${err.message}\n`); - // FIX 2: Await thread termination so it doesn't leave dangling memory leaks worker.terminate().then(resolve); }); function assignNextTask() { if (currentIndex >= pathsToProcess.length) { - // FIX 2: Await thread termination to clear the Node.js event loop worker.terminate().then(resolve); return; } - - const filePath = pathsToProcess[currentIndex++]; - worker.postMessage({ filePath, rootDirectory: sourceDirectory }); + const task = pathsToProcess[currentIndex++]; + worker.postMessage({ filePath: task.filePath, fingerprint: task.fingerprint, rootDirectory: sourceDirectory }); } assignNextTask(); @@ -74,12 +89,10 @@ async function ingestDirectory(rootDirectory) { }) ); - return { - sourceDirectory, - files, - }; + // Save newly parsed data back to .analytics_cache.json + await fsp.writeFile(cachePath, JSON.stringify(cache, null, 2)); + + return { sourceDirectory, files }; } -module.exports = { - ingestDirectory, -}; \ No newline at end of file +module.exports = { ingestDirectory }; \ No newline at end of file diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 88f4c49..a61cb89 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -1,124 +1,39 @@ -const { parentPort } = require("node:worker_threads"); -const fs = require("node:fs"); -const path = require("node:path"); -const readline = require("node:readline"); -const { promises: fsp } = require("node:fs"); -const nlp = require("compromise"); +const { parentPort } = require('node:worker_threads'); +const fs = require('node:fs/promises'); -// Protect the background V8 isolate from abrupt asynchronous library crashes -process.on("unhandledRejection", (reason) => { - parentPort.postMessage({ success: false, error: reason?.message || String(reason) }); -}); - -const TEXT_EXTENSIONS = new Set([".txt", ".md", ".json", ".csv", ".log"]); -const IMAGE_EXTENSIONS = new Set([".png", ".jpg", ".jpeg"]); -const SUPPORTED_EXTENSIONS = new Set([...TEXT_EXTENSIONS, ...IMAGE_EXTENSIONS, ".pdf"]); - -const STOP_WORDS = new Set([ - "the", "of", "to", "and", "in", "a", "for", "on", "that", "is", "it", - "with", "as", "was", "at", "by", "be", "this", "an", "are", "from", - "or", "which", "will", "not", "have", "has", "but", "they", "their", - "we", "you", "i", "he", "she", "my", "his", "her", "its", "our", "your", - "there", "can", "if", "would", "about", "who", "what", "where", "when", "how" -]); - -function normalizeWords(text) { - const rawWords = text.toLowerCase().match(/[a-z0-9']+/g) ?? []; - return rawWords.filter(word => !STOP_WORDS.has(word) && isNaN(word) && word.length > 1); -} - -function extractDates(text) { - const doc = nlp(text); - return [...new Set(doc.match("#Date").out("array"))]; -} - -function extractLocations(text) { - const doc = nlp(text); - const knownPlaces = doc.match("#Place").out("array"); - const contextualPlaces = doc.match("(in|at|near|location) #ProperNoun").not("(in|at|near|location)").out("array"); - return [...new Set([...knownPlaces, ...contextualPlaces])]; -} - -async function processTextData(text, words, dates, locations) { - if (!text) return; - words.push(...normalizeWords(text)); - extractDates(text).forEach(date => dates.add(date)); - extractLocations(text).forEach(loc => locations.add(loc)); -} - -async function readFileData(filePath, rootDirectory) { - const extension = path.extname(filePath).toLowerCase(); - if (!SUPPORTED_EXTENSIONS.has(extension)) return null; - - const stats = await fsp.stat(filePath); - const words = []; - const dates = new Set(); - const locations = new Set(); - let metadata = {}; - - if (TEXT_EXTENSIONS.has(extension)) { - const stream = fs.createReadStream(filePath, { encoding: "utf8" }); - const lineReader = readline.createInterface({ input: stream, crlfDelay: Infinity }); - for await (const line of lineReader) await processTextData(line, words, dates, locations); - stream.destroy(); - } else if (extension === ".pdf") { - const dataBuffer = await fsp.readFile(filePath); - let extractedText = ""; - - try { - const pdfParse = require("pdf-parse"); - const parseFn = typeof pdfParse === "function" ? pdfParse : pdfParse.default; - const pdfData = await parseFn(dataBuffer); - extractedText = pdfData.text || ""; - metadata = pdfData.info || {}; - } catch (err) { /* OCR Fallback fallback loop logic flags */ } - - if (extractedText.trim().length < 50) { - const tail = dataBuffer.toString("utf8", Math.max(0, dataBuffer.length - 1024)); - - if (tail.includes("%%EOF") || tail.includes("startxref")) { - try { - const mupdf = await import("mupdf"); - const tesseract = require("tesseract.js"); - - const doc = mupdf.Document.openDocument(dataBuffer, "application/pdf"); - let ocrText = ""; - for (let i = 0; i < doc.countPages(); i++) { - const page = doc.loadPage(i); - const pixmap = page.toPixmap(mupdf.Matrix.scale(2, 2), mupdf.ColorSpace.DeviceRGB, false); - const { data: { text } } = await tesseract.recognize(Buffer.from(pixmap.asPNG()), "eng", { logger: () => {} }); - ocrText += text + " "; - } - if (ocrText.trim().length > 0) extractedText = ocrText; - } catch (ocrError) { /* Fail safely over to parsed text metadata arrays */ } - } +parentPort.on('message', async (task) => { + try { + const content = await fs.readFile(task.filePath, 'utf-8'); + const stats = await fs.stat(task.filePath); + + const dates = []; + const locations = []; + const words = content.replace(/[^\w\s]/g, '').toLowerCase().split(/\s+/).filter(Boolean); + + const dateMatch = content.match(/Date:\s*(\d{4}-\d{2}-\d{2})/i); + if (dateMatch) dates.push(dateMatch[1]); + + const locMatch = content.match(/Location:\s*([A-Za-z]+)/i); + if (locMatch) { + // SPRINT 2 Task 1: Named Entity Token Unification (Force Title Case to prevent ROSWELL vs roswell fragmentation) + const loc = locMatch[1].charAt(0).toUpperCase() + locMatch[1].slice(1).toLowerCase(); + locations.push(loc); } - await processTextData(extractedText, words, dates, locations); - } else if (IMAGE_EXTENSIONS.has(extension)) { - const tesseract = require("tesseract.js"); - const { data: { text } } = await tesseract.recognize(filePath, "eng", { logger: () => {} }); - await processTextData(text, words, dates, locations); - } - - return { - path: filePath, - relativePath: path.relative(rootDirectory, filePath), - extension, - size: stats.size, - createdAt: stats.birthtime.toISOString(), - modifiedAt: stats.mtime.toISOString(), - words, - dates: [...dates], - locations: [...locations], - metadata, - }; -} -parentPort.on("message", async ({ filePath, rootDirectory }) => { - try { - const result = await readFileData(filePath, rootDirectory); - parentPort.postMessage({ success: true, result }); + parentPort.postMessage({ + success: true, + filePath: task.filePath, + fingerprint: task.fingerprint, + result: { + fileName: task.filePath.split(/[/\\]/).pop(), + modifiedAt: stats.mtime.toISOString(), + content, + words, + dates, + locations + } + }); } catch (error) { - parentPort.postMessage({ success: false, error: error.message, filePath }); + parentPort.postMessage({ success: false, filePath: task.filePath, error: error.message }); } }); \ No newline at end of file diff --git a/src/pipeline.js b/src/pipeline.js index 36b06db..837481c 100644 --- a/src/pipeline.js +++ b/src/pipeline.js @@ -4,8 +4,8 @@ const { buildDiagnosticAnalytics } = require('./analytics/diagnostic'); const { buildPredictiveAnalytics } = require('./analytics/predictive'); const { buildPrescriptiveAnalytics } = require('./analytics/prescriptive'); -async function generateAnalyticsReport(sourceDirectory) { - const ingestionResult = await ingestDirectory(sourceDirectory); +async function generateAnalyticsReport(sourceDirectory, options = {}) { + const ingestionResult = await ingestDirectory(sourceDirectory, options); const descriptive = buildDescriptiveAnalytics(ingestionResult.files); return { @@ -17,6 +17,4 @@ async function generateAnalyticsReport(sourceDirectory) { }; } -module.exports = { - generateAnalyticsReport -}; +module.exports = { generateAnalyticsReport }; \ No newline at end of file From 92db04edaf026e94d1932b909d1c6ec2f1fe83b6 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 14:44:26 -0400 Subject: [PATCH 02/15] feat: complete v1.2.0 pipeline (concurrency, memoization, tf-idf, and cross-linking) --- .analytics_cache.json | 358 ++++-------------------------------- src/analytics/diagnostic.js | 58 +++++- src/ingestion/worker.js | 17 +- 3 files changed, 102 insertions(+), 331 deletions(-) diff --git a/.analytics_cache.json b/.analytics_cache.json index f281e3d..08ba7a5 100644 --- a/.analytics_cache.json +++ b/.analytics_cache.json @@ -1,116 +1,25 @@ { - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-4PTpLX\\report-one.txt": { - "fingerprint": "90-1781375967879.2063", + "E:\\Repos\\UAP_AnalyticsBot\\test\\fixtures\\sample.txt": { + "fingerprint": "52-1781331615820.1423", "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:39:27.879Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "in", - "roswell", - "and", - "hovered", - "over", - "the", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-4PTpLX\\report-two.md": { - "fingerprint": "99-1781375967879.2063", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:39:27.879Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "fileName": "sample.txt", + "modifiedAt": "2026-06-13T06:20:15.820Z", + "content": "This is a test document about Phoenix and Roswell.\r\n", "words": [ - "date", - "20240214", - "location", - "phoenix", - "a", - "bright", - "triangle", - "was", - "reported", - "near", + "test", + "document", "phoenix", - "and", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-MPVYbs\\partial.txt": { - "fingerprint": "42-1781375967958.6162", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:39:27.959Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "a", - "location", - "marker" + "roswell" ], "dates": [], "locations": [] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-1SWPR4\\report-two.md": { - "fingerprint": "99-1781375967990.9768", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:39:27.991Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "a", - "bright", - "triangle", - "was", - "reported", - "near", - "phoenix", - "and", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-1SWPR4\\report-one.txt": { - "fingerprint": "90-1781375967989.9636", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vcrQ7p\\report-one.txt": { + "fingerprint": "90-1781376227653.1335", "data": { "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:39:27.990Z", + "modifiedAt": "2026-06-13T18:43:47.653Z", "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", "words": [ "date", @@ -119,12 +28,9 @@ "roswell", "lights", "appeared", - "in", "roswell", - "and", "hovered", "over", - "the", "desert" ], "dates": [ @@ -135,25 +41,22 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-G3uFYr\\report-two.md": { - "fingerprint": "99-1781375983792.8586", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vcrQ7p\\report-two.md": { + "fingerprint": "99-1781376227653.1335", "data": { "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:39:43.793Z", + "modifiedAt": "2026-06-13T18:43:47.653Z", "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", "words": [ "date", "20240214", "location", "phoenix", - "a", "bright", "triangle", - "was", "reported", "near", "phoenix", - "and", "hovered", "briefly" ], @@ -165,46 +68,16 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-G3uFYr\\report-one.txt": { - "fingerprint": "90-1781375983791.832", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:39:43.792Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "in", - "roswell", - "and", - "hovered", - "over", - "the", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-PQ06mp\\partial.txt": { - "fingerprint": "42-1781375983839.6965", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-UpOI2H\\partial.txt": { + "fingerprint": "42-1781376227697.1719", "data": { "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:39:43.840Z", + "modifiedAt": "2026-06-13T18:43:47.697Z", "content": "Strange humming without a location marker.", "words": [ "strange", "humming", "without", - "a", "location", "marker" ], @@ -212,70 +85,11 @@ "locations": [] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wBxL0I\\report-two.md": { - "fingerprint": "99-1781375983870.4006", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:39:43.870Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "a", - "bright", - "triangle", - "was", - "reported", - "near", - "phoenix", - "and", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wBxL0I\\report-one.txt": { - "fingerprint": "90-1781375983869.3723", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:39:43.869Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "in", - "roswell", - "and", - "hovered", - "over", - "the", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-pRueHf\\report-one.txt": { - "fingerprint": "90-1781376082977.1807", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-cznsVD\\report-one.txt": { + "fingerprint": "90-1781376227726.3447", "data": { "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:41:22.977Z", + "modifiedAt": "2026-06-13T18:43:47.726Z", "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", "words": [ "date", @@ -284,12 +98,9 @@ "roswell", "lights", "appeared", - "in", "roswell", - "and", "hovered", "over", - "the", "desert" ], "dates": [ @@ -300,73 +111,22 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-pRueHf\\report-two.md": { - "fingerprint": "99-1781376082978.2163", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:41:22.978Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "a", - "bright", - "triangle", - "was", - "reported", - "near", - "phoenix", - "and", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-5zyTja\\partial.txt": { - "fingerprint": "42-1781376083024.2827", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:41:23.024Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "a", - "location", - "marker" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vpSzbX\\report-two.md": { - "fingerprint": "99-1781376083054.3164", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-cznsVD\\report-two.md": { + "fingerprint": "99-1781376227726.3447", "data": { "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:41:23.054Z", + "modifiedAt": "2026-06-13T18:43:47.726Z", "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", "words": [ "date", "20240214", "location", "phoenix", - "a", "bright", "triangle", - "was", "reported", "near", "phoenix", - "and", "hovered", "briefly" ], @@ -378,40 +138,11 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vpSzbX\\report-one.txt": { - "fingerprint": "90-1781376083053.2993", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:41:23.053Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "in", - "roswell", - "and", - "hovered", - "over", - "the", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2X1I4F\\report-one.txt": { - "fingerprint": "90-1781376085242.656", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-TDb1na\\report-one.txt": { + "fingerprint": "90-1781376229347.5332", "data": { "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:41:25.243Z", + "modifiedAt": "2026-06-13T18:43:49.348Z", "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", "words": [ "date", @@ -420,12 +151,9 @@ "roswell", "lights", "appeared", - "in", "roswell", - "and", "hovered", "over", - "the", "desert" ], "dates": [ @@ -436,25 +164,22 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2X1I4F\\report-two.md": { - "fingerprint": "99-1781376085243.6675", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-TDb1na\\report-two.md": { + "fingerprint": "99-1781376229347.5332", "data": { "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:41:25.244Z", + "modifiedAt": "2026-06-13T18:43:49.348Z", "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", "words": [ "date", "20240214", "location", "phoenix", - "a", "bright", "triangle", - "was", "reported", "near", "phoenix", - "and", "hovered", "briefly" ], @@ -466,17 +191,16 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-Z1rQCb\\partial.txt": { - "fingerprint": "42-1781376085289.425", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-h9TQII\\partial.txt": { + "fingerprint": "42-1781376229393.0466", "data": { "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:41:25.289Z", + "modifiedAt": "2026-06-13T18:43:49.393Z", "content": "Strange humming without a location marker.", "words": [ "strange", "humming", "without", - "a", "location", "marker" ], @@ -484,11 +208,11 @@ "locations": [] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-qeRGkt\\report-one.txt": { - "fingerprint": "90-1781376085317", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wGRhnU\\report-one.txt": { + "fingerprint": "90-1781376229421.1965", "data": { "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:41:25.317Z", + "modifiedAt": "2026-06-13T18:43:49.421Z", "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", "words": [ "date", @@ -497,12 +221,9 @@ "roswell", "lights", "appeared", - "in", "roswell", - "and", "hovered", "over", - "the", "desert" ], "dates": [ @@ -513,25 +234,22 @@ ] } }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-qeRGkt\\report-two.md": { - "fingerprint": "99-1781376085318.0146", + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wGRhnU\\report-two.md": { + "fingerprint": "99-1781376229421.1965", "data": { "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:41:25.318Z", + "modifiedAt": "2026-06-13T18:43:49.421Z", "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", "words": [ "date", "20240214", "location", "phoenix", - "a", "bright", "triangle", - "was", "reported", "near", "phoenix", - "and", "hovered", "briefly" ], diff --git a/src/analytics/diagnostic.js b/src/analytics/diagnostic.js index d745b51..a3689d5 100644 --- a/src/analytics/diagnostic.js +++ b/src/analytics/diagnostic.js @@ -5,7 +5,6 @@ function incrementNestedCount(target, firstKey, secondKey, amount = 1) { function buildUsageRates(files, groupSelector) { const groupedCounts = {}; - for (const file of files) { const groups = groupSelector(file); if (!groups || groups.length === 0 || !file.words || file.words.length === 0) continue; @@ -17,7 +16,6 @@ function buildUsageRates(files, groupSelector) { } } } - return Object.fromEntries( Object.entries(groupedCounts).map(([group, counts]) => { const total = Object.values(counts).reduce((sum, count) => sum + count, 0) || 1; @@ -30,7 +28,24 @@ function buildUsageRates(files, groupSelector) { ); } -// ✨ TF-IDF Engine +// ✨ SPRINT 2 Task 2: Vector Cross-Referencing Math Engine +function calculateCosineSimilarity(vecA, vecB) { + let dotProduct = 0; + let normA = 0; + let normB = 0; + + for (const word in vecA) { + dotProduct += (vecA[word] || 0) * (vecB[word] || 0); + normA += Math.pow(vecA[word], 2); + } + for (const word in vecB) { + normB += Math.pow(vecB[word], 2); + } + + if (normA === 0 || normB === 0) return 0; + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); +} + function calculateTFIDF(files) { const fileCount = files.length; const documentFrequencies = {}; @@ -41,30 +56,55 @@ function calculateTFIDF(files) { uniqueWords.forEach(word => { documentFrequencies[word] = (documentFrequencies[word] || 0) + 1; }); }); - return files.map(file => { - if(!file.words) return { ...file, topKeywords: [] }; + // Pass 1: Build Multidimensional Vectors + const vectorizedFiles = files.map(file => { + if(!file.words) return { ...file, topKeywords: [], vector: {} }; const tf = {}; const totalWords = file.words.length; file.words.forEach(word => { tf[word] = (tf[word] || 0) + 1; }); + const vector = {}; const tfidf = Object.keys(tf).map(word => { const termFrequency = tf[word] / totalWords; const inverseDocumentFrequency = Math.log(fileCount / (1 + documentFrequencies[word])); - return { word, weight: termFrequency * inverseDocumentFrequency }; + const weight = termFrequency * inverseDocumentFrequency; + vector[word] = weight; + return { word, weight }; }).sort((a, b) => b.weight - a.weight); - return { ...file, topKeywords: tfidf.slice(0, 5).map(t => t.word) }; + return { ...file, topKeywords: tfidf.slice(0, 5).map(t => t.word), vector }; + }); + + // ✨ Pass 2: Semantic Cross-Linking Loop + return vectorizedFiles.map(fileA => { + const related = []; + vectorizedFiles.forEach(fileB => { + if (fileA.fileName !== fileB.fileName) { + const score = calculateCosineSimilarity(fileA.vector, fileB.vector); + if (score > 0.05) { // Threshold for correlation relevancy + related.push({ match: fileB.fileName, correlationScore: Number(score.toFixed(4)) }); + } + } + }); + + related.sort((a, b) => b.correlationScore - a.correlationScore); + + return { + fileName: fileA.fileName, + topKeywords: fileA.topKeywords, + relatedDocuments: related.slice(0, 3) // Return Top 3 Matches + }; }); } function buildDiagnosticAnalytics(files) { - const filesWithTFIDF = calculateTFIDF(files); + const tfIdfAnalysis = calculateTFIDF(files); return { wordUsageByDate: buildUsageRates(files, (file) => file.dates || []), wordUsageByLocation: buildUsageRates(files, (file) => file.locations || []), - tfIdfAnalysis: filesWithTFIDF.map(f => ({ file: f.fileName, topKeywords: f.topKeywords })) + semanticAnalysis: tfIdfAnalysis }; } diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index a61cb89..b36881b 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -1,6 +1,13 @@ const { parentPort } = require('node:worker_threads'); const fs = require('node:fs/promises'); +// ✨ Advanced Stop-Word Culling Dictionary +const STOP_WORDS = new Set([ + 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'how', 'i', 'in', 'is', 'it', 'of', 'on', 'or', 'that', 'the', 'this', + 'to', 'was', 'what', 'when', 'where', 'who', 'will', 'with' +]); + parentPort.on('message', async (task) => { try { const content = await fs.readFile(task.filePath, 'utf-8'); @@ -8,14 +15,20 @@ parentPort.on('message', async (task) => { const dates = []; const locations = []; - const words = content.replace(/[^\w\s]/g, '').toLowerCase().split(/\s+/).filter(Boolean); + + // Filter out punctuation, make lowercase, and cull stop words + const words = content + .replace(/[^\w\s]/g, '') + .toLowerCase() + .split(/\s+/) + .filter(word => word.length > 1 && !STOP_WORDS.has(word)); const dateMatch = content.match(/Date:\s*(\d{4}-\d{2}-\d{2})/i); if (dateMatch) dates.push(dateMatch[1]); const locMatch = content.match(/Location:\s*([A-Za-z]+)/i); if (locMatch) { - // SPRINT 2 Task 1: Named Entity Token Unification (Force Title Case to prevent ROSWELL vs roswell fragmentation) + // SPRINT 2 Task 1: Named Entity Token Unification const loc = locMatch[1].charAt(0).toUpperCase() + locMatch[1].slice(1).toLowerCase(); locations.push(loc); } From d6e9773de54568a2356442a7012e23e175e78691 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:07:14 -0400 Subject: [PATCH 03/15] docs: update readme usage flags and architectural pipeline notes - Added advanced CLI flags (--workers, --clear-cache, --format=csv) to the root README.md usage scope. - Updated docs/architecture.md to detail v1.2.0 pipeline enhancements, including multithreaded worker pool mechanics and semantic vector cross-linking via TF-IDF / Cosine Similarity. - Verified all documentation structures and ran local test runner pipelines cleanly. --- .analytics_cache.json | 246 ++++++++++++++++++++++++++++++++++++++++++ README.md | 12 +++ docs/architecture.md | 4 + 3 files changed, 262 insertions(+) diff --git a/.analytics_cache.json b/.analytics_cache.json index 08ba7a5..2d3234c 100644 --- a/.analytics_cache.json +++ b/.analytics_cache.json @@ -260,5 +260,251 @@ "Phoenix" ] } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-urUfyW\\report-two.md": { + "fingerprint": "99-1781377563388.1055", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T19:06:03.388Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "bright", + "triangle", + "reported", + "near", + "phoenix", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-urUfyW\\report-one.txt": { + "fingerprint": "90-1781377563387.093", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T19:06:03.387Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "roswell", + "hovered", + "over", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-rEZ5Us\\partial.txt": { + "fingerprint": "42-1781377563433.093", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T19:06:03.433Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-oN4YQs\\report-two.md": { + "fingerprint": "99-1781377563462.2896", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T19:06:03.462Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "bright", + "triangle", + "reported", + "near", + "phoenix", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-oN4YQs\\report-one.txt": { + "fingerprint": "90-1781377563461.2712", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T19:06:03.461Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "roswell", + "hovered", + "over", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2yhDJR\\report-two.md": { + "fingerprint": "99-1781377567282.88", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T19:06:07.283Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "bright", + "triangle", + "reported", + "near", + "phoenix", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2yhDJR\\report-one.txt": { + "fingerprint": "90-1781377567281.855", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T19:06:07.282Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "roswell", + "hovered", + "over", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-S8tdtP\\partial.txt": { + "fingerprint": "42-1781377567326.5771", + "data": { + "fileName": "partial.txt", + "modifiedAt": "2026-06-13T19:06:07.327Z", + "content": "Strange humming without a location marker.", + "words": [ + "strange", + "humming", + "without", + "location", + "marker" + ], + "dates": [], + "locations": [] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-8jHO3U\\report-one.txt": { + "fingerprint": "90-1781377567354.994", + "data": { + "fileName": "report-one.txt", + "modifiedAt": "2026-06-13T19:06:07.355Z", + "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", + "words": [ + "date", + "20240101", + "location", + "roswell", + "lights", + "appeared", + "roswell", + "hovered", + "over", + "desert" + ], + "dates": [ + "2024-01-01" + ], + "locations": [ + "Roswell" + ] + } + }, + "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-8jHO3U\\report-two.md": { + "fingerprint": "99-1781377567356.006", + "data": { + "fileName": "report-two.md", + "modifiedAt": "2026-06-13T19:06:07.356Z", + "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", + "words": [ + "date", + "20240214", + "location", + "phoenix", + "bright", + "triangle", + "reported", + "near", + "phoenix", + "hovered", + "briefly" + ], + "dates": [ + "2024-02-14" + ], + "locations": [ + "Phoenix" + ] + } } } \ No newline at end of file diff --git a/README.md b/README.md index b4a1862..0ae1f9a 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,18 @@ The bot must never modify, move, or delete ingested source files. Ingestion is r - When adding analytics, classify behavior under one of the four analytics tiers. - Update [docs/architecture.md](docs/architecture.md) when implementation changes affect current-vs-planned system boundaries. +## Usage + +### 🚀 Advanced Usage + +The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. You can control these via CLI arguments: + +* `node src/index.js ./my_folder --workers=4` : Manually set the number of WebAssembly worker threads (defaults to max CPU cores). +* `node src/index.js ./my_folder --clear-cache` : Bypasses the `.analytics_cache.json` file and forces a fresh read of all documents. +* `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file. + + + ## 🚀 Planned Technical Optimizations ### 1. Performance & Infrastructure diff --git a/docs/architecture.md b/docs/architecture.md index c79a4eb..43925b3 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -9,6 +9,10 @@ The repository currently ships a Node.js CLI-centered analytics flow: 3. **Analytics Pipeline (`src/pipeline.js`)** builds the descriptive, diagnostic, predictive, and prescriptive tiers from the ingested file set. 4. **Output Layer** returns a single structured JSON report for the requested directory. +### v1.2.0 Pipeline Architecture +* **Ingestion (Multithreaded):** Utilizes Node.js `worker_threads` and file-stat fingerprinting (`.analytics_cache.json`) to bypass redundant processing and drastically speed up execution. +* **Semantic Analytics:** Employs a TF-IDF weighting engine to filter generic stop-words and a Cosine Similarity math engine to automatically cluster related UAP documents based on vector distance. + ## Current Runtime Boundaries Implemented today: From 6a6de5957fb07b0ab5631feb82894e1ce3e446de Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:18:23 -0400 Subject: [PATCH 04/15] addition of usage section and formatting readme --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index 0ae1f9a..dc76347 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,40 @@ The bot must never modify, move, or delete ingested source files. Ingestion is r ## Usage + +To run the AnalyticsBot, simply pass the target directory containing your text files as the first argument: + +```bash +node src/index.js ./my_folder/ + +``` + +By default, this will parse the documents and output a formatted JSON report directly to your console. + +### 👀 Watch Mode + +Keep the pipeline running in the background. It will automatically re-analyze the documents and recalculate the math whenever you add, edit, or delete a file in the target directory: + +```bash +node src/index.js ./my_folder/ --watch + +``` + +### 🖨️ Report Generation + +Instead of dumping JSON directly to the console, you can generate formatted report files that are automatically saved to the `/data_exports/` directory: + +```bash +node src/index.js ./my_folder/ --format=md + +``` + +*(Supports `md` for Markdown or `csv` for spreadsheet datasets).* + + +--- +
+ ### 🚀 Advanced Usage The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. You can control these via CLI arguments: @@ -135,6 +169,8 @@ The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. * `node src/index.js ./my_folder --clear-cache` : Bypasses the `.analytics_cache.json` file and forces a fresh read of all documents. * `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file. +
+ ## 🚀 Planned Technical Optimizations From d6d335d9f1ec1f5b59bfc2a3ebcb913c8053c5b4 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:24:38 -0400 Subject: [PATCH 05/15] readme formatting --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/README.md b/README.md index dc76347..8639920 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,49 @@ The bot must never modify, move, or delete ingested source files. Ingestion is r - When adding analytics, classify behavior under one of the four analytics tiers. - Update [docs/architecture.md](docs/architecture.md) when implementation changes affect current-vs-planned system boundaries. + +
+ + + +## ⚙️ Installation & Setup + +**Prerequisites:** Ensure you have [Node.js](https://nodejs.org/) installed (version 18, 20, or 22+ recommended). + +1. **Clone the repository:** +```bash +git clone https://github.com/aj1126/uap_analyticsbot.git +cd uap_analyticsbot + +``` + + +2. **Install dependencies:** +Because this engine utilizes pre-compiled WebAssembly, there are no complex C++ build tools or `node-gyp` configurations required on Windows. Simply run: +```bash +npm install + +``` + + +3. **Verify the installation:** +Run the local test suite to ensure the multithreaded worker pool and caching engine are functioning correctly on your machine: +```bash +npm test + +``` + + +*(If all tests pass green, you are ready to start analyzing documents!)* + + +--- + +
+ + + + ## Usage @@ -170,6 +213,8 @@ The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. * `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file.
+
+
From d67202dc30a6836ac737cfe552c4bc2fbdd56c67 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:34:02 -0400 Subject: [PATCH 06/15] chore: remove analytics cache file --- .analytics_cache.json | 510 ------------------------------------------ 1 file changed, 510 deletions(-) delete mode 100644 .analytics_cache.json diff --git a/.analytics_cache.json b/.analytics_cache.json deleted file mode 100644 index 2d3234c..0000000 --- a/.analytics_cache.json +++ /dev/null @@ -1,510 +0,0 @@ -{ - "E:\\Repos\\UAP_AnalyticsBot\\test\\fixtures\\sample.txt": { - "fingerprint": "52-1781331615820.1423", - "data": { - "fileName": "sample.txt", - "modifiedAt": "2026-06-13T06:20:15.820Z", - "content": "This is a test document about Phoenix and Roswell.\r\n", - "words": [ - "test", - "document", - "phoenix", - "roswell" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vcrQ7p\\report-one.txt": { - "fingerprint": "90-1781376227653.1335", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:43:47.653Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-vcrQ7p\\report-two.md": { - "fingerprint": "99-1781376227653.1335", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:43:47.653Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-UpOI2H\\partial.txt": { - "fingerprint": "42-1781376227697.1719", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:43:47.697Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "location", - "marker" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-cznsVD\\report-one.txt": { - "fingerprint": "90-1781376227726.3447", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:43:47.726Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-cznsVD\\report-two.md": { - "fingerprint": "99-1781376227726.3447", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:43:47.726Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-TDb1na\\report-one.txt": { - "fingerprint": "90-1781376229347.5332", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:43:49.348Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-TDb1na\\report-two.md": { - "fingerprint": "99-1781376229347.5332", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:43:49.348Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-h9TQII\\partial.txt": { - "fingerprint": "42-1781376229393.0466", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T18:43:49.393Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "location", - "marker" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wGRhnU\\report-one.txt": { - "fingerprint": "90-1781376229421.1965", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T18:43:49.421Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-wGRhnU\\report-two.md": { - "fingerprint": "99-1781376229421.1965", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T18:43:49.421Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-urUfyW\\report-two.md": { - "fingerprint": "99-1781377563388.1055", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T19:06:03.388Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-urUfyW\\report-one.txt": { - "fingerprint": "90-1781377563387.093", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T19:06:03.387Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-rEZ5Us\\partial.txt": { - "fingerprint": "42-1781377563433.093", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T19:06:03.433Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "location", - "marker" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-oN4YQs\\report-two.md": { - "fingerprint": "99-1781377563462.2896", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T19:06:03.462Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-oN4YQs\\report-one.txt": { - "fingerprint": "90-1781377563461.2712", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T19:06:03.461Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2yhDJR\\report-two.md": { - "fingerprint": "99-1781377567282.88", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T19:06:07.283Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-2yhDJR\\report-one.txt": { - "fingerprint": "90-1781377567281.855", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T19:06:07.282Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-S8tdtP\\partial.txt": { - "fingerprint": "42-1781377567326.5771", - "data": { - "fileName": "partial.txt", - "modifiedAt": "2026-06-13T19:06:07.327Z", - "content": "Strange humming without a location marker.", - "words": [ - "strange", - "humming", - "without", - "location", - "marker" - ], - "dates": [], - "locations": [] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-8jHO3U\\report-one.txt": { - "fingerprint": "90-1781377567354.994", - "data": { - "fileName": "report-one.txt", - "modifiedAt": "2026-06-13T19:06:07.355Z", - "content": "Date: 2024-01-01\nLOCATION: Roswell\nLights appeared in Roswell and hovered over the desert.", - "words": [ - "date", - "20240101", - "location", - "roswell", - "lights", - "appeared", - "roswell", - "hovered", - "over", - "desert" - ], - "dates": [ - "2024-01-01" - ], - "locations": [ - "Roswell" - ] - } - }, - "C:\\Users\\ajjuk\\AppData\\Local\\Temp\\uap-analytics-8jHO3U\\report-two.md": { - "fingerprint": "99-1781377567356.006", - "data": { - "fileName": "report-two.md", - "modifiedAt": "2026-06-13T19:06:07.356Z", - "content": "Date: 2024-02-14\nLocation: Phoenix\nA bright triangle was reported NEAR Phoenix and hovered briefly.", - "words": [ - "date", - "20240214", - "location", - "phoenix", - "bright", - "triangle", - "reported", - "near", - "phoenix", - "hovered", - "briefly" - ], - "dates": [ - "2024-02-14" - ], - "locations": [ - "Phoenix" - ] - } - } -} \ No newline at end of file From dbf1ec8bf9892d642560f52f6ec52233b938320f Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:34:09 -0400 Subject: [PATCH 07/15] chore: add .analytics_cache.json to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 84545d8..8f86baa 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ Thumbs.db .env.local copilot-chat-history.json *.traineddata +.analytics_cache.json # ========================= # Bot Specific: Data & Media From f65e566b36230b6caad06f64b1280379225d5e7c Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 15:38:55 -0400 Subject: [PATCH 08/15] fix: validate workers argument and handle symlinks in file ingestion Co-authored-by: Copilot --- src/index.js | 10 +++++++++- src/ingestion/file-ingestion.js | 13 ++++++++++++- src/ingestion/worker.js | 17 +++++++++++------ test/pipeline.test.js | 6 ++---- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/index.js b/src/index.js index 930b90a..a3a9ead 100644 --- a/src/index.js +++ b/src/index.js @@ -33,7 +33,15 @@ async function main() { const clearCache = args.includes('--clear-cache'); const workersFlag = args.find(arg => arg.startsWith('--workers=')); - const workers = workersFlag ? parseInt(workersFlag.split('=')[1], 10) : undefined; + let workers; + if (workersFlag) { + const parsed = parseInt(workersFlag.split('=')[1], 10); + if (Number.isNaN(parsed) || parsed < 1) { + process.stderr.write(`⚠️ Invalid --workers value. Must be a positive integer. Defaulting to CPU count.\n`); + } else { + workers = parsed; + } + } const sourceArg = args.find(arg => !arg.startsWith('--')); const sourceDirectory = sourceArg ? path.resolve(sourceArg) : process.cwd(); diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index 73d4b63..aab77e6 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -8,7 +8,9 @@ async function* walkFiles(rootDirectory) { for (const entry of directoryEntries) { const absolutePath = path.join(rootDirectory, entry.name); - if (entry.isDirectory()) { + if (entry.isSymbolicLink()) { + continue; // Skip symlinks to prevent traversal outside the source directory + } else if (entry.isDirectory()) { yield* walkFiles(absolutePath); } else if (entry.isFile()) { yield absolutePath; @@ -33,7 +35,9 @@ async function ingestDirectory(rootDirectory, options = {}) { } } + const visitedPaths = new Set(); for await (const filePath of walkFiles(sourceDirectory)) { + visitedPaths.add(filePath); const stats = await fsp.stat(filePath); const fingerprint = `${stats.size}-${stats.mtimeMs}`; // Size + Modified Time @@ -44,6 +48,13 @@ async function ingestDirectory(rootDirectory, options = {}) { } } + // Evict stale cache keys scoped to this sourceDirectory + for (const key of Object.keys(cache)) { + if (key.startsWith(sourceDirectory) && !visitedPaths.has(key)) { + delete cache[key]; + } + } + const maxCores = options.workers || Math.max(1, os.cpus().length - 1); const numWorkers = Math.min(pathsToProcess.length, maxCores); diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index b36881b..2363c1d 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -1,3 +1,4 @@ +const path = require('node:path'); const { parentPort } = require('node:worker_threads'); const fs = require('node:fs/promises'); @@ -15,7 +16,7 @@ parentPort.on('message', async (task) => { const dates = []; const locations = []; - + // Filter out punctuation, make lowercase, and cull stop words const words = content .replace(/[^\w\s]/g, '') @@ -23,13 +24,14 @@ parentPort.on('message', async (task) => { .split(/\s+/) .filter(word => word.length > 1 && !STOP_WORDS.has(word)); - const dateMatch = content.match(/Date:\s*(\d{4}-\d{2}-\d{2})/i); - if (dateMatch) dates.push(dateMatch[1]); + // Extract all dates and locations (not just the first occurrence) + for (const match of content.matchAll(/Date:\s*(\d{4}-\d{2}-\d{2})/gi)) { + dates.push(match[1]); + } - const locMatch = content.match(/Location:\s*([A-Za-z]+)/i); - if (locMatch) { + for (const match of content.matchAll(/Location:\s*([A-Za-z]+)/gi)) { // SPRINT 2 Task 1: Named Entity Token Unification - const loc = locMatch[1].charAt(0).toUpperCase() + locMatch[1].slice(1).toLowerCase(); + const loc = match[1].charAt(0).toUpperCase() + match[1].slice(1).toLowerCase(); locations.push(loc); } @@ -39,6 +41,9 @@ parentPort.on('message', async (task) => { fingerprint: task.fingerprint, result: { fileName: task.filePath.split(/[/\\]/).pop(), + relativePath: task.rootDirectory ? path.relative(task.rootDirectory, task.filePath) : task.filePath, + extension: path.extname(task.filePath).toLowerCase(), + size: stats.size, modifiedAt: stats.mtime.toISOString(), content, words, diff --git a/test/pipeline.test.js b/test/pipeline.test.js index fb5e54f..5a1b801 100644 --- a/test/pipeline.test.js +++ b/test/pipeline.test.js @@ -65,7 +65,7 @@ test('generateAnalyticsReport flags files with missing metadata for prescriptive } }); -test('generateAnalyticsReport builds all analytics tiers from text files', async () => { +test('generateAnalyticsReport builds all analytics tiers from text files (descriptive dates path)', async () => { const fixtureRoot = await createFixtureDirectory(); try { @@ -74,9 +74,7 @@ test('generateAnalyticsReport builds all analytics tiers from text files', async assert.equal(report.descriptive.fileCount, 2); assert.deepEqual(report.descriptive.locations, ['Phoenix', 'Roswell']); - // Use an OR condition to support both object paths during transition - const dates = report.descriptive.dates || report.dates; - assert.deepEqual(dates, ['2024-01-01', '2024-02-14']); + assert.deepEqual(report.descriptive.dates, ['2024-01-01', '2024-02-14']); assert.ok(report.descriptive.wordFrequency.location >= 2); assert.ok(report.diagnostic.wordUsageByLocation.Roswell.length > 0); From 935e46f3482bdcd37d8b93e23b23e0f6703715ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:36:34 +0000 Subject: [PATCH 09/15] fix: address PR review security and concurrency feedback --- README.md | 4 ++-- src/analytics/diagnostic.js | 39 +++++++++++++++++++-------------- src/delivery/csv-generator.js | 22 ++++++++++++++----- src/index.js | 3 ++- src/ingestion/file-ingestion.js | 21 ++++++++++++++++-- src/ingestion/worker.js | 15 +++++++++++-- 6 files changed, 75 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 8639920..7ea0ef8 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ cd uap_analyticsbot 2. **Install dependencies:** -Because this engine utilizes pre-compiled WebAssembly, there are no complex C++ build tools or `node-gyp` configurations required on Windows. Simply run: +Because this engine runs as standard Node.js JavaScript (including `worker_threads` for parallelism), there are no complex C++ build tools or `node-gyp` configurations required on Windows. Simply run: ```bash npm install @@ -208,7 +208,7 @@ node src/index.js ./my_folder/ --format=md The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. You can control these via CLI arguments: -* `node src/index.js ./my_folder --workers=4` : Manually set the number of WebAssembly worker threads (defaults to max CPU cores). +* `node src/index.js ./my_folder --workers=4` : Manually set the number of Node.js worker threads (defaults to max CPU cores). * `node src/index.js ./my_folder --clear-cache` : Bypasses the `.analytics_cache.json` file and forces a fresh read of all documents. * `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file. diff --git a/src/analytics/diagnostic.js b/src/analytics/diagnostic.js index a3689d5..b177a54 100644 --- a/src/analytics/diagnostic.js +++ b/src/analytics/diagnostic.js @@ -77,23 +77,30 @@ function calculateTFIDF(files) { }); // ✨ Pass 2: Semantic Cross-Linking Loop - return vectorizedFiles.map(fileA => { - const related = []; - vectorizedFiles.forEach(fileB => { - if (fileA.fileName !== fileB.fileName) { - const score = calculateCosineSimilarity(fileA.vector, fileB.vector); - if (score > 0.05) { // Threshold for correlation relevancy - related.push({ match: fileB.fileName, correlationScore: Number(score.toFixed(4)) }); - } + const relatedByIndex = Array.from({ length: vectorizedFiles.length }, () => []); + for (let indexA = 0; indexA < vectorizedFiles.length; indexA += 1) { + for (let indexB = indexA + 1; indexB < vectorizedFiles.length; indexB += 1) { + const fileA = vectorizedFiles[indexA]; + const fileB = vectorizedFiles[indexB]; + const score = calculateCosineSimilarity(fileA.vector, fileB.vector); + + if (score > 0.05) { + const correlationScore = Number(score.toFixed(4)); + relatedByIndex[indexA].push({ match: fileB.fileName, correlationScore }); + relatedByIndex[indexB].push({ match: fileA.fileName, correlationScore }); } - }); - - related.sort((a, b) => b.correlationScore - a.correlationScore); - - return { - fileName: fileA.fileName, - topKeywords: fileA.topKeywords, - relatedDocuments: related.slice(0, 3) // Return Top 3 Matches + } + } + + return vectorizedFiles.map((file, index) => { + const related = relatedByIndex[index] + .sort((left, right) => right.correlationScore - left.correlationScore) + .slice(0, 3); + + return { + fileName: file.fileName, + topKeywords: file.topKeywords, + relatedDocuments: related // Return Top 3 Matches }; }); } diff --git a/src/delivery/csv-generator.js b/src/delivery/csv-generator.js index 9ba2c7a..c44c0f7 100644 --- a/src/delivery/csv-generator.js +++ b/src/delivery/csv-generator.js @@ -1,22 +1,32 @@ const fs = require('node:fs/promises'); const path = require('node:path'); +function escapeCsvCell(value) { + const normalized = value == null ? '' : String(value); + const formulaSafe = /^[=+\-@]/.test(normalized) ? `'${normalized}` : normalized; + return `"${formulaSafe.replace(/"/g, '""')}"`; +} + +function buildCsvRow(cells) { + return `${cells.map(escapeCsvCell).join(',')}\n`; +} + async function generateCsvReport(report, exportsDir) { await fs.mkdir(exportsDir, { recursive: true }); const csvPath = path.join(exportsDir, `report-${Date.now()}.csv`); - let csvContent = "Category,Metric,Value\n"; - csvContent += `Descriptive,FileCount,${report.descriptive.fileCount}\n`; + let csvContent = buildCsvRow(['Category', 'Metric', 'Value']); + csvContent += buildCsvRow(['Descriptive', 'FileCount', report.descriptive.fileCount]); const locations = report.descriptive.locations || report.locations || []; - csvContent += `Descriptive,UniqueLocations,"${locations.join(', ')}"\n`; + csvContent += buildCsvRow(['Descriptive', 'UniqueLocations', locations.join(', ')]); if (report.predictive?.locationClusterForecast) { - csvContent += `Predictive,LikelyNextHotspot,${report.predictive.locationClusterForecast.likelyNextHotspot}\n`; + csvContent += buildCsvRow(['Predictive', 'LikelyNextHotspot', report.predictive.locationClusterForecast.likelyNextHotspot]); } if (report.predictive?.keywordFrequencyForecast) { - csvContent += `Predictive,ForecastMonth,${report.predictive.keywordFrequencyForecast.forecastMonth}\n`; - csvContent += `Predictive,ForecastWordCount,${report.predictive.keywordFrequencyForecast.forecastWordCount}\n`; + csvContent += buildCsvRow(['Predictive', 'ForecastMonth', report.predictive.keywordFrequencyForecast.forecastMonth]); + csvContent += buildCsvRow(['Predictive', 'ForecastWordCount', report.predictive.keywordFrequencyForecast.forecastWordCount]); } await fs.writeFile(csvPath, csvContent, 'utf-8'); diff --git a/src/index.js b/src/index.js index a3a9ead..a42454b 100644 --- a/src/index.js +++ b/src/index.js @@ -56,11 +56,12 @@ async function main() { }); let timeout; + let pipelineQueue = Promise.resolve(); const triggerPipeline = () => { clearTimeout(timeout); timeout = setTimeout(() => { process.stdout.write(`\n🔄 File system event detected. Recalculating analytics...\n`); - runPipeline(sourceDirectory, format, options); + pipelineQueue = pipelineQueue.then(() => runPipeline(sourceDirectory, format, options)); }, 500); }; watcher.on('add', triggerPipeline).on('change', triggerPipeline).on('unlink', triggerPipeline); diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index aab77e6..f3b4a81 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -3,6 +3,13 @@ const os = require("node:os"); const { promises: fsp } = require("node:fs"); const { Worker } = require("node:worker_threads"); +const CACHE_VERSION = 1; +const SUPPORTED_TEXT_EXTENSIONS = new Set([".txt", ".md", ".json", ".csv", ".log"]); + +function isSupportedTextFile(filePath) { + return SUPPORTED_TEXT_EXTENSIONS.has(path.extname(filePath).toLowerCase()); +} + async function* walkFiles(rootDirectory) { const directoryEntries = await fsp.readdir(rootDirectory, { withFileTypes: true }); @@ -29,7 +36,10 @@ async function ingestDirectory(rootDirectory, options = {}) { if (!options.clearCache) { try { const cacheData = await fsp.readFile(cachePath, 'utf-8'); - cache = JSON.parse(cacheData); + const parsedCache = JSON.parse(cacheData); + if (parsedCache && parsedCache.version === CACHE_VERSION && parsedCache.entries && typeof parsedCache.entries === 'object') { + cache = parsedCache.entries; + } } catch (err) { cache = {}; } @@ -37,6 +47,10 @@ async function ingestDirectory(rootDirectory, options = {}) { const visitedPaths = new Set(); for await (const filePath of walkFiles(sourceDirectory)) { + if (!isSupportedTextFile(filePath)) { + continue; + } + visitedPaths.add(filePath); const stats = await fsp.stat(filePath); const fingerprint = `${stats.size}-${stats.mtimeMs}`; // Size + Modified Time @@ -101,7 +115,10 @@ async function ingestDirectory(rootDirectory, options = {}) { ); // Save newly parsed data back to .analytics_cache.json - await fsp.writeFile(cachePath, JSON.stringify(cache, null, 2)); + const cachePayload = JSON.stringify({ version: CACHE_VERSION, entries: cache }, null, 2); + const tempCachePath = `${cachePath}.${process.pid}.${Date.now()}.tmp`; + await fsp.writeFile(tempCachePath, cachePayload, 'utf-8'); + await fsp.rename(tempCachePath, cachePath); return { sourceDirectory, files }; } diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 2363c1d..da878a6 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -2,6 +2,8 @@ const path = require('node:path'); const { parentPort } = require('node:worker_threads'); const fs = require('node:fs/promises'); +const SUPPORTED_TEXT_EXTENSIONS = new Set(['.txt', '.md', '.json', '.csv', '.log']); + // ✨ Advanced Stop-Word Culling Dictionary const STOP_WORDS = new Set([ 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', @@ -11,6 +13,16 @@ const STOP_WORDS = new Set([ parentPort.on('message', async (task) => { try { + const extension = path.extname(task.filePath).toLowerCase(); + if (!SUPPORTED_TEXT_EXTENSIONS.has(extension)) { + parentPort.postMessage({ + success: true, + filePath: task.filePath, + fingerprint: task.fingerprint + }); + return; + } + const content = await fs.readFile(task.filePath, 'utf-8'); const stats = await fs.stat(task.filePath); @@ -42,10 +54,9 @@ parentPort.on('message', async (task) => { result: { fileName: task.filePath.split(/[/\\]/).pop(), relativePath: task.rootDirectory ? path.relative(task.rootDirectory, task.filePath) : task.filePath, - extension: path.extname(task.filePath).toLowerCase(), + extension, size: stats.size, modifiedAt: stats.mtime.toISOString(), - content, words, dates, locations From 5080e4334077bc91f58c271a17acfc2c6c981ad3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:38:00 +0000 Subject: [PATCH 10/15] chore: add explicit workflow token permissions --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fb745c4..c136cfa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,6 +9,8 @@ on: jobs: test: runs-on: ubuntu-latest + permissions: + contents: read strategy: matrix: node-version: [18.x, 20.x, 22.x] From b2d75951fe8be33ec84d613700de1c6a65bed879 Mon Sep 17 00:00:00 2001 From: Albert Edward Jukes III Date: Sat, 13 Jun 2026 19:48:14 -0400 Subject: [PATCH 11/15] fix(analytics): base forecast on doc dates and optimize worker IPC memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fixes domain logic in predictive analytics by switching the timeline basis from the OS file modification time to actual parsed document dates. This prevents modern download timestamps from invalidating historical UAP forecasting. - Resolves worker IPC performance bottlenecks by calculating word frequencies directly inside the worker thread pool rather than passing massive raw string arrays across the boundary. - Mitigates main-thread blocking in diagnostic analytics by capping the O(N²) TF-IDF cosine similarity matrix calculations to a maximum of 500 files. - Adds backwards-compatibility layers in descriptive and diagnostic modules to gracefully handle legacy cache formats without crashing. - Refines watch mode path exclusions in the index file to use a strictly scoped regex for the data exports directory. --- src/analytics/descriptive.js | 41 +++++++++++++--------- src/analytics/diagnostic.js | 68 +++++++++++++++++++++--------------- src/analytics/predictive.js | 9 +++-- src/index.js | 6 ++-- src/ingestion/worker.js | 29 +++++++++++---- 5 files changed, 98 insertions(+), 55 deletions(-) diff --git a/src/analytics/descriptive.js b/src/analytics/descriptive.js index 111739e..60ed4ff 100644 --- a/src/analytics/descriptive.js +++ b/src/analytics/descriptive.js @@ -1,34 +1,43 @@ -function countBy(items) { - return items.reduce((counts, item) => { - counts[item] = (counts[item] ?? 0) + 1; - return counts; - }, {}); -} - function sortEntriesDescending(record) { return Object.entries(record).sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])); } function buildDescriptiveAnalytics(files) { - const allWords = files.flatMap((file) => file.words); - const allDates = files.flatMap((file) => file.dates); - const allLocations = files.flatMap((file) => file.locations); + const allDates = files.flatMap((file) => file.dates || []); + const allLocations = files.flatMap((file) => file.locations || []); + + const globalWordFrequency = {}; + const glossarySet = new Set(); - const wordFrequency = countBy(allWords); + // Iterate through files using the new memory-efficient object format + files.forEach((file) => { + if (file.wordFrequency) { + for (const [word, count] of Object.entries(file.wordFrequency)) { + globalWordFrequency[word] = (globalWordFrequency[word] || 0) + count; + glossarySet.add(word); + } + } else if (file.words) { + // Backwards compatibility layer + for (const word of file.words) { + globalWordFrequency[word] = (globalWordFrequency[word] || 0) + 1; + glossarySet.add(word); + } + } + }); return { fileCount: files.length, - glossary: [...new Set(allWords)].sort(), - wordFrequency, - topWords: sortEntriesDescending(wordFrequency).slice(0, 10).map(([word, count]) => ({ word, count })), + glossary: [...glossarySet].sort(), + wordFrequency: globalWordFrequency, + topWords: sortEntriesDescending(globalWordFrequency).slice(0, 10).map(([word, count]) => ({ word, count })), dates: [...new Set(allDates)].sort(), locations: [...new Set(allLocations)].sort(), files: files.map((file) => ({ path: file.relativePath, - extension: file.extension, // <-- FIX: Added extension propagation + extension: file.extension, size: file.size, modifiedAt: file.modifiedAt, - wordCount: file.words.length, + wordCount: file.totalWords || (file.words ? file.words.length : 0), dates: file.dates, locations: file.locations, metadata: file.metadata || {} diff --git a/src/analytics/diagnostic.js b/src/analytics/diagnostic.js index a3689d5..5288e80 100644 --- a/src/analytics/diagnostic.js +++ b/src/analytics/diagnostic.js @@ -7,9 +7,10 @@ function buildUsageRates(files, groupSelector) { const groupedCounts = {}; for (const file of files) { const groups = groupSelector(file); - if (!groups || groups.length === 0 || !file.words || file.words.length === 0) continue; + const uniqueWords = file.uniqueWords || (file.words ? [...new Set(file.words)] : []); + + if (!groups || groups.length === 0 || uniqueWords.length === 0) continue; - const uniqueWords = new Set(file.words); for (const group of groups) { for (const word of uniqueWords) { incrementNestedCount(groupedCounts, group, word); @@ -28,7 +29,6 @@ function buildUsageRates(files, groupSelector) { ); } -// ✨ SPRINT 2 Task 2: Vector Cross-Referencing Math Engine function calculateCosineSimilarity(vecA, vecB) { let dotProduct = 0; let normA = 0; @@ -51,18 +51,20 @@ function calculateTFIDF(files) { const documentFrequencies = {}; files.forEach(file => { - if(!file.words) return; - const uniqueWords = new Set(file.words.map(w => w.toLowerCase())); - uniqueWords.forEach(word => { documentFrequencies[word] = (documentFrequencies[word] || 0) + 1; }); + const unique = file.uniqueWords || (file.words ? [...new Set(file.words)] : []); + unique.forEach(word => { documentFrequencies[word] = (documentFrequencies[word] || 0) + 1; }); }); // Pass 1: Build Multidimensional Vectors const vectorizedFiles = files.map(file => { - if(!file.words) return { ...file, topKeywords: [], vector: {} }; - const tf = {}; - const totalWords = file.words.length; - - file.words.forEach(word => { tf[word] = (tf[word] || 0) + 1; }); + let tf = file.wordFrequency || {}; + let totalWords = file.totalWords || 1; + + // Backwards compatibility layer for un-cleared caches + if (!file.wordFrequency && file.words) { + file.words.forEach(word => { tf[word] = (tf[word] || 0) + 1; }); + totalWords = file.words.length || 1; + } const vector = {}; const tfidf = Object.keys(tf).map(word => { @@ -76,24 +78,34 @@ function calculateTFIDF(files) { return { ...file, topKeywords: tfidf.slice(0, 5).map(t => t.word), vector }; }); - // ✨ Pass 2: Semantic Cross-Linking Loop - return vectorizedFiles.map(fileA => { - const related = []; - vectorizedFiles.forEach(fileB => { - if (fileA.fileName !== fileB.fileName) { - const score = calculateCosineSimilarity(fileA.vector, fileB.vector); - if (score > 0.05) { // Threshold for correlation relevancy - related.push({ match: fileB.fileName, correlationScore: Number(score.toFixed(4)) }); - } + // ✨ Pass 2: Semantic Cross-Linking Loop (🚀 Optimised to prevent thread blocking) + const MAX_CROSS_REF = 500; + const targetFiles = vectorizedFiles.slice(0, MAX_CROSS_REF); + const relatedByIndex = Array.from({ length: vectorizedFiles.length }, () => []); + + for (let indexA = 0; indexA < targetFiles.length; indexA += 1) { + for (let indexB = indexA + 1; indexB < targetFiles.length; indexB += 1) { + const fileA = targetFiles[indexA]; + const fileB = targetFiles[indexB]; + const score = calculateCosineSimilarity(fileA.vector, fileB.vector); + + if (score > 0.05) { + const correlationScore = Number(score.toFixed(4)); + relatedByIndex[indexA].push({ match: fileB.fileName, correlationScore }); + relatedByIndex[indexB].push({ match: fileA.fileName, correlationScore }); } - }); - - related.sort((a, b) => b.correlationScore - a.correlationScore); - - return { - fileName: fileA.fileName, - topKeywords: fileA.topKeywords, - relatedDocuments: related.slice(0, 3) // Return Top 3 Matches + } + } + + return vectorizedFiles.map((file, index) => { + const related = relatedByIndex[index] + ? relatedByIndex[index].sort((left, right) => right.correlationScore - left.correlationScore).slice(0, 3) + : []; + + return { + fileName: file.fileName, + topKeywords: file.topKeywords, + relatedDocuments: related }; }); } diff --git a/src/analytics/predictive.js b/src/analytics/predictive.js index 7cd1d31..bacfe5d 100644 --- a/src/analytics/predictive.js +++ b/src/analytics/predictive.js @@ -48,11 +48,14 @@ function fillEmptyIntervals(orderedMonths, timeline) { function buildKeywordSeries(files) { const timeline = {}; for (const file of files) { - if(!file.modifiedAt) continue; - const key = monthKey(file.modifiedAt); + // 🚨 FIX: Extract historical dates first, fallback to OS modification if none exist + const documentDate = (file.dates && file.dates.length > 0) ? file.dates[0] : file.modifiedAt; + if (!documentDate) continue; + + const key = monthKey(documentDate); if (!timeline[key]) timeline[key] = { totalWords: 0, locations: {} }; - timeline[key].totalWords += (file.words || []).length; + timeline[key].totalWords += file.totalWords || (file.words || []).length; for (const location of (file.locations || [])) { timeline[key].locations[location] = (timeline[key].locations[location] ?? 0) + 1; } diff --git a/src/index.js b/src/index.js index a3a9ead..6035dae 100644 --- a/src/index.js +++ b/src/index.js @@ -48,14 +48,16 @@ async function main() { const options = { clearCache, workers }; - if (isWatchMode) { +if (isWatchMode) { process.stdout.write(`👀 Watching directory for changes: ${sourceDirectory}\n`); const watcher = chokidar.watch(sourceDirectory, { - ignored: [/(^|[\/\\])\../, /node_modules/, /data_exports/], + // 🧹 Minor polish: Restrict ignored regex to strict ends-with path + ignored: [/(^|[\/\\])\../, /node_modules/, /data_exports[\/\\]?$/], persistent: true, ignoreInitial: false }); let timeout; + let pipelineQueue = Promise.resolve(); const triggerPipeline = () => { clearTimeout(timeout); timeout = setTimeout(() => { diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 2363c1d..18cd3f3 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -2,6 +2,8 @@ const path = require('node:path'); const { parentPort } = require('node:worker_threads'); const fs = require('node:fs/promises'); +const SUPPORTED_TEXT_EXTENSIONS = new Set(['.txt', '.md', '.json', '.csv', '.log']); + // ✨ Advanced Stop-Word Culling Dictionary const STOP_WORDS = new Set([ 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', @@ -11,6 +13,16 @@ const STOP_WORDS = new Set([ parentPort.on('message', async (task) => { try { + const extension = path.extname(task.filePath).toLowerCase(); + if (!SUPPORTED_TEXT_EXTENSIONS.has(extension)) { + parentPort.postMessage({ + success: true, + filePath: task.filePath, + fingerprint: task.fingerprint + }); + return; + } + const content = await fs.readFile(task.filePath, 'utf-8'); const stats = await fs.stat(task.filePath); @@ -18,19 +30,23 @@ parentPort.on('message', async (task) => { const locations = []; // Filter out punctuation, make lowercase, and cull stop words - const words = content + const rawWords = content .replace(/[^\w\s]/g, '') .toLowerCase() .split(/\s+/) .filter(word => word.length > 1 && !STOP_WORDS.has(word)); - // Extract all dates and locations (not just the first occurrence) + // 🚀 OPTIMIZATION: Calculate map inside worker to drastically reduce IPC channel memory usage + const wordFrequency = {}; + for(const word of rawWords) { + wordFrequency[word] = (wordFrequency[word] || 0) + 1; + } + for (const match of content.matchAll(/Date:\s*(\d{4}-\d{2}-\d{2})/gi)) { dates.push(match[1]); } for (const match of content.matchAll(/Location:\s*([A-Za-z]+)/gi)) { - // SPRINT 2 Task 1: Named Entity Token Unification const loc = match[1].charAt(0).toUpperCase() + match[1].slice(1).toLowerCase(); locations.push(loc); } @@ -42,11 +58,12 @@ parentPort.on('message', async (task) => { result: { fileName: task.filePath.split(/[/\\]/).pop(), relativePath: task.rootDirectory ? path.relative(task.rootDirectory, task.filePath) : task.filePath, - extension: path.extname(task.filePath).toLowerCase(), + extension, size: stats.size, modifiedAt: stats.mtime.toISOString(), - content, - words, + wordFrequency, + totalWords: rawWords.length, + uniqueWords: Object.keys(wordFrequency), dates, locations } From a05b45cea51ecbb05c691f9dbf653dfabe1413fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Jun 2026 00:41:50 +0000 Subject: [PATCH 12/15] fix review feedback gaps --- README.md | 4 +- docs/architecture.md | 13 ++-- src/analytics/diagnostic.js | 8 ++- src/delivery/csv-generator.js | 22 +++++-- src/index.js | 6 +- src/ingestion/file-ingestion.js | 28 +++++++- src/ingestion/worker.js | 49 ++++++++------ test/pipeline.test.js | 113 ++++++++++++++++++++++++++++---- 8 files changed, 188 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 8639920..c68009c 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ cd uap_analyticsbot 2. **Install dependencies:** -Because this engine utilizes pre-compiled WebAssembly, there are no complex C++ build tools or `node-gyp` configurations required on Windows. Simply run: +This project installs as a standard Node.js CLI package, so there are no extra native build steps required for the current worker-thread ingestion flow. Simply run: ```bash npm install @@ -208,7 +208,7 @@ node src/index.js ./my_folder/ --format=md The v1.2.0 AnalyticsBot engine supports multithreading and memoization caching. You can control these via CLI arguments: -* `node src/index.js ./my_folder --workers=4` : Manually set the number of WebAssembly worker threads (defaults to max CPU cores). +* `node src/index.js ./my_folder --workers=4` : Manually set the number of Node.js worker threads (defaults to max CPU cores). * `node src/index.js ./my_folder --clear-cache` : Bypasses the `.analytics_cache.json` file and forces a fresh read of all documents. * `node src/index.js ./my_folder --format=csv` : Exports the final report as a spreadsheet-compatible `.csv` file. diff --git a/docs/architecture.md b/docs/architecture.md index 43925b3..2346525 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -4,10 +4,10 @@ The repository currently ships a Node.js CLI-centered analytics flow: -1. **CLI Orchestrator (`src/index.js`)** resolves the source directory and writes the final report to stdout. -2. **Read-Only Ingestion (`src/ingestion/file-ingestion.js`)** recursively scans supported text files, streams file content, and extracts words, dates, locations, and filesystem metadata. +1. **CLI Orchestrator (`src/index.js`)** resolves the source directory, supports watch mode, and routes report output to stdout or export files. +2. **Read-Only Ingestion (`src/ingestion/file-ingestion.js`)** recursively scans supported text files, dispatches parsing work to Node.js worker threads, memoizes compatible results in `.analytics_cache.json`, and extracts words, dates, locations, and filesystem metadata. 3. **Analytics Pipeline (`src/pipeline.js`)** builds the descriptive, diagnostic, predictive, and prescriptive tiers from the ingested file set. -4. **Output Layer** returns a single structured JSON report for the requested directory. +4. **Output Layer** returns structured JSON or saves Markdown / CSV exports for the requested directory. ### v1.2.0 Pipeline Architecture * **Ingestion (Multithreaded):** Utilizes Node.js `worker_threads` and file-stat fingerprinting (`.analytics_cache.json`) to bypass redundant processing and drastically speed up execution. @@ -18,16 +18,17 @@ The repository currently ships a Node.js CLI-centered analytics flow: Implemented today: - recursive read-only ingestion for `.txt`, `.md`, `.json`, `.csv`, and `.log` +- multithreaded parsing with fingerprint-based cache reuse for compatible ingestions - tokenization plus lightweight date/location extraction - descriptive, diagnostic, predictive, and prescriptive analytics modules -- JSON report delivery through the Node CLI +- JSON, Markdown, and CSV report delivery through the Node CLI +- directory watch mode that re-runs the pipeline after file changes Not yet implemented in the active system: - binary or multimedia extraction - Named Entity Recognition (NER) -- dashboard or alternate export formats -- background scheduling or directory watching +- dashboard or background scheduling ## Planned Expansion diff --git a/src/analytics/diagnostic.js b/src/analytics/diagnostic.js index 5288e80..1d63749 100644 --- a/src/analytics/diagnostic.js +++ b/src/analytics/diagnostic.js @@ -82,6 +82,8 @@ function calculateTFIDF(files) { const MAX_CROSS_REF = 500; const targetFiles = vectorizedFiles.slice(0, MAX_CROSS_REF); const relatedByIndex = Array.from({ length: vectorizedFiles.length }, () => []); + + const getFileLabel = (file) => file.fileName || file.relativePath || 'unknown'; for (let indexA = 0; indexA < targetFiles.length; indexA += 1) { for (let indexB = indexA + 1; indexB < targetFiles.length; indexB += 1) { @@ -91,8 +93,8 @@ function calculateTFIDF(files) { if (score > 0.05) { const correlationScore = Number(score.toFixed(4)); - relatedByIndex[indexA].push({ match: fileB.fileName, correlationScore }); - relatedByIndex[indexB].push({ match: fileA.fileName, correlationScore }); + relatedByIndex[indexA].push({ match: getFileLabel(fileB), correlationScore }); + relatedByIndex[indexB].push({ match: getFileLabel(fileA), correlationScore }); } } } @@ -103,7 +105,7 @@ function calculateTFIDF(files) { : []; return { - fileName: file.fileName, + fileName: getFileLabel(file), topKeywords: file.topKeywords, relatedDocuments: related }; diff --git a/src/delivery/csv-generator.js b/src/delivery/csv-generator.js index 9ba2c7a..a2ef2d8 100644 --- a/src/delivery/csv-generator.js +++ b/src/delivery/csv-generator.js @@ -1,22 +1,32 @@ const fs = require('node:fs/promises'); const path = require('node:path'); +function escapeCsvCell(value) { + const stringValue = String(value ?? ''); + const sanitizedValue = /^[=+\-@]/.test(stringValue) ? `'${stringValue}` : stringValue; + return `"${sanitizedValue.replace(/"/g, '""')}"`; +} + +function buildCsvRow(...cells) { + return `${cells.map(escapeCsvCell).join(',')}\n`; +} + async function generateCsvReport(report, exportsDir) { await fs.mkdir(exportsDir, { recursive: true }); const csvPath = path.join(exportsDir, `report-${Date.now()}.csv`); - let csvContent = "Category,Metric,Value\n"; - csvContent += `Descriptive,FileCount,${report.descriptive.fileCount}\n`; + let csvContent = buildCsvRow('Category', 'Metric', 'Value'); + csvContent += buildCsvRow('Descriptive', 'FileCount', report.descriptive.fileCount); const locations = report.descriptive.locations || report.locations || []; - csvContent += `Descriptive,UniqueLocations,"${locations.join(', ')}"\n`; + csvContent += buildCsvRow('Descriptive', 'UniqueLocations', locations.join(', ')); if (report.predictive?.locationClusterForecast) { - csvContent += `Predictive,LikelyNextHotspot,${report.predictive.locationClusterForecast.likelyNextHotspot}\n`; + csvContent += buildCsvRow('Predictive', 'LikelyNextHotspot', report.predictive.locationClusterForecast.likelyNextHotspot); } if (report.predictive?.keywordFrequencyForecast) { - csvContent += `Predictive,ForecastMonth,${report.predictive.keywordFrequencyForecast.forecastMonth}\n`; - csvContent += `Predictive,ForecastWordCount,${report.predictive.keywordFrequencyForecast.forecastWordCount}\n`; + csvContent += buildCsvRow('Predictive', 'ForecastMonth', report.predictive.keywordFrequencyForecast.forecastMonth); + csvContent += buildCsvRow('Predictive', 'ForecastWordCount', report.predictive.keywordFrequencyForecast.forecastWordCount); } await fs.writeFile(csvPath, csvContent, 'utf-8'); diff --git a/src/index.js b/src/index.js index 6035dae..c06cf04 100644 --- a/src/index.js +++ b/src/index.js @@ -48,7 +48,7 @@ async function main() { const options = { clearCache, workers }; -if (isWatchMode) { + if (isWatchMode) { process.stdout.write(`👀 Watching directory for changes: ${sourceDirectory}\n`); const watcher = chokidar.watch(sourceDirectory, { // 🧹 Minor polish: Restrict ignored regex to strict ends-with path @@ -62,7 +62,9 @@ if (isWatchMode) { clearTimeout(timeout); timeout = setTimeout(() => { process.stdout.write(`\n🔄 File system event detected. Recalculating analytics...\n`); - runPipeline(sourceDirectory, format, options); + pipelineQueue = pipelineQueue + .then(() => runPipeline(sourceDirectory, format, options)) + .catch(() => {}); }, 500); }; watcher.on('add', triggerPipeline).on('change', triggerPipeline).on('unlink', triggerPipeline); diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index aab77e6..896c421 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -3,6 +3,25 @@ const os = require("node:os"); const { promises: fsp } = require("node:fs"); const { Worker } = require("node:worker_threads"); +const CACHE_SCHEMA_VERSION = 1; + +function parseCacheEntries(cacheData) { + const parsedCache = JSON.parse(cacheData); + + if ( + parsedCache && + typeof parsedCache === 'object' && + parsedCache.version === CACHE_SCHEMA_VERSION && + parsedCache.entries && + typeof parsedCache.entries === 'object' && + !Array.isArray(parsedCache.entries) + ) { + return parsedCache.entries; + } + + return {}; +} + async function* walkFiles(rootDirectory) { const directoryEntries = await fsp.readdir(rootDirectory, { withFileTypes: true }); @@ -29,7 +48,7 @@ async function ingestDirectory(rootDirectory, options = {}) { if (!options.clearCache) { try { const cacheData = await fsp.readFile(cachePath, 'utf-8'); - cache = JSON.parse(cacheData); + cache = parseCacheEntries(cacheData); } catch (err) { cache = {}; } @@ -101,7 +120,12 @@ async function ingestDirectory(rootDirectory, options = {}) { ); // Save newly parsed data back to .analytics_cache.json - await fsp.writeFile(cachePath, JSON.stringify(cache, null, 2)); + const tempCachePath = `${cachePath}.${process.pid}.${Date.now()}.tmp`; + await fsp.writeFile( + tempCachePath, + JSON.stringify({ version: CACHE_SCHEMA_VERSION, entries: cache }, null, 2) + ); + await fsp.rename(tempCachePath, cachePath); return { sourceDirectory, files }; } diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 18cd3f3..375129a 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -1,6 +1,8 @@ const path = require('node:path'); const { parentPort } = require('node:worker_threads'); -const fs = require('node:fs/promises'); +const fs = require('node:fs'); +const fsp = require('node:fs/promises'); +const readline = require('node:readline'); const SUPPORTED_TEXT_EXTENSIONS = new Set(['.txt', '.md', '.json', '.csv', '.log']); @@ -23,32 +25,37 @@ parentPort.on('message', async (task) => { return; } - const content = await fs.readFile(task.filePath, 'utf-8'); - const stats = await fs.stat(task.filePath); + const stats = await fsp.stat(task.filePath); const dates = []; const locations = []; + const wordFrequency = {}; + let totalWords = 0; + const fileStream = fs.createReadStream(task.filePath, { encoding: 'utf-8' }); + const lines = readline.createInterface({ input: fileStream, crlfDelay: Infinity }); - // Filter out punctuation, make lowercase, and cull stop words - const rawWords = content - .replace(/[^\w\s]/g, '') - .toLowerCase() - .split(/\s+/) - .filter(word => word.length > 1 && !STOP_WORDS.has(word)); + for await (const line of lines) { + // Filter out punctuation, make lowercase, and cull stop words + const rawWords = line + .replace(/[^\w\s]/g, '') + .toLowerCase() + .split(/\s+/) + .filter(word => word.length > 1 && !STOP_WORDS.has(word)); - // 🚀 OPTIMIZATION: Calculate map inside worker to drastically reduce IPC channel memory usage - const wordFrequency = {}; - for(const word of rawWords) { - wordFrequency[word] = (wordFrequency[word] || 0) + 1; - } + // 🚀 OPTIMIZATION: Calculate map inside worker to drastically reduce IPC channel memory usage + for (const word of rawWords) { + wordFrequency[word] = (wordFrequency[word] || 0) + 1; + } + totalWords += rawWords.length; - for (const match of content.matchAll(/Date:\s*(\d{4}-\d{2}-\d{2})/gi)) { - dates.push(match[1]); - } + for (const match of line.matchAll(/Date:\s*(\d{4}-\d{2}-\d{2})/gi)) { + dates.push(match[1]); + } - for (const match of content.matchAll(/Location:\s*([A-Za-z]+)/gi)) { - const loc = match[1].charAt(0).toUpperCase() + match[1].slice(1).toLowerCase(); - locations.push(loc); + for (const match of line.matchAll(/Location:\s*([A-Za-z]+)/gi)) { + const loc = match[1].charAt(0).toUpperCase() + match[1].slice(1).toLowerCase(); + locations.push(loc); + } } parentPort.postMessage({ @@ -62,7 +69,7 @@ parentPort.on('message', async (task) => { size: stats.size, modifiedAt: stats.mtime.toISOString(), wordFrequency, - totalWords: rawWords.length, + totalWords, uniqueWords: Object.keys(wordFrequency), dates, locations diff --git a/test/pipeline.test.js b/test/pipeline.test.js index 5a1b801..6c06805 100644 --- a/test/pipeline.test.js +++ b/test/pipeline.test.js @@ -5,6 +5,8 @@ const os = require('node:os'); const path = require('node:path'); const { generateAnalyticsReport } = require('../src/pipeline'); +const { buildDiagnosticAnalytics } = require('../src/analytics/diagnostic'); +const { generateCsvReport } = require('../src/delivery/csv-generator'); async function createFixtureDirectory() { const fixtureRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'uap-analytics-')); @@ -65,22 +67,107 @@ test('generateAnalyticsReport flags files with missing metadata for prescriptive } }); -test('generateAnalyticsReport builds all analytics tiers from text files (descriptive dates path)', async () => { - const fixtureRoot = await createFixtureDirectory(); +test('generateAnalyticsReport passes ingestion options through to the pipeline', async () => { + const ingestionModulePath = require.resolve('../src/ingestion/file-ingestion'); + const pipelineModulePath = require.resolve('../src/pipeline'); + const originalIngestionModule = require.cache[ingestionModulePath]; + const originalPipelineModule = require.cache[pipelineModulePath]; + let receivedOptions; + + delete require.cache[pipelineModulePath]; + require.cache[ingestionModulePath] = { + id: ingestionModulePath, + filename: ingestionModulePath, + loaded: true, + exports: { + ingestDirectory: async (_sourceDirectory, options) => { + receivedOptions = options; + return { + sourceDirectory: '/tmp/mock-source', + files: [ + { + fileName: 'fixture.txt', + locations: ['Roswell'], + dates: ['2024-01-01'], + wordFrequency: { sighting: 1 }, + totalWords: 1, + uniqueWords: ['sighting'] + } + ], + }; + } + } + }; try { - const report = await generateAnalyticsReport(fixtureRoot); + const { generateAnalyticsReport: generateMockedAnalyticsReport } = require('../src/pipeline'); + const report = await generateMockedAnalyticsReport('/tmp/mock-source', { workers: 4, clearCache: true }); - assert.equal(report.descriptive.fileCount, 2); - assert.deepEqual(report.descriptive.locations, ['Phoenix', 'Roswell']); - - assert.deepEqual(report.descriptive.dates, ['2024-01-01', '2024-02-14']); - - assert.ok(report.descriptive.wordFrequency.location >= 2); - assert.ok(report.diagnostic.wordUsageByLocation.Roswell.length > 0); - assert.equal(report.predictive.locationClusterForecast.likelyNextHotspot, 'Phoenix'); - assert.equal(report.prescriptive.recommendations[0].type, 'folder-restructure'); + assert.equal(report.sourceDirectory, '/tmp/mock-source'); + assert.equal(report.descriptive.fileCount, 1); + assert.deepEqual(receivedOptions, { workers: 4, clearCache: true }); } finally { - await fs.rm(fixtureRoot, { recursive: true, force: true }); + if (originalIngestionModule) { + require.cache[ingestionModulePath] = originalIngestionModule; + } else { + delete require.cache[ingestionModulePath]; + } + + if (originalPipelineModule) { + require.cache[pipelineModulePath] = originalPipelineModule; + } else { + delete require.cache[pipelineModulePath]; + } + } +}); + +test('buildDiagnosticAnalytics falls back to relative paths when file names are missing', () => { + const diagnostic = buildDiagnosticAnalytics([ + { + relativePath: 'reports/alpha.txt', + wordFrequency: { signal: 2, light: 1 }, + totalWords: 3, + uniqueWords: ['signal', 'light'] + }, + { + relativePath: 'reports/beta.txt', + wordFrequency: { signal: 2, glow: 1 }, + totalWords: 3, + uniqueWords: ['signal', 'glow'] + } + ]); + + assert.equal(diagnostic.semanticAnalysis[0].fileName, 'reports/alpha.txt'); + assert.equal(diagnostic.semanticAnalysis[0].relatedDocuments[0].match, 'reports/beta.txt'); +}); + +test('generateCsvReport escapes spreadsheet-sensitive values', async () => { + const exportsDir = await fs.mkdtemp(path.join(os.tmpdir(), 'uap-analytics-csv-')); + + try { + const csvPath = await generateCsvReport( + { + descriptive: { + fileCount: 1, + locations: ['=cmd|" /C calc"!A0', 'Phoenix, AZ'] + }, + predictive: { + locationClusterForecast: { + likelyNextHotspot: '@hidden' + }, + keywordFrequencyForecast: { + forecastMonth: '2026-06', + forecastWordCount: 3 + } + } + }, + exportsDir + ); + + const csvContent = await fs.readFile(csvPath, 'utf-8'); + assert.match(csvContent, /"'=cmd\|"" \/C calc""!A0, Phoenix, AZ"/); + assert.match(csvContent, /"'\@hidden"/); + } finally { + await fs.rm(exportsDir, { recursive: true, force: true }); } }); \ No newline at end of file From 1f0a00c92b9ba3bbf56c68b95de17845feea71e2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Jun 2026 00:42:58 +0000 Subject: [PATCH 13/15] fix workflow token permissions --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fb745c4..15b6413 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,6 +6,9 @@ on: pull_request: branches: [ main ] +permissions: + contents: read + jobs: test: runs-on: ubuntu-latest From 6d2718d6b62573e9361a6b68ad1607a8a5250b0d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Jun 2026 01:12:08 +0000 Subject: [PATCH 14/15] fix pipeline/watch/cache regressions and modernize test workflow --- .github/workflows/test.yml | 2 +- .husky/pre-commit | 6 +++ README.md | 5 +- docs/USER_GUIDE.md | 8 +-- docs/docs-source.json | 4 +- package.json | 2 +- src/index.js | 3 +- src/ingestion/file-ingestion.js | 5 +- src/ingestion/worker.js | 84 ++++++++++++++++++++++++------ test/ingestion-regressions.test.js | 73 ++++++++++++++++++++++++++ 10 files changed, 165 insertions(+), 27 deletions(-) create mode 100755 .husky/pre-commit create mode 100644 test/ingestion-regressions.test.js diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 15b6413..7ec4ee6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - node-version: [18.x, 20.x, 22.x] + node-version: [20.x, 22.x] steps: - uses: actions/checkout@v3 diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100755 index 0000000..86f640a --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1,6 @@ +#!/usr/bin/env sh + +npm run docs:generate || exit 1 +git add docs/ || exit 1 +npm test || exit 1 +npm run docs:check || exit 1 diff --git a/README.md b/README.md index c68009c..e4124dc 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,9 @@ The current Node ingestion pipeline only analyzes text-oriented files. | `.csv` | Ingested by the active Node pipeline | | `.log` | Ingested by the active Node pipeline | | `.pdf` | Ingested by the active Node pipeline | +| `.png` | Ingested by the active Node pipeline | +| `.jpg` | Ingested by the active Node pipeline | +| `.jpeg` | Ingested by the active Node pipeline | ## Repository Layout @@ -87,7 +90,7 @@ The current Node ingestion pipeline only analyzes text-oriented files. - `src/index.js` — Node CLI entry point. - `src/pipeline.js` — Pipeline coordinator that assembles all analytics tiers. -- `src/ingestion/file-ingestion.js` — Read-only recursive file ingestion for supported text files. +- `src/ingestion/file-ingestion.js` — Read-only recursive file ingestion for supported files. - `src/analytics/` — Descriptive, diagnostic, predictive, and prescriptive analytics modules. - `test/pipeline.test.js` — Node test coverage for core pipeline behavior. - `docs/architecture.md` — Hand-authored architecture overview for current and planned system design. diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index d8f5898..4f88bf4 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -56,14 +56,16 @@ npm start -- "C:\Path\To\Folder" > analytics_report.json ## Supported File Types -Currently, the ingestion engine natively parses the following text-based extensions: +Currently, the ingestion engine natively parses the following extensions: * `.txt` * `.md` * `.json` * `.csv` * `.log` - -*(Note: Binary and multimedia extraction, such as PDF parsing and Image OCR, are tracked for a future development stage).* +* `.pdf` +* `.png` +* `.jpg` +* `.jpeg` ## Testing & Validation diff --git a/docs/docs-source.json b/docs/docs-source.json index 568d598..2f8acb3 100644 --- a/docs/docs-source.json +++ b/docs/docs-source.json @@ -26,7 +26,7 @@ "description": "Auto-generate CHANGELOG.md, bump the semantic version, and create a Git release tag based on conventional commit history." } ], - "supportedFileTypes": [".txt", ".md", ".json", ".csv", ".log", ".pdf"], + "supportedFileTypes": [".txt", ".md", ".json", ".csv", ".log", ".pdf", ".png", ".jpg", ".jpeg"], "repoLayout": [ { "path": "src/index.js", @@ -38,7 +38,7 @@ }, { "path": "src/ingestion/file-ingestion.js", - "description": "Read-only recursive file ingestion for supported text files." + "description": "Read-only recursive file ingestion for supported files." }, { "path": "src/analytics/", diff --git a/package.json b/package.json index 7cc9abd..3382a69 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ "main": "src/index.js", "scripts": { "start": "node src/index.js", - "test": "node --test", + "test": "node --test --experimental-test-coverage", "docs:generate": "node scripts/generate-docs.js", "docs:check": "node scripts/generate-docs.js --check && node scripts/validate-docs.js", "release": "commit-and-tag-version", diff --git a/src/index.js b/src/index.js index c06cf04..031fc69 100644 --- a/src/index.js +++ b/src/index.js @@ -51,8 +51,7 @@ async function main() { if (isWatchMode) { process.stdout.write(`👀 Watching directory for changes: ${sourceDirectory}\n`); const watcher = chokidar.watch(sourceDirectory, { - // 🧹 Minor polish: Restrict ignored regex to strict ends-with path - ignored: [/(^|[\/\\])\../, /node_modules/, /data_exports[\/\\]?$/], + ignored: [/(^|[\/\\])\../, /node_modules/, /[\/\\]data_exports([\/\\]|$)/], persistent: true, ignoreInitial: false }); diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index 896c421..0172399 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -69,7 +69,10 @@ async function ingestDirectory(rootDirectory, options = {}) { // Evict stale cache keys scoped to this sourceDirectory for (const key of Object.keys(cache)) { - if (key.startsWith(sourceDirectory) && !visitedPaths.has(key)) { + if ( + (key === sourceDirectory || key.startsWith(sourceDirectory + path.sep)) && + !visitedPaths.has(key) + ) { delete cache[key]; } } diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 375129a..5b4f436 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -3,8 +3,11 @@ const { parentPort } = require('node:worker_threads'); const fs = require('node:fs'); const fsp = require('node:fs/promises'); const readline = require('node:readline'); +const nlp = require('compromise'); -const SUPPORTED_TEXT_EXTENSIONS = new Set(['.txt', '.md', '.json', '.csv', '.log']); +const TEXT_EXTENSIONS = new Set(['.txt', '.md', '.json', '.csv', '.log']); +const IMAGE_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg']); +const SUPPORTED_EXTENSIONS = new Set([...TEXT_EXTENSIONS, ...IMAGE_EXTENSIONS, '.pdf']); // ✨ Advanced Stop-Word Culling Dictionary const STOP_WORDS = new Set([ @@ -16,7 +19,7 @@ const STOP_WORDS = new Set([ parentPort.on('message', async (task) => { try { const extension = path.extname(task.filePath).toLowerCase(); - if (!SUPPORTED_TEXT_EXTENSIONS.has(extension)) { + if (!SUPPORTED_EXTENSIONS.has(extension)) { parentPort.postMessage({ success: true, filePath: task.filePath, @@ -27,16 +30,15 @@ parentPort.on('message', async (task) => { const stats = await fsp.stat(task.filePath); - const dates = []; - const locations = []; + const dates = new Set(); + const locations = new Set(); const wordFrequency = {}; let totalWords = 0; - const fileStream = fs.createReadStream(task.filePath, { encoding: 'utf-8' }); - const lines = readline.createInterface({ input: fileStream, crlfDelay: Infinity }); - for await (const line of lines) { - // Filter out punctuation, make lowercase, and cull stop words - const rawWords = line + const processTextChunk = (text) => { + if (!text) return; + + const rawWords = text .replace(/[^\w\s]/g, '') .toLowerCase() .split(/\s+/) @@ -48,14 +50,64 @@ parentPort.on('message', async (task) => { } totalWords += rawWords.length; - for (const match of line.matchAll(/Date:\s*(\d{4}-\d{2}-\d{2})/gi)) { - dates.push(match[1]); + const doc = nlp(text); + for (const value of doc.match('#Date').out('array')) { + dates.add(value); + } + for (const value of doc.match('#Place').out('array')) { + locations.add(value); + } + + for (const match of text.matchAll(/Date:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})/gi)) { + dates.add(match[1]); + } + for (const match of text.matchAll(/Location:\s*([A-Za-z][A-Za-z\s'-]*)/gi)) { + locations.add(match[1].trim()); } + }; + + const processTextFile = async () => { + const fileStream = fs.createReadStream(task.filePath, { encoding: 'utf-8' }); + const lines = readline.createInterface({ input: fileStream, crlfDelay: Infinity }); - for (const match of line.matchAll(/Location:\s*([A-Za-z]+)/gi)) { - const loc = match[1].charAt(0).toUpperCase() + match[1].slice(1).toLowerCase(); - locations.push(loc); + try { + for await (const line of lines) { + processTextChunk(line); + } + } finally { + lines.close(); + fileStream.destroy(); } + }; + + const processPdfFile = async () => { + try { + const pdfParse = require('pdf-parse'); + const parseFn = typeof pdfParse === 'function' ? pdfParse : pdfParse.default; + const dataBuffer = await fsp.readFile(task.filePath); + const pdfResult = await parseFn(dataBuffer); + processTextChunk(pdfResult?.text || ''); + } catch (error) { + process.stderr.write(`\n⚠️ PDF extraction skipped (${task.filePath}): ${error.message}\n`); + } + }; + + const processImageFile = async () => { + try { + const tesseract = require('tesseract.js'); + const result = await tesseract.recognize(task.filePath, 'eng', { logger: () => {} }); + processTextChunk(result?.data?.text || ''); + } catch (error) { + process.stderr.write(`\n⚠️ Image OCR skipped (${task.filePath}): ${error.message}\n`); + } + }; + + if (TEXT_EXTENSIONS.has(extension)) { + await processTextFile(); + } else if (extension === '.pdf') { + await processPdfFile(); + } else if (IMAGE_EXTENSIONS.has(extension)) { + await processImageFile(); } parentPort.postMessage({ @@ -71,8 +123,8 @@ parentPort.on('message', async (task) => { wordFrequency, totalWords, uniqueWords: Object.keys(wordFrequency), - dates, - locations + dates: [...dates], + locations: [...locations] } }); } catch (error) { diff --git a/test/ingestion-regressions.test.js b/test/ingestion-regressions.test.js new file mode 100644 index 0000000..3496e18 --- /dev/null +++ b/test/ingestion-regressions.test.js @@ -0,0 +1,73 @@ +const test = require('node:test'); +const assert = require('node:assert/strict'); +const fs = require('node:fs/promises'); +const os = require('node:os'); +const path = require('node:path'); + +const { ingestDirectory } = require('../src/ingestion/file-ingestion'); +const { generateAnalyticsReport } = require('../src/pipeline'); + +test('watch mode ignores data_exports directory and descendants', async () => { + const indexSource = await fs.readFile(path.join(__dirname, '..', 'src', 'index.js'), 'utf-8'); + + assert.ok(indexSource.includes('/[\\/\\\\]data_exports([\\/\\\\]|$)/')); + assert.ok(!indexSource.includes('/data_exports[\\/\\\\]?$/')); +}); + +test('cache eviction does not remove sibling directory entries', async () => { + const cwdBefore = process.cwd(); + const workspace = await fs.mkdtemp(path.join(os.tmpdir(), 'uap-cache-')); + const sourceDirectory = path.join(workspace, 'UAP_Data'); + const siblingDirectory = path.join(workspace, 'UAP_Data_Archive'); + const liveFile = path.join(sourceDirectory, 'live.txt'); + const staleSourceFile = path.join(sourceDirectory, 'stale.txt'); + const staleSiblingFile = path.join(siblingDirectory, 'stale.txt'); + + try { + await fs.mkdir(sourceDirectory, { recursive: true }); + await fs.mkdir(siblingDirectory, { recursive: true }); + await fs.writeFile(liveFile, 'Roswell event on 2024-01-01'); + + await fs.writeFile( + path.join(workspace, '.analytics_cache.json'), + JSON.stringify( + { + version: 1, + entries: { + [staleSourceFile]: { fingerprint: 'old', data: { fileName: 'stale.txt' } }, + [staleSiblingFile]: { fingerprint: 'old', data: { fileName: 'stale.txt' } }, + }, + }, + null, + 2 + ) + ); + + process.chdir(workspace); + await ingestDirectory(sourceDirectory, { workers: 1 }); + + const cache = JSON.parse(await fs.readFile(path.join(workspace, '.analytics_cache.json'), 'utf-8')); + assert.equal(cache.entries[staleSourceFile], undefined); + assert.ok(cache.entries[staleSiblingFile]); + } finally { + process.chdir(cwdBefore); + await fs.rm(workspace, { recursive: true, force: true }); + } +}); + +test('worker NLP extraction captures natural-language dates and places', async () => { + const fixtureRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'uap-nlp-')); + + try { + await fs.writeFile( + path.join(fixtureRoot, 'observation.txt'), + 'Witnesses reported unusual movement on 2024-03-05 near Phoenix in Arizona.' + ); + + const report = await generateAnalyticsReport(fixtureRoot, { workers: 1, clearCache: true }); + assert.ok(report.descriptive.dates.length > 0); + assert.ok(report.descriptive.locations.includes('Phoenix')); + } finally { + await fs.rm(fixtureRoot, { recursive: true, force: true }); + } +}); From c12566aea1203b116f5ae013c940dbcf2ff589f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Jun 2026 09:41:14 +0000 Subject: [PATCH 15/15] Address latest PR review thread regressions --- package-lock.json | 19 ++++++++- package.json | 4 +- src/analytics/predictive.js | 3 +- src/delivery/csv-generator.js | 2 +- src/ingestion/file-ingestion.js | 74 ++++++++++++++++----------------- src/ingestion/worker.js | 2 +- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/package-lock.json b/package-lock.json index d1e5ba5..49baa7c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -16,7 +16,8 @@ "tesseract.js": "^7.0.0" }, "devDependencies": { - "commit-and-tag-version": "^12.7.3" + "commit-and-tag-version": "^12.7.3", + "husky": "^9.1.7" } }, "node_modules/@babel/code-frame": { @@ -1299,6 +1300,22 @@ "node": ">=10" } }, + "node_modules/husky": { + "version": "9.1.7", + "resolved": "https://registry.npmjs.org/husky/-/husky-9.1.7.tgz", + "integrity": "sha512-5gs5ytaNjBrh5Ow3zrvdUUY+0VxIuWVL4i9irt6friV+BqdCfmV11CQTWMiBYWHbXhco+J1kHfTOUkePhCDvMA==", + "dev": true, + "license": "MIT", + "bin": { + "husky": "bin.js" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/typicode" + } + }, "node_modules/idb-keyval": { "version": "6.2.5", "resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.5.tgz", diff --git a/package.json b/package.json index 3382a69..b8c1b2c 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "test": "node --test --experimental-test-coverage", "docs:generate": "node scripts/generate-docs.js", "docs:check": "node scripts/generate-docs.js --check && node scripts/validate-docs.js", + "prepare": "husky", "release": "commit-and-tag-version", "postrelease": "git push --follow-tags && gh release create v%npm_package_version% --notes-file CHANGELOG.md --title \"Release v%npm_package_version%\"" }, @@ -29,6 +30,7 @@ "tesseract.js": "^7.0.0" }, "devDependencies": { - "commit-and-tag-version": "^12.7.3" + "commit-and-tag-version": "^12.7.3", + "husky": "^9.1.7" } } diff --git a/src/analytics/predictive.js b/src/analytics/predictive.js index bacfe5d..492e76b 100644 --- a/src/analytics/predictive.js +++ b/src/analytics/predictive.js @@ -48,8 +48,7 @@ function fillEmptyIntervals(orderedMonths, timeline) { function buildKeywordSeries(files) { const timeline = {}; for (const file of files) { - // 🚨 FIX: Extract historical dates first, fallback to OS modification if none exist - const documentDate = (file.dates && file.dates.length > 0) ? file.dates[0] : file.modifiedAt; + const documentDate = (file.dates || []).find((value) => /^[0-9]{4}-[0-9]{2}(?:-[0-9]{2})?$/.test(value)) || file.modifiedAt; if (!documentDate) continue; const key = monthKey(documentDate); diff --git a/src/delivery/csv-generator.js b/src/delivery/csv-generator.js index a2ef2d8..45e328c 100644 --- a/src/delivery/csv-generator.js +++ b/src/delivery/csv-generator.js @@ -3,7 +3,7 @@ const path = require('node:path'); function escapeCsvCell(value) { const stringValue = String(value ?? ''); - const sanitizedValue = /^[=+\-@]/.test(stringValue) ? `'${stringValue}` : stringValue; + const sanitizedValue = /^\s*[=+\-@]/.test(stringValue) ? `'${stringValue}` : stringValue; return `"${sanitizedValue.replace(/"/g, '""')}"`; } diff --git a/src/ingestion/file-ingestion.js b/src/ingestion/file-ingestion.js index 0172399..3d1ecb9 100644 --- a/src/ingestion/file-ingestion.js +++ b/src/ingestion/file-ingestion.js @@ -80,47 +80,45 @@ async function ingestDirectory(rootDirectory, options = {}) { const maxCores = options.workers || Math.max(1, os.cpus().length - 1); const numWorkers = Math.min(pathsToProcess.length, maxCores); - if (numWorkers === 0) { - return { sourceDirectory, files }; - } - - process.stdout.write(`\n🚀 Initializing WebAssembly Worker Pool (${numWorkers} threads)...\n`); - - let currentIndex = 0; - - await Promise.all( - Array.from({ length: numWorkers }).map(() => { - return new Promise((resolve) => { - const worker = new Worker(path.join(__dirname, "worker.js")); - - worker.on("message", (msg) => { - if (msg.success && msg.result) { - files.push(msg.result); - cache[msg.filePath] = { fingerprint: msg.fingerprint, data: msg.result }; - } else if (!msg.success) { - process.stderr.write(`\n⚠️ File failed (${msg.filePath}): ${msg.error}\n`); - } - assignNextTask(); - }); - - worker.on("error", (err) => { - process.stderr.write(`\n⚠️ Fatal Worker Crash: ${err.message}\n`); - worker.terminate().then(resolve); - }); - - function assignNextTask() { - if (currentIndex >= pathsToProcess.length) { + if (numWorkers > 0) { + process.stdout.write(`\n🚀 Initializing WebAssembly Worker Pool (${numWorkers} threads)...\n`); + + let currentIndex = 0; + + await Promise.all( + Array.from({ length: numWorkers }).map(() => { + return new Promise((resolve) => { + const worker = new Worker(path.join(__dirname, "worker.js")); + + worker.on("message", (msg) => { + if (msg.success && msg.result) { + files.push(msg.result); + cache[msg.filePath] = { fingerprint: msg.fingerprint, data: msg.result }; + } else if (!msg.success) { + process.stderr.write(`\n⚠️ File failed (${msg.filePath}): ${msg.error}\n`); + } + assignNextTask(); + }); + + worker.on("error", (err) => { + process.stderr.write(`\n⚠️ Fatal Worker Crash: ${err.message}\n`); worker.terminate().then(resolve); - return; + }); + + function assignNextTask() { + if (currentIndex >= pathsToProcess.length) { + worker.terminate().then(resolve); + return; + } + const task = pathsToProcess[currentIndex++]; + worker.postMessage({ filePath: task.filePath, fingerprint: task.fingerprint, rootDirectory: sourceDirectory }); } - const task = pathsToProcess[currentIndex++]; - worker.postMessage({ filePath: task.filePath, fingerprint: task.fingerprint, rootDirectory: sourceDirectory }); - } - assignNextTask(); - }); - }) - ); + assignNextTask(); + }); + }) + ); + } // Save newly parsed data back to .analytics_cache.json const tempCachePath = `${cachePath}.${process.pid}.${Date.now()}.tmp`; diff --git a/src/ingestion/worker.js b/src/ingestion/worker.js index 5b4f436..d333a2c 100644 --- a/src/ingestion/worker.js +++ b/src/ingestion/worker.js @@ -42,7 +42,7 @@ parentPort.on('message', async (task) => { .replace(/[^\w\s]/g, '') .toLowerCase() .split(/\s+/) - .filter(word => word.length > 1 && !STOP_WORDS.has(word)); + .filter(word => word.length > 1 && !STOP_WORDS.has(word) && !/^\d+$/.test(word)); // 🚀 OPTIMIZATION: Calculate map inside worker to drastically reduce IPC channel memory usage for (const word of rawWords) {