From e9985633653e9692a1de31b012a1c86a8ab4ac7d Mon Sep 17 00:00:00 2001 From: harshpreet931 Date: Wed, 17 Sep 2025 13:16:22 +0530 Subject: [PATCH 1/3] feat: add PDF processing support and enhance document handling - Updated allowed document MIME types to include 'application/pdf'. - Implemented PDF content extraction in a new module (pdf-parser.ts). - Integrated PDF processing into the document extraction workflow. - Enhanced error handling for PDF processing, including password protection. - Added functions for normalizing and cleaning text extracted from PDFs. - Implemented chunking of text for better handling of large documents. - Introduced image extraction markers and descriptions for images in PDFs. --- examples/attachment-demo-server.ts | 46 ++- package.json | 2 + pnpm-lock.yaml | 333 ++++++++++++++++++ src/utils/attachments.ts | 3 +- src/utils/document-processor.ts | 8 +- src/utils/pdf-parser.ts | 546 +++++++++++++++++++++++++++++ 6 files changed, 932 insertions(+), 6 deletions(-) create mode 100644 src/utils/pdf-parser.ts diff --git a/examples/attachment-demo-server.ts b/examples/attachment-demo-server.ts index 03e44bf..b4b50e7 100644 --- a/examples/attachment-demo-server.ts +++ b/examples/attachment-demo-server.ts @@ -9,7 +9,7 @@ When users send you attachments, analyze them carefully and provide helpful, det For images: Describe what you see in detail. For documents: Analyze and summarize the content, structure, or data as appropriate. -Supported document types: DOCX, XLSX, CSV, TXT, JSON, ZIP files.`, +Supported document types: PDF, DOCX, XLSX, CSV, TXT, JSON, ZIP files.`, modelConfig: { name: 'claude-sonnet-4', temperature: 0.7, @@ -144,6 +144,48 @@ server.start().then(() => { ] }'\n`); + console.log('5.5. 
PDF document:'); + console.log(`curl -X POST http://localhost:3002/chat \\ + -H "Content-Type: application/json" \\ + -d '{ + "agentName": "attachment-analyst", + "messages": [ + { + "role": "user", + "content": "Please analyze this PDF document", + "attachments": [ + { + "kind": "document", + "mimeType": "application/pdf", + "name": "sample-document.pdf", + "url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + } + ] + } + ] + }'\n`); + + console.log('5.6. PDF with base64 data:'); + console.log(`curl -X POST http://localhost:3002/chat \\ + -H "Content-Type: application/json" \\ + -d '{ + "agentName": "attachment-analyst", + "messages": [ + { + "role": "user", + "content": "Extract text from this PDF document", + "attachments": [ + { + "kind": "document", + "mimeType": "application/pdf", + "name": "test-document.pdf", + "data": "JVBERi0xLjQKMSAwIG9iago8PAovVHlwZSAvQ2F0YWxvZwovUGFnZXMgMiAwIFIKPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1R5cGUgL1BhZ2VzCi9LaWRzIFsgMyAwIFIgXQovQ291bnQgMQo+PgplbmRvYmoKMyAwIG9iago8PAovVHlwZSAvUGFnZQovUGFyZW50IDIgMCBSCi9NZWRpYUJveCBbIDAgMCA2MTIgNzkyIF0KL1Jlc291cmNlcyA8PAovRm9udCA8PAovRjEgNCAwIFIKPj4KPj4KL0NvbnRlbnRzIDUgMCBSCj4+CmVuZG9iago0IDAgb2JqCjw8Ci9UeXBlIC9Gb250Ci9TdWJ0eXBlIC9UeXBlMQovQmFzZUZvbnQgL0hlbHZldGljYQo+PgplbmRvYmoKNSAwIG9iago8PAovTGVuZ3RoIDQ0Cj4+CnN0cmVhbQpCVApxCjcyIDcwMCBUZAovRjEgMTIgVGYKKEhlbGxvLCBQREYgV29ybGQhKSBUagpFVApRCmVuZHN0cmVhbQplbmRvYmoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDEwIDAwMDAwIG4gCjAwMDAwMDAwNzkgMDAwMDAgbiAKMDAwMDAwMDE3MyAwMDAwMCBuIAowMDAwMDAwMzAxIDAwMDAwIG4gCjAwMDAwMDAzODAgMDAwMDAgbiAKdHJhaWxlcgo8PAovU2l6ZSA2Ci9Sb290IDEgMCBSCj4+CnN0YXJ0eHJlZgo0NzQKJSVFT0Y=" + } + ] + } + ] + }'\n`); + console.log('6. 
CSV data:'); console.log(`curl -X POST http://localhost:3002/chat \\ -H "Content-Type: application/json" \\ @@ -286,7 +328,7 @@ server.start().then(() => { console.log('Configuration:'); console.log('- Use Ctrl+C to stop the server'); console.log('- Image attachments: Full visual analysis'); - console.log('- Document attachments: Text extraction and analysis for DOCX, XLSX, CSV, TXT, JSON, ZIP'); + console.log('- Document attachments: Text extraction and analysis for PDF, DOCX, XLSX, CSV, TXT, JSON, ZIP'); console.log('- LiteLLM format: Use "useLiteLLMFormat": true for efficient large file processing'); console.log(' * Large documents: No context window waste, native model processing'); console.log(' * Better layout understanding, tables, images preserved'); diff --git a/package.json b/package.json index fa67b7e..1b742d2 100644 --- a/package.json +++ b/package.json @@ -143,6 +143,7 @@ "@modelcontextprotocol/sdk": "^1.17.4", "@types/yauzl": "^2.10.3", "ai": "^5.0.35", + "canvas": "^3.2.0", "eventsource": "^2.0.2", "express": "^4.18.2", "fastify": "^4.29.1", @@ -150,6 +151,7 @@ "mathjs": "^14.6.0", "openai": "^4.0.0", "papaparse": "^5.5.3", + "pdfjs-dist": "^5.4.149", "tunnel": "^0.0.6", "uuid": "^9.0.0", "xlsx": "^0.18.5", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5cac41d..5c44b27 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: ai: specifier: ^5.0.35 version: 5.0.39(zod@3.25.76) + canvas: + specifier: ^3.2.0 + version: 3.2.0 eventsource: specifier: ^2.0.2 version: 2.0.2 @@ -41,6 +44,9 @@ importers: papaparse: specifier: ^5.5.3 version: 5.5.3 + pdfjs-dist: + specifier: ^5.4.149 + version: 5.4.149 tunnel: specifier: ^0.0.6 version: 0.0.6 @@ -965,6 +971,70 @@ packages: resolution: {integrity: sha512-QakrKIGniGuRVfWBdMsDea/dx1PNE739QJ7gCM41s9q+qaCYTHCdsIBXQVVXry3mfWAiaM9kT22Hyz53Uw8mfg==} engines: {node: '>=18'} + '@napi-rs/canvas-android-arm64@0.1.80': + resolution: {integrity: 
sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [android] + + '@napi-rs/canvas-darwin-arm64@0.1.80': + resolution: {integrity: sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [darwin] + + '@napi-rs/canvas-darwin-x64@0.1.80': + resolution: {integrity: sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==} + engines: {node: '>= 10'} + cpu: [x64] + os: [darwin] + + '@napi-rs/canvas-linux-arm-gnueabihf@0.1.80': + resolution: {integrity: sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==} + engines: {node: '>= 10'} + cpu: [arm] + os: [linux] + + '@napi-rs/canvas-linux-arm64-gnu@0.1.80': + resolution: {integrity: sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + + '@napi-rs/canvas-linux-arm64-musl@0.1.80': + resolution: {integrity: sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + + '@napi-rs/canvas-linux-riscv64-gnu@0.1.80': + resolution: {integrity: sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==} + engines: {node: '>= 10'} + cpu: [riscv64] + os: [linux] + + '@napi-rs/canvas-linux-x64-gnu@0.1.80': + resolution: {integrity: sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + + '@napi-rs/canvas-linux-x64-musl@0.1.80': + resolution: {integrity: sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + + '@napi-rs/canvas-win32-x64-msvc@0.1.80': + 
resolution: {integrity: sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [win32] + + '@napi-rs/canvas@0.1.80': + resolution: {integrity: sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==} + engines: {node: '>= 10'} + '@nodelib/fs.scandir@2.1.5': resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} engines: {node: '>= 8'} @@ -1523,6 +1593,9 @@ packages: bignumber.js@9.3.1: resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} + bl@4.1.0: + resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} + bluebird@3.4.7: resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} @@ -1565,6 +1638,9 @@ packages: buffer-from@1.1.2: resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} + buffer@5.7.1: + resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==} + bytes@3.1.2: resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} engines: {node: '>= 0.8'} @@ -1592,6 +1668,10 @@ packages: caniuse-lite@1.0.30001735: resolution: {integrity: sha512-EV/laoX7Wq2J9TQlyIXRxTJqIw4sxfXS4OYgudGxBYRuTv0q7AM6yMEpU/Vo1I94thg9U6EZ2NfZx9GJq83u7w==} + canvas@3.2.0: + resolution: {integrity: sha512-jk0GxrLtUEmW/TmFsk2WghvgHe8B0pxGilqCL21y8lHkPUGa6FTsnCNtHPOzT8O3y+N+m3espawV80bbBlgfTA==} + engines: {node: ^18.12.0 || >= 20.9.0} + cfb@1.2.2: resolution: {integrity: sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==} engines: {node: '>=0.8'} @@ -1604,6 +1684,9 @@ 
packages: resolution: {integrity: sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==} engines: {node: '>=10'} + chownr@1.1.4: + resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} + ci-info@3.9.0: resolution: {integrity: sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==} engines: {node: '>=8'} @@ -1722,6 +1805,10 @@ packages: decimal.js@10.6.0: resolution: {integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==} + decompress-response@6.0.0: + resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==} + engines: {node: '>=10'} + dedent@1.6.0: resolution: {integrity: sha512-F1Z+5UCFpmQUzJa11agbyPVMbpgT/qA3/SKyJ1jyBgm7dUcUEa8v9JwDkerSQXfakBwFljIxhOJqGkjUwZ9FSA==} peerDependencies: @@ -1730,6 +1817,10 @@ packages: babel-plugin-macros: optional: true + deep-extend@0.6.0: + resolution: {integrity: sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==} + engines: {node: '>=4.0.0'} + deep-is@0.1.4: resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==} @@ -1757,6 +1848,10 @@ packages: resolution: {integrity: sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==} engines: {node: '>= 0.8', npm: 1.2.8000 || >= 1.4.16} + detect-libc@2.1.0: + resolution: {integrity: sha512-vEtk+OcP7VBRtQZ1EJ3bdgzSfBjgnEalLTp5zjJrS+2Z1w2KZly4SBdac/WDU3hhsNAZ9E8SC96ME4Ey8MZ7cg==} + engines: {node: '>=8'} + detect-newline@3.1.0: resolution: {integrity: sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==} engines: {node: '>=8'} @@ -1815,6 +1910,9 @@ packages: resolution: {integrity: 
sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==} engines: {node: '>= 0.8'} + end-of-stream@1.4.5: + resolution: {integrity: sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==} + error-ex@1.3.2: resolution: {integrity: sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==} @@ -1924,6 +2022,10 @@ packages: resolution: {integrity: sha512-Zk/eNKV2zbjpKzrsQ+n1G6poVbErQxJ0LBOJXaKZ1EViLzH+hrLu9cdXI4zw9dBQJslwBEpbQ2P1oS7nDxs6jQ==} engines: {node: '>= 0.8.0'} + expand-template@2.0.3: + resolution: {integrity: sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==} + engines: {node: '>=6'} + expect@29.7.0: resolution: {integrity: sha512-2Zks0hf1VLFYI1kbh0I5jP3KHHyCHpkfyHBzsSXRFgl/Bg9mWYfMW8oD+PdMPlEwy5HNsR9JutYy6pMeOh61nw==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -2070,6 +2172,9 @@ packages: resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} engines: {node: '>= 0.8'} + fs-constants@1.0.0: + resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} + fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2120,6 +2225,9 @@ packages: get-tsconfig@4.10.1: resolution: {integrity: sha512-auHyJ4AgMz7vgS8Hp3N6HXSmlMdUyhSUrfBF16w153rxtLIEOE+HGqaBppczZvnHLqQJfiHotCYpNhl0lUROFQ==} + github-from-package@0.0.0: + resolution: {integrity: sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==} + glob-parent@5.1.2: resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==} engines: {node: '>= 6'} @@ -2209,6 +2317,9 @@ packages: resolution: {integrity: 
sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} engines: {node: '>=0.10.0'} + ieee754@1.2.1: + resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} + ignore@5.3.2: resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} engines: {node: '>= 4'} @@ -2239,6 +2350,9 @@ packages: inherits@2.0.4: resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} + ini@1.3.8: + resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==} + ioredis@5.7.0: resolution: {integrity: sha512-NUcA93i1lukyXU+riqEyPtSEkyFq8tX90uL659J+qpCZ3rEdViB/APC58oAhIh3+bJln2hzdlZbBZsGNrlsR8g==} engines: {node: '>=12.22.0'} @@ -2646,6 +2760,10 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + mimic-response@3.1.0: + resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==} + engines: {node: '>=10'} + minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -2656,6 +2774,9 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + mkdirp-classic@0.5.3: + resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==} + mnemonist@0.39.6: resolution: {integrity: sha512-A/0v5Z59y63US00cRSLiloEIw3t5G+MiKz4BhX21FI+YBJXBOGW0ohFxTxO08dsOYlzxo87T7vGfZKYp2bcAWA==} @@ -2671,6 +2792,9 @@ packages: ms@2.1.3: resolution: {integrity: 
sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==} + napi-build-utils@2.0.0: + resolution: {integrity: sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==} + natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -2685,6 +2809,13 @@ packages: neo-async@2.6.2: resolution: {integrity: sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==} + node-abi@3.77.0: + resolution: {integrity: sha512-DSmt0OEcLoK4i3NuscSbGjOf3bqiDEutejqENSplMSFA/gmB8mkED9G4pKWnPl7MDU4rSHebKPHeitpDfyH0cQ==} + engines: {node: '>=10'} + + node-addon-api@7.1.1: + resolution: {integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==} + node-domexception@1.0.0: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} @@ -2821,6 +2952,10 @@ packages: resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} engines: {node: '>=8'} + pdfjs-dist@5.4.149: + resolution: {integrity: sha512-Xe8/1FMJEQPUVSti25AlDpwpUm2QAVmNOpFP0SIahaPIOKBKICaefbzogLdwey3XGGoaP4Lb9wqiw2e9Jqp0LA==} + engines: {node: '>=20.16.0 || >=22.3.0'} + pend@1.2.0: resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} @@ -2903,6 +3038,11 @@ packages: resolution: {integrity: sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==} engines: {node: '>=0.10.0'} + prebuild-install@7.1.3: + resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==} + engines: {node: '>=10'} + hasBin: true + prelude-ls@1.2.1: resolution: {integrity: 
sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} engines: {node: '>= 0.8.0'} @@ -2935,6 +3075,9 @@ packages: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} + pump@3.0.3: + resolution: {integrity: sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==} + punycode@2.3.1: resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} engines: {node: '>=6'} @@ -2968,12 +3111,20 @@ packages: resolution: {integrity: sha512-RmkhL8CAyCRPXCE28MMH0z2PNWQBNk2Q09ZdxM9IOOXwxwZbN+qbWaatPkdkWIKL2ZVDImrN/pK5HTRz2PcS4g==} engines: {node: '>= 0.8'} + rc@1.2.8: + resolution: {integrity: sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==} + hasBin: true + react-is@18.3.1: resolution: {integrity: sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==} readable-stream@2.3.8: resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} + readable-stream@3.6.2: + resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} + engines: {node: '>= 6'} + real-require@0.2.0: resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==} engines: {node: '>= 12.13.0'} @@ -3144,6 +3295,12 @@ packages: signal-exit@3.0.7: resolution: {integrity: sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==} + simple-concat@1.0.1: + resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} + + simple-get@4.0.1: + resolution: {integrity: 
sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==} + sisteransi@1.0.5: resolution: {integrity: sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==} @@ -3206,6 +3363,10 @@ packages: resolution: {integrity: sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==} engines: {node: '>=6'} + strip-json-comments@2.0.1: + resolution: {integrity: sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==} + engines: {node: '>=0.10.0'} + strip-json-comments@3.1.1: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} @@ -3222,6 +3383,13 @@ packages: resolution: {integrity: sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==} engines: {node: '>= 0.4'} + tar-fs@2.1.4: + resolution: {integrity: sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==} + + tar-stream@2.2.0: + resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==} + engines: {node: '>=6'} + test-exclude@6.0.0: resolution: {integrity: sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==} engines: {node: '>=8'} @@ -3291,6 +3459,9 @@ packages: engines: {node: '>=18.0.0'} hasBin: true + tunnel-agent@0.6.0: + resolution: {integrity: sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==} + tunnel@0.0.6: resolution: {integrity: sha512-1h/Lnq9yajKY2PEbBadPXj3VxsDDu844OnaAo52UVmIzIvwwtBPIuNvkjuzBlTWpfJyUbG3ez0KSBibQkj4ojg==} engines: {node: '>=0.6.11 <=0.7.0 || >=0.7.3'} @@ -4122,6 +4293,50 @@ snapshots: transitivePeerDependencies: - supports-color + '@napi-rs/canvas-android-arm64@0.1.80': + optional: true + + 
'@napi-rs/canvas-darwin-arm64@0.1.80': + optional: true + + '@napi-rs/canvas-darwin-x64@0.1.80': + optional: true + + '@napi-rs/canvas-linux-arm-gnueabihf@0.1.80': + optional: true + + '@napi-rs/canvas-linux-arm64-gnu@0.1.80': + optional: true + + '@napi-rs/canvas-linux-arm64-musl@0.1.80': + optional: true + + '@napi-rs/canvas-linux-riscv64-gnu@0.1.80': + optional: true + + '@napi-rs/canvas-linux-x64-gnu@0.1.80': + optional: true + + '@napi-rs/canvas-linux-x64-musl@0.1.80': + optional: true + + '@napi-rs/canvas-win32-x64-msvc@0.1.80': + optional: true + + '@napi-rs/canvas@0.1.80': + optionalDependencies: + '@napi-rs/canvas-android-arm64': 0.1.80 + '@napi-rs/canvas-darwin-arm64': 0.1.80 + '@napi-rs/canvas-darwin-x64': 0.1.80 + '@napi-rs/canvas-linux-arm-gnueabihf': 0.1.80 + '@napi-rs/canvas-linux-arm64-gnu': 0.1.80 + '@napi-rs/canvas-linux-arm64-musl': 0.1.80 + '@napi-rs/canvas-linux-riscv64-gnu': 0.1.80 + '@napi-rs/canvas-linux-x64-gnu': 0.1.80 + '@napi-rs/canvas-linux-x64-musl': 0.1.80 + '@napi-rs/canvas-win32-x64-msvc': 0.1.80 + optional: true + '@nodelib/fs.scandir@2.1.5': dependencies: '@nodelib/fs.stat': 2.0.5 @@ -4831,6 +5046,12 @@ snapshots: bignumber.js@9.3.1: {} + bl@4.1.0: + dependencies: + buffer: 5.7.1 + inherits: 2.0.4 + readable-stream: 3.6.2 + bluebird@3.4.7: {} body-parser@1.20.3: @@ -4898,6 +5119,11 @@ snapshots: buffer-from@1.1.2: {} + buffer@5.7.1: + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + bytes@3.1.2: {} call-bind-apply-helpers@1.0.2: @@ -4918,6 +5144,11 @@ snapshots: caniuse-lite@1.0.30001735: {} + canvas@3.2.0: + dependencies: + node-addon-api: 7.1.1 + prebuild-install: 7.1.3 + cfb@1.2.2: dependencies: adler-32: 1.3.1 @@ -4930,6 +5161,8 @@ snapshots: char-regex@1.0.2: {} + chownr@1.1.4: {} + ci-info@3.9.0: {} cjs-module-lexer@1.4.3: {} @@ -5024,8 +5257,14 @@ snapshots: decimal.js@10.6.0: {} + decompress-response@6.0.0: + dependencies: + mimic-response: 3.1.0 + dedent@1.6.0: {} + deep-extend@0.6.0: {} + deep-is@0.1.4: {} 
deepmerge@4.3.1: {} @@ -5040,6 +5279,8 @@ snapshots: destroy@1.2.0: {} + detect-libc@2.1.0: {} + detect-newline@3.1.0: {} diff-sequences@29.6.3: {} @@ -5084,6 +5325,10 @@ snapshots: encodeurl@2.0.0: {} + end-of-stream@1.4.5: + dependencies: + once: 1.4.0 + error-ex@1.3.2: dependencies: is-arrayish: 0.2.1 @@ -5238,6 +5483,8 @@ snapshots: exit@0.1.2: {} + expand-template@2.0.3: {} + expect@29.7.0: dependencies: '@jest/expect-utils': 29.7.0 @@ -5502,6 +5749,8 @@ snapshots: fresh@2.0.0: {} + fs-constants@1.0.0: {} + fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -5562,6 +5811,8 @@ snapshots: dependencies: resolve-pkg-maps: 1.0.0 + github-from-package@0.0.0: {} + glob-parent@5.1.2: dependencies: is-glob: 4.0.3 @@ -5672,6 +5923,8 @@ snapshots: dependencies: safer-buffer: 2.1.2 + ieee754@1.2.1: {} + ignore@5.3.2: {} immediate@3.0.6: {} @@ -5703,6 +5956,8 @@ snapshots: inherits@2.0.4: {} + ini@1.3.8: {} + ioredis@5.7.0: dependencies: '@ioredis/commands': 1.3.0 @@ -6292,6 +6547,8 @@ snapshots: mimic-fn@2.1.0: {} + mimic-response@3.1.0: {} + minimatch@3.1.2: dependencies: brace-expansion: 1.1.12 @@ -6302,6 +6559,8 @@ snapshots: minimist@1.2.8: {} + mkdirp-classic@0.5.3: {} + mnemonist@0.39.6: dependencies: obliterator: 2.0.5 @@ -6317,6 +6576,8 @@ snapshots: ms@2.1.3: {} + napi-build-utils@2.0.0: {} + natural-compare@1.4.0: {} negotiator@0.6.3: {} @@ -6325,6 +6586,12 @@ snapshots: neo-async@2.6.2: {} + node-abi@3.77.0: + dependencies: + semver: 7.7.2 + + node-addon-api@7.1.1: {} + node-domexception@1.0.0: {} node-fetch@2.7.0: @@ -6436,6 +6703,10 @@ snapshots: path-type@4.0.0: {} + pdfjs-dist@5.4.149: + optionalDependencies: + '@napi-rs/canvas': 0.1.80 + pend@1.2.0: {} pg-cloudflare@1.2.7: @@ -6515,6 +6786,21 @@ snapshots: dependencies: xtend: 4.0.2 + prebuild-install@7.1.3: + dependencies: + detect-libc: 2.1.0 + expand-template: 2.0.3 + github-from-package: 0.0.0 + minimist: 1.2.8 + mkdirp-classic: 0.5.3 + napi-build-utils: 2.0.0 + node-abi: 3.77.0 + pump: 3.0.3 + rc: 1.2.8 + 
simple-get: 4.0.1 + tar-fs: 2.1.4 + tunnel-agent: 0.6.0 + prelude-ls@1.2.1: {} pretty-format@29.7.0: @@ -6557,6 +6843,11 @@ snapshots: forwarded: 0.2.0 ipaddr.js: 1.9.1 + pump@3.0.3: + dependencies: + end-of-stream: 1.4.5 + once: 1.4.0 + punycode@2.3.1: {} pure-rand@6.1.0: {} @@ -6589,6 +6880,13 @@ snapshots: iconv-lite: 0.6.3 unpipe: 1.0.0 + rc@1.2.8: + dependencies: + deep-extend: 0.6.0 + ini: 1.3.8 + minimist: 1.2.8 + strip-json-comments: 2.0.1 + react-is@18.3.1: {} readable-stream@2.3.8: @@ -6601,6 +6899,12 @@ snapshots: string_decoder: 1.1.1 util-deprecate: 1.0.2 + readable-stream@3.6.2: + dependencies: + inherits: 2.0.4 + string_decoder: 1.1.1 + util-deprecate: 1.0.2 + real-require@0.2.0: {} redis-errors@1.2.0: {} @@ -6799,6 +7103,14 @@ snapshots: signal-exit@3.0.7: {} + simple-concat@1.0.1: {} + + simple-get@4.0.1: + dependencies: + decompress-response: 6.0.0 + once: 1.4.0 + simple-concat: 1.0.1 + sisteransi@1.0.5: {} slash@3.0.0: {} @@ -6853,6 +7165,8 @@ snapshots: strip-final-newline@2.0.0: {} + strip-json-comments@2.0.1: {} + strip-json-comments@3.1.1: {} supports-color@7.2.0: @@ -6865,6 +7179,21 @@ snapshots: supports-preserve-symlinks-flag@1.0.0: {} + tar-fs@2.1.4: + dependencies: + chownr: 1.1.4 + mkdirp-classic: 0.5.3 + pump: 3.0.3 + tar-stream: 2.2.0 + + tar-stream@2.2.0: + dependencies: + bl: 4.1.0 + end-of-stream: 1.4.5 + fs-constants: 1.0.0 + inherits: 2.0.4 + readable-stream: 3.6.2 + test-exclude@6.0.0: dependencies: '@istanbuljs/schema': 0.1.3 @@ -6922,6 +7251,10 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + tunnel-agent@0.6.0: + dependencies: + safe-buffer: 5.2.1 + tunnel@0.0.6: {} type-check@0.4.0: diff --git a/src/utils/attachments.ts b/src/utils/attachments.ts index c822f7c..250653e 100644 --- a/src/utils/attachments.ts +++ b/src/utils/attachments.ts @@ -17,7 +17,8 @@ const ALLOWED_IMAGE_MIME_TYPES = [ const ALLOWED_DOCUMENT_MIME_TYPES = [ 'text/plain', 'text/csv', 'application/json', 
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/pdf' ]; // Validation helpers diff --git a/src/utils/document-processor.ts b/src/utils/document-processor.ts index b1e2441..a13fa0e 100644 --- a/src/utils/document-processor.ts +++ b/src/utils/document-processor.ts @@ -3,6 +3,7 @@ import * as XLSX from 'xlsx'; import mammoth from 'mammoth'; import Papa from 'papaparse'; import yauzl from 'yauzl'; +import { extractPdfContent } from './pdf-parser.js'; const FETCH_TIMEOUT = 30000; const MAX_DOCUMENT_SIZE = 25 * 1024 * 1024; @@ -98,7 +99,7 @@ export async function extractDocumentContent(attachment: Attachment): Promise(); + +export interface PdfProcessingResult { + text_chunks: string[]; + image_chunks: string[]; + text_chunk_pos: number[]; + image_chunk_pos: number[]; +} + +export interface ProcessedPdfDocument { + content: string; + metadata?: { + pages?: number; + images?: number; + textChunks?: number; + imageChunks?: number; + }; +} + +class PdfProcessingError extends Error { + constructor(message: string, public readonly cause?: unknown) { + super(message); + this.name = 'PdfProcessingError'; + } +} + +/** + * Normalize text by handling Unicode and control characters + */ +export function normalizeText(input: string): string { + if (!input) return ''; + + let normalized = input.normalize('NFC'); + + // Strip control chars except newline/tab + normalized = normalized.replace(/[^\P{C}\n\t]/gu, ''); + + // Normalize whitespace + normalized = normalized.replace(/\u00A0/g, ' '); // nbsp → space + normalized = normalized.replace(/\u200B/g, ''); // zero-width space + normalized = normalized.replace(/\t+/g, ' '); // tabs → single space + + return normalized.trim(); +} + +/** + * Smart letter-spacing collapse for spaced letters like "N A S A" -> "NASA" + */ +function smartDespaceLine(line: 
string): string { + if (!line) return line; + + const parts = line.split(/(\s+)/); + const out: string[] = []; + + const isSingleAllowed = (s: string) => + s.length === 1 && /[\p{L}\p{N}'']/u.test(s); + + const isSingleLowerLetter = (s: string) => s.length === 1 && /\p{Ll}/u.test(s); + + let i = 0; + while (i < parts.length) { + const tok = parts[i]; + + if (!/\s+/.test(tok) && isSingleAllowed(tok)) { + const runTokens: string[] = [tok]; + let j = i + 1; + + while ( + j + 1 < parts.length && + parts[j] === ' ' && + !/\s+/.test(parts[j + 1]) && + isSingleAllowed(parts[j + 1]) + ) { + runTokens.push(parts[j + 1]); + j += 2; + } + + // Join spaced letters like "N A S A" -> "NASA" + if (runTokens.length >= 3) { + out.push(runTokens.join('')); + i = j; + continue; + } + + // Join two-letter lowercase sequences like "i s" -> "is" + if ( + runTokens.length === 2 && + isSingleLowerLetter(runTokens[0]) && + isSingleLowerLetter(runTokens[1]) + ) { + out.push(runTokens.join('')); + i = j; + continue; + } + } + + out.push(tok); + i += 1; + } + + return out.join(''); +} + +/** + * Clean text by fixing hyphenation, normalizing whitespace, and applying smart spacing + */ +export function cleanText(input: string): string { + let s = normalizeText(input); + + // Fix hyphenation across line breaks + s = s.replace(/(\p{L})-\n(\p{L})/gu, '$1$2'); + + // Trim spaces around newlines + s = s.replace(/[ \t]*\n[ \t]*/g, '\n'); + + // Turn intra-paragraph newlines into spaces, preserve paragraph breaks + const uniqueParaPlaceholder = `\uE000XYNE_PARA_BREAK_${Math.random().toString(36).substring(2)}\uE001`; + s = s.replace(/\n{2,}/g, uniqueParaPlaceholder); + s = s.replace(/\n+/g, ' '); + s = s.replace( + new RegExp( + uniqueParaPlaceholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), + 'g' + ), + '\n\n' + ); + + // Apply line-wise despacing + s = s + .split('\n') + .map((line) => smartDespaceLine(line)) + .join('\n'); + + // Remove spaces before punctuation + s = s.replace(/\s+([.,;:!?])/g, 
'$1'); + + // Cap extreme space runs, preserve 2–4 spaces + s = s.replace(/[ ]{5,}/g, ' '); + + // Trim lines & drop empties + s = s + .split('\n') + .map((l) => l.trim()) + .filter((l) => l.length > 0) + .join('\n'); + + return s.trim(); +} + +/** + * Chunk text by paragraphs with specified max length and overlap + */ +function chunkTextByParagraph(text: string, maxLength: number = 512, overlap: number = 128): string[] { + if (!text || text.length <= maxLength) { + return text ? [text] : []; + } + + const paragraphs = text.split('\n\n').filter(p => p.trim().length > 0); + const chunks: string[] = []; + let currentChunk = ''; + + for (const paragraph of paragraphs) { + const cleanParagraph = paragraph.trim(); + + if (!cleanParagraph) continue; + + // If paragraph alone exceeds maxLength, split it + if (cleanParagraph.length > maxLength) { + // Save current chunk if not empty + if (currentChunk.trim()) { + chunks.push(currentChunk.trim()); + currentChunk = ''; + } + + // Split large paragraph into sentences or by periods + const sentences = cleanParagraph.split(/(?<=[.!?])\s+/); + let sentenceChunk = ''; + + for (const sentence of sentences) { + if (sentenceChunk.length + sentence.length + 1 <= maxLength) { + sentenceChunk += (sentenceChunk ? ' ' : '') + sentence; + } else { + if (sentenceChunk) { + chunks.push(sentenceChunk); + // Add overlap from previous chunk + const overlapText = sentenceChunk.slice(-overlap); + sentenceChunk = overlapText + ' ' + sentence; + } else { + // Single sentence is too long, force split + if (sentence.length > maxLength) { + const words = sentence.split(' '); + let wordChunk = ''; + for (const word of words) { + if (wordChunk.length + word.length + 1 <= maxLength) { + wordChunk += (wordChunk ? 
' ' : '') + word; + } else { + if (wordChunk) chunks.push(wordChunk); + wordChunk = word; + } + } + if (wordChunk) sentenceChunk = wordChunk; + } else { + sentenceChunk = sentence; + } + } + } + } + + if (sentenceChunk.trim()) { + currentChunk = sentenceChunk; + } + } else { + // Normal paragraph that fits within maxLength + if (currentChunk.length + cleanParagraph.length + 2 <= maxLength) { + currentChunk += (currentChunk ? '\n\n' : '') + cleanParagraph; + } else { + // Save current chunk and start new one + if (currentChunk.trim()) { + chunks.push(currentChunk.trim()); + } + currentChunk = cleanParagraph; + } + } + } + + // Add final chunk + if (currentChunk.trim()) { + chunks.push(currentChunk.trim()); + } + + return chunks.length > 0 ? chunks : [text]; +} + +/** + * Multiply two 2D transformation matrices + */ +function multiplyMatrices( + m1: number[], + m2: number[] +): [number, number, number, number, number, number] { + const [a1, b1, c1, d1, e1, f1] = m1 as [number, number, number, number, number, number]; + const [a2, b2, c2, d2, e2, f2] = m2 as [number, number, number, number, number, number]; + return [ + a1 * a2 + c1 * b2, + b1 * a2 + d1 * b2, + a1 * c2 + c1 * d2, + b1 * c2 + d1 * d2, + a1 * e2 + c1 * f2 + e1, + b1 * e2 + d1 * f2 + f1, + ]; +} + +/** + * Process collected paragraphs into chunks + */ +function processTextParagraphs( + paragraphs: string[], + text_chunks: string[], + text_chunk_pos: number[], + globalSeq: { value: number }, + overlapBytes: number = 32 +): string { + if (paragraphs.length === 0) return ''; + + const cleanedParagraphs = paragraphs + .map(cleanText) + .filter((p) => p.length > 0); + + if (cleanedParagraphs.length === 0) return ''; + + const cleanedText = cleanedParagraphs.join('\n'); + const chunks = chunkTextByParagraph(cleanedText, 512, 128); + + for (const chunk of chunks) { + text_chunks.push(chunk); + text_chunk_pos.push(globalSeq.value); + globalSeq.value++; + } + + // Return overlap text for continuity across pages + 
/**
 * Placeholder image describer.
 *
 * Always resolves to the same fixed sentence; the image bytes are not
 * inspected.
 *
 * @param buffer - Image bytes (currently unused).
 * @returns A constant description string.
 */
async function describeImageWithLLM(buffer: Buffer): Promise<string> {
  // TODO: Implement image description using a vision-capable AI model.
  void buffer;
  return 'This is an image extracted from the PDF document.';
}

/**
 * Extract text (and optional image placeholders) from a PDF as ordered chunks.
 *
 * Each page's text items are grouped into lines and paragraphs, then handed to
 * `processTextParagraphs`, which appends chunks to `text_chunks` and records
 * their sequence numbers in `text_chunk_pos`. When `extractImages` is set, the
 * page's operator list is scanned and every image-paint operator consumes one
 * sequence number, optionally adding a `[[IMG#n]]` marker to `text_chunks`
 * and a placeholder description to `image_chunks`.
 *
 * @param data - Raw PDF bytes.
 * @param docid - Document id; accepted for interface compatibility and not
 *   read by this function.
 * @param extractImages - Scan operator lists for image-paint operators.
 * @param describeImages - Add a placeholder entry to `image_chunks` per image.
 * @param includeImageMarkersInText - Add a `[[IMG#n]]` marker to `text_chunks`
 *   per image.
 * @param pageOverlapBytes - Byte budget forwarded to `processTextParagraphs`
 *   for the tail of one page that is prepended to the next page's first
 *   paragraph. `0` (or any non-positive / non-finite value) disables the
 *   carry-over. Defaults to 32.
 * @returns The chunk arrays and their sequence positions.
 * @throws PdfProcessingError when the document cannot be loaded, with the
 *   message `PDF is password protected` for password failures.
 */
// NOTE(review): the generic argument of the return type was lost in SOURCE;
// the structural type below mirrors the returned object literal. Swap in the
// file's named result type if one exists.
export async function extractTextAndImagesWithChunksFromPDF(
  data: Uint8Array,
  docid: string = crypto.randomUUID(),
  extractImages: boolean = false,
  describeImages: boolean = true,
  includeImageMarkersInText: boolean = true,
  pageOverlapBytes: number = 32
): Promise<{
  text_chunks: string[];
  image_chunks: string[];
  text_chunk_pos: number[];
  image_chunk_pos: number[];
}> {
  void docid;

  // A NaN or negative budget would make the overlap loop never stop early,
  // so clamp it to a non-negative integer up front.
  const overlapBudget =
    Number.isFinite(pageOverlapBytes) && pageOverlapBytes > 0 ? Math.floor(pageOverlapBytes) : 0;

  const loadingTask = pdfjsLib.getDocument({
    data,
    verbosity: pdfjsLib.VerbosityLevel.ERRORS,
  });

  let pdfDocument: pdfjsLib.PDFDocumentProxy;
  try {
    pdfDocument = await loadingTask.promise;
  } catch (error: unknown) {
    // Read name/message defensively: a rejection value is not guaranteed to
    // be an Error, and calling .includes on undefined would mask the cause.
    const info = (typeof error === 'object' && error !== null ? error : {}) as {
      name?: unknown;
      message?: unknown;
    };
    const name = typeof info.name === 'string' ? info.name : '';
    const message = typeof info.message === 'string' ? info.message : String(error);

    if (message.includes('PasswordException') || name.includes('PasswordException')) {
      throw new PdfProcessingError('PDF is password protected');
    }
    throw new PdfProcessingError(`Failed to load PDF: ${message}`, error);
  }

  try {
    const text_chunks: string[] = [];
    const image_chunks: string[] = [];
    const text_chunk_pos: number[] = [];
    const image_chunk_pos: number[] = [];

    // Shared counter so text chunks and images are numbered in one sequence.
    const globalSeq = { value: 0 };
    let pageOverlap = '';

    // Build paragraphs from a page using the textContent API.
    const buildParagraphsFromPage = async (page: pdfjsLib.PDFPageProxy): Promise<string[]> => {
      const textContent = await page.getTextContent({
        includeMarkedContent: false,
        disableNormalization: false,
      });

      const lines: string[] = [];
      let current = '';
      let prevY: number | null = null;
      let prevH: number | null = null;

      const flushLine = (): void => {
        if (current.length > 0) lines.push(current);
        current = '';
      };

      const items: readonly unknown[] = textContent.items;
      for (const raw of items) {
        if (typeof raw !== 'object' || raw === null) continue;
        const item = raw as {
          str?: unknown;
          transform?: unknown;
          height?: unknown;
          hasEOL?: unknown;
        };

        const str = typeof item.str === 'string' ? item.str : '';
        const endsLine = Boolean(item.hasEOL);

        if (!str) {
          // An empty item can still carry the end-of-line flag.
          if (endsLine) flushLine();
          continue;
        }

        const y =
          Array.isArray(item.transform) && typeof item.transform[5] === 'number'
            ? (item.transform[5] as number)
            : null;
        const h = typeof item.height === 'number' ? item.height : null;

        // A vertical jump larger than 40% of the taller run (minimum 10
        // units) starts a new line before this item.
        if (prevY != null && y != null) {
          const tol = Math.max(prevH || 0, h || 0, 10) * 0.4;
          if (Math.abs(y - prevY) > tol) flushLine();
        }

        current += str;

        // hasEOL marks the item as *followed by* a line break, so the item
        // belongs to the current line and the break comes after it.
        if (endsLine) flushLine();

        prevY = y;
        prevH = h;
      }
      if (current.trim().length > 0) lines.push(current);

      // Group lines into paragraphs; whitespace-only lines act as separators.
      const paragraphs: string[] = [];
      let buf: string[] = [];
      const pushPara = (): void => {
        if (buf.length === 0) return;
        paragraphs.push(buf.join('\n'));
        buf = [];
      };

      for (const ln of lines) {
        if (ln.trim().length === 0) {
          pushPara();
        } else {
          buf.push(ln);
        }
      }
      pushPara();

      return paragraphs.filter((p) => p.trim().length > 0);
    };

    for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);

      try {
        const paragraphs = await buildParagraphsFromPage(page);

        if (extractImages) {
          const opList = await page.getOperatorList();
          let currentCTM: [number, number, number, number, number, number] = [1, 0, 0, 1, 0, 0];
          const ctmStack: [number, number, number, number, number, number][] = [];

          for (let i = 0; i < opList.fnArray.length; i++) {
            const fnId = opList.fnArray[i];
            const args: unknown = opList.argsArray[i];

            switch (fnId) {
              case pdfjsLib.OPS.transform:
                // Malformed operands are skipped rather than folded into the CTM.
                if (
                  Array.isArray(args) &&
                  args.length >= 6 &&
                  args.every((n: unknown) => typeof n === 'number')
                ) {
                  currentCTM = multiplyMatrices(currentCTM, args as number[]);
                }
                break;

              case pdfjsLib.OPS.save:
                ctmStack.push([...currentCTM]);
                break;

              case pdfjsLib.OPS.restore: {
                const restored = ctmStack.pop();
                if (restored) currentCTM = restored;
                break;
              }

              case pdfjsLib.OPS.paintImageXObject:
              case pdfjsLib.OPS.paintImageXObjectRepeat:
              case pdfjsLib.OPS.paintInlineImageXObject:
              case pdfjsLib.OPS.paintImageMaskXObject:
                // Image pixels are not extracted here; only a marker and a
                // placeholder description are recorded.
                if (includeImageMarkersInText) {
                  text_chunks.push(`[[IMG#${globalSeq.value}]]`);
                  text_chunk_pos.push(globalSeq.value);
                }

                if (describeImages) {
                  image_chunks.push('Image extracted from PDF page ' + pageNum);
                  image_chunk_pos.push(globalSeq.value);
                }

                globalSeq.value++;
                break;
            }
          }
        }

        // A page without text keeps the pending overlap for the next page
        // that has text, instead of emitting the overlap as its own chunk.
        if (paragraphs.length === 0) continue;

        if (pageOverlap) {
          paragraphs[0] = `${pageOverlap} ${paragraphs[0]}`;
        }

        pageOverlap = processTextParagraphs(
          paragraphs,
          text_chunks,
          text_chunk_pos,
          globalSeq,
          overlapBudget
        ).trim();
      } finally {
        page.cleanup();
      }
    }

    return {
      text_chunks,
      image_chunks,
      text_chunk_pos,
      image_chunk_pos,
    };
  } finally {
    await pdfDocument.destroy();
  }
}

/**
 * Extract PDF content for document processor integration.
 *
 * Runs the chunked extractor with image handling and page overlap disabled,
 * then joins the text chunks with blank lines into a single string.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The trimmed text plus chunk counts; `pages` is not populated
 *   because the extractor does not report a page count.
 * @throws PdfProcessingError wrapping any failure raised during extraction.
 */
// NOTE(review): the generic argument of the return type was lost in SOURCE;
// the structural type below mirrors the returned object literal.
export async function extractPdfContent(buffer: Buffer): Promise<{
  content: string;
  metadata: {
    pages: number | undefined;
    textChunks: number;
    imageChunks: number;
  };
}> {
  try {
    const uint8Data = new Uint8Array(buffer);
    const result = await extractTextAndImagesWithChunksFromPDF(
      uint8Data,
      crypto.randomUUID(),
      false, // Don't extract images for basic content extraction
      false, // Don't describe images
      false, // Don't include image markers
      0 // Chunks are concatenated below, so a page overlap would repeat text
    );

    const content = result.text_chunks.join('\n\n');

    return {
      content: content.trim(),
      metadata: {
        pages: undefined,
        textChunks: result.text_chunks.length,
        imageChunks: result.image_chunks.length,
      },
    };
  } catch (error: unknown) {
    throw new PdfProcessingError(
      `Failed to extract PDF content: ${error instanceof Error ? error.message : 'Unknown error'}`,
      error
    );
  }
}
// Mock for pdfjs-dist to avoid Jest import issues.
// Only the members the PDF parser touches are provided.

/** Verbosity constants; each level has its own value so they stay distinguishable. */
export const VerbosityLevel = {
  ERRORS: 0,
  WARNINGS: 1,
  INFOS: 5,
};

export const ImageKind = {
  GRAYSCALE_1BPP: 1,
  RGB_24BPP: 2,
  RGBA_32BPP: 3,
};

// Operator ids consumed via `switch` on their names. The values are pairwise
// distinct; NOTE(review): they are not verified against the real pdf.js table.
export const OPS = {
  save: 10,
  restore: 11,
  transform: 12,
  showText: 92,
  showSpacedText: 93,
  nextLine: 91,
  nextLineShowText: 94,
  nextLineSetSpacingShowText: 95,
  setTextMatrix: 79,
  moveText: 88,
  paintImageXObject: 39,
  paintInlineImageXObject: 40,
  paintImageMaskXObject: 41,
  paintImageXObjectRepeat: 43,
  stroke: 20,
  closeStroke: 21,
  fill: 22,
  eoFill: 23,
  fillStroke: 24,
  eoFillStroke: 25,
  closeFillStroke: 26,
  closeEOFillStroke: 27,
  clip: 29,
  eoClip: 30,
  rectangle: 19,
  shadingFill: 59,
  rawFillPath: 122,
  paintFormXObjectBegin: 31,
  paintFormXObjectEnd: 32,
  constructPath: 46,
};

/** Minimal document surface: page count, page access and teardown. */
export interface PDFDocumentProxy {
  numPages: number;
  getPage(pageNumber: number): Promise<PDFPageProxy>;
  destroy(): Promise<void>;
}

/** Minimal page surface: text content, operator list and cleanup. */
// NOTE(review): the Promise type arguments were lost in SOURCE; `any` is used
// for the two payloads to stay permissive, matching the `params?: any` style.
export interface PDFPageProxy {
  pageNumber: number;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  getTextContent(params?: any): Promise<any>;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  getOperatorList(): Promise<any>;
  cleanup(): void;
}

/** Every load attempt yields a task whose promise rejects. */
export const getDocument = jest.fn().mockImplementation(() => ({
  promise: Promise.reject(new Error('PDF parsing not available in test environment')),
}));

export default {
  VerbosityLevel,
  ImageKind,
  OPS,
  getDocument,
};
newline at end of file From feaa75d766f22c487d3cb677bb2d1588f252dfa3 Mon Sep 17 00:00:00 2001 From: harshpreet931 Date: Wed, 17 Sep 2025 14:38:04 +0530 Subject: [PATCH 3/3] fix: improve PDF parser based on code review feedback - Add proper documentation for WASM path configuration - Fix pages metadata to return actual page count from PDF document - Improve placeholder image description function documentation - Address Copilot code review suggestions for better clarity --- src/utils/pdf-parser.ts | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/utils/pdf-parser.ts b/src/utils/pdf-parser.ts index 769b377..22a95ba 100644 --- a/src/utils/pdf-parser.ts +++ b/src/utils/pdf-parser.ts @@ -10,7 +10,10 @@ const MIN_IMAGE_DIM_PX = parseInt(process.env.MIN_IMAGE_DIM_PX || '150', 10); const MAX_IMAGE_FILE_SIZE_MB = 25; const SUPPORTED_IMAGE_TYPES = new Set(['image/png', 'image/jpeg', 'image/jpg', 'image/gif', 'image/webp']); -// PDF.js setup - use empty paths since we're using legacy build +// PDF.js setup: +// The legacy build of pdfjs-dist (imported from 'pdfjs-dist/legacy/build/pdf.mjs') does not use WASM modules for openjpeg or qcms, +// so it is safe to set these WASM paths to empty strings. If you switch to a non-legacy build, you must provide valid WASM paths, +// or PDF.js may fail to load certain image types (e.g., JPEG2000 or color-managed images). const openjpegWasmPath = ''; const qcmsWasmPath = ''; @@ -315,10 +318,9 @@ function processTextParagraphs( * In a real implementation, this would call an AI service */ async function describeImageWithLLM(buffer: Buffer): Promise { - // This is a placeholder. In your actual implementation, you would: - // 1. Send the image to a vision-capable AI model - // 2. Get a description back - // For now, return a basic description + // TODO: Implement image description using a vision-capable AI model. 
+ // This is a placeholder function that should be implemented with actual AI vision capabilities. + // For now, return a basic description to avoid breaking existing functionality. return 'This is an image extracted from the PDF document.'; } @@ -518,6 +520,17 @@ export async function extractTextAndImagesWithChunksFromPDF( export async function extractPdfContent(buffer: Buffer): Promise { try { const uint8Data = new Uint8Array(buffer); + + // Get page count first + const loadingTask = pdfjsLib.getDocument({ + data: uint8Data, + verbosity: pdfjsLib.VerbosityLevel.ERRORS, + }); + const pdfDocument = await loadingTask.promise; + const pageCount = pdfDocument.numPages; + await pdfDocument.destroy(); + + // Now extract content const result = await extractTextAndImagesWithChunksFromPDF( uint8Data, crypto.randomUUID(), @@ -532,7 +545,7 @@ export async function extractPdfContent(buffer: Buffer): Promise