From 007f8123cfe204783041acaf11996d06b6f4fa68 Mon Sep 17 00:00:00 2001 From: umerkay Date: Fri, 1 May 2026 12:41:42 +0500 Subject: [PATCH] feat(research): add Olostep web research endpoints --- backend/.env.example | 3 + backend/package-lock.json | 13 +++ backend/package.json | 3 + backend/src/index.ts | 2 + backend/src/lib/olostep.ts | 189 +++++++++++++++++++++++++++++++++ backend/src/routes/research.ts | 166 +++++++++++++++++++++++++++++ 6 files changed, 376 insertions(+) create mode 100644 backend/src/lib/olostep.ts create mode 100644 backend/src/routes/research.ts diff --git a/backend/.env.example b/backend/.env.example index 1db370a9..1d07f1e8 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -12,3 +12,6 @@ GEMINI_API_KEY=your-gemini-key ANTHROPIC_API_KEY=your-anthropic-key OPENROUTER_API_KEY=your-openrouter-key RESEND_API_KEY=your-resend-key + +# Olostep (optional — enables web research feature) +OLOSTEP_API_KEY= diff --git a/backend/package-lock.json b/backend/package-lock.json index 86f82382..1997d400 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -24,6 +24,7 @@ "libreoffice-convert": "^1.6.0", "mammoth": "^1.9.0", "multer": "^1.4.5-lts.2", + "olostep": "^1.0.0", "pdfjs-dist": "^4.10.38", "resend": "^4.5.1" }, @@ -4123,6 +4124,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/olostep": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/olostep/-/olostep-1.0.0.tgz", + "integrity": "sha512-ENxoOGbsJ8N87rboZI5SryF60pqz6yDCcsOKW6OqGOKgPehl/dmw4sW+UiepiBO+VPsOz+GYddAEmnInsKCsjg==", + "license": "MIT", + "dependencies": { + "node-fetch": "^3.3.2" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/on-finished": { "version": "2.4.1", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", diff --git a/backend/package.json b/backend/package.json index 50dfb585..faed22c1 100644 --- a/backend/package.json +++ b/backend/package.json @@ -27,6 +27,9 @@ 
"pdfjs-dist": "^4.10.38", "resend": "^4.5.1" }, + "optionalDependencies": { + "olostep": "^1.0.0" + }, "devDependencies": { "@types/cors": "^2.8.17", "@types/express": "^4.17.21", diff --git a/backend/src/index.ts b/backend/src/index.ts index 0e99fffb..adf719e2 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -9,6 +9,7 @@ import { tabularRouter } from "./routes/tabular"; import { workflowsRouter } from "./routes/workflows"; import { userRouter } from "./routes/user"; import { downloadsRouter } from "./routes/downloads"; +import { researchRouter } from "./routes/research"; const app = express(); const PORT = process.env.PORT ?? 3001; @@ -31,6 +32,7 @@ app.use("/workflows", workflowsRouter); app.use("/user", userRouter); app.use("/users", userRouter); app.use("/download", downloadsRouter); +app.use("/api/research", researchRouter); app.get("/health", (_req, res) => res.json({ ok: true })); diff --git a/backend/src/lib/olostep.ts b/backend/src/lib/olostep.ts new file mode 100644 index 00000000..36922a85 --- /dev/null +++ b/backend/src/lib/olostep.ts @@ -0,0 +1,189 @@ +type OlostepModule = typeof import("olostep"); + +let cachedModule: OlostepModule | null | undefined; + +async function loadOlostep(): Promise<OlostepModule | null> { + if (cachedModule !== undefined) return cachedModule; + try { + cachedModule = await import("olostep"); + return cachedModule; + } catch { + cachedModule = null; + return null; + } +} + +const apiKey = process.env.OLOSTEP_API_KEY ?? ""; + +/** + * Fetches clean Markdown content from a URL using the Olostep scrape API. + * Returns the markdown string on success, or a string starting with "Error:" on failure. + */ +export async function scrapeUrl(url: string): Promise<string> { + const olostepModule = await loadOlostep(); + if (!olostepModule) { + return "Error: Olostep is not installed. 
Install it with: npm install olostep"; + } + + if (!apiKey) { + return "Error: OLOSTEP_API_KEY is not configured in environment."; + } + + try { + const { default: Olostep, Format } = olostepModule; + const client = new Olostep({ apiKey }); + const result = await client.scrapes.create({ + url, + formats: [Format.MARKDOWN], + removeImages: true, + }); + + return ( + result.markdown_content ?? "Error: Olostep returned no markdown content." + ); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return `Error: ${msg}`; + } +} + +/** + * Fetches clean HTML content from a URL using the Olostep scrape API. + * Returns the html string on success, or a string starting with "Error:" on failure. + */ +export async function scrapeUrlHtml(url: string): Promise<string> { + const olostepModule = await loadOlostep(); + if (!olostepModule) { + return "Error: Olostep is not installed. Install it with: npm install olostep"; + } + + if (!apiKey) { + return "Error: OLOSTEP_API_KEY is not configured in environment."; + } + + try { + const { default: Olostep, Format } = olostepModule; + const client = new Olostep({ apiKey }); + const result = await client.scrapes.create({ + url, + formats: [Format.HTML], + }); + + return result.html_content ?? "Error: Olostep returned no HTML content."; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return `Error: ${msg}`; + } +} + +/** + * Uses Olostep's AI-powered answers endpoint to search and extract information from a URL. + * Returns a structured answer with sources on success, or an object with an `error` message on failure. + */ +export async function answerQuestion( + url: string, + question: string, +): Promise<{ answer: string; sources: string[] } | { error: string }> { + const olostepModule = await loadOlostep(); + if (!olostepModule) { + return { + error: "Olostep is not installed. 
Install it with: npm install olostep", + }; + } + + if (!apiKey) { + return { error: "OLOSTEP_API_KEY is not configured in environment." }; + } + + try { + const { default: Olostep } = olostepModule; + const client = new Olostep({ apiKey }); + const task = `${question}\n\nContext URL: ${url}`; + const result = await client.answers.create(task); + + return { + answer: result.answer ?? "", + sources: result.sources ?? [], + }; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return { error: `Error: ${msg}` }; + } +} + +/** + * Crawls a website starting from a URL, following links up to maxPages and maxDepth. + * Returns crawl job ID for async tracking. + */ +export async function crawlWebsite( + url: string, + maxPages: number = 50, + maxDepth: number = 2, +): Promise<{ crawlId: string } | { error: string }> { + const olostepModule = await loadOlostep(); + if (!olostepModule) { + return { + error: "Olostep is not installed. Install it with: npm install olostep", + }; + } + + if (!apiKey) { + return { error: "OLOSTEP_API_KEY is not configured in environment." }; + } + + try { + const { default: Olostep } = olostepModule; + const client = new Olostep({ apiKey }); + const crawl = await client.crawls.create({ + url, + maxPages, + maxDepth, + }); + + return { crawlId: crawl.id }; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return { error: `Error: ${msg}` }; + } +} + +/** + * Maps/indexes URLs from a website using Olostep's site mapping API. + * Returns up to topN most relevant URLs based on optional search query. + */ +export async function mapSiteUrls( + url: string, + topN: number = 50, + searchQuery?: string, +): Promise<string[] | { error: string }> { + const olostepModule = await loadOlostep(); + if (!olostepModule) { + return { + error: "Olostep is not installed. Install it with: npm install olostep", + }; + } + + if (!apiKey) { + return { error: "OLOSTEP_API_KEY is not configured in environment." 
}; + } + + try { + const { default: Olostep } = olostepModule; + const client = new Olostep({ apiKey }); + const map = await client.maps.create({ + url, + topN, + searchQuery, + includeSubdomain: true, + }); + + const urls: string[] = []; + for await (const pageUrl of map.urls()) { + urls.push(pageUrl); + } + return urls; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return { error: `Error: ${msg}` }; + } +} diff --git a/backend/src/routes/research.ts b/backend/src/routes/research.ts new file mode 100644 index 00000000..9bafd18e --- /dev/null +++ b/backend/src/routes/research.ts @@ -0,0 +1,166 @@ +import { Router } from "express"; +import { requireAuth } from "../middleware/auth"; +import { + scrapeUrl, + scrapeUrlHtml, + answerQuestion, + crawlWebsite, + mapSiteUrls, +} from "../lib/olostep"; + +export const researchRouter = Router(); + +// POST /api/research/scrape +// Fetch clean Markdown content from a URL +researchRouter.post("/scrape", requireAuth, async (req, res) => { + const { url } = req.body as { url?: unknown }; + + if (typeof url !== "string" || !url.trim()) { + return void res.status(400).json({ error: "url is required" }); + } + + const normalizedUrl = url.trim(); + if ( + !normalizedUrl.startsWith("http://") && + !normalizedUrl.startsWith("https://") + ) { + return void res + .status(400) + .json({ error: "url must start with http:// or https://" }); + } + + try { + const content = await scrapeUrl(normalizedUrl); + + if (content.startsWith("Error:")) { + return void res.status(500).json({ error: content }); + } + + return void res.status(200).json({ content }); + } catch (err: unknown) { + const message = err instanceof Error ? 
err.message : String(err); + return void res.status(500).json({ error: message }); + } +}); + +// POST /api/research/search +// Ask AI-powered questions about a URL's content +researchRouter.post("/search", requireAuth, async (req, res) => { + const { url, question } = req.body as { + url?: unknown; + question?: unknown; + }; + + if (typeof url !== "string" || !url.trim()) { + return void res.status(400).json({ error: "url is required" }); + } + + if (typeof question !== "string" || !question.trim()) { + return void res.status(400).json({ error: "question is required" }); + } + + const normalizedUrl = url.trim(); + if ( + !normalizedUrl.startsWith("http://") && + !normalizedUrl.startsWith("https://") + ) { + return void res + .status(400) + .json({ error: "url must start with http:// or https://" }); + } + + try { + const result = await answerQuestion(normalizedUrl, question.trim()); + + if ("error" in result) { + return void res.status(500).json({ error: result.error }); + } + + return void res.status(200).json(result); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + return void res.status(500).json({ error: message }); + } +}); + +// POST /api/research/crawl +// Start crawling a website +researchRouter.post("/crawl", requireAuth, async (req, res) => { + const { url, maxPages, maxDepth } = req.body as { + url?: unknown; + maxPages?: unknown; + maxDepth?: unknown; + }; + + if (typeof url !== "string" || !url.trim()) { + return void res.status(400).json({ error: "url is required" }); + } + + const normalizedUrl = url.trim(); + if ( + !normalizedUrl.startsWith("http://") && + !normalizedUrl.startsWith("https://") + ) { + return void res + .status(400) + .json({ error: "url must start with http:// or https://" }); + } + + const pages = + typeof maxPages === "number" && maxPages > 0 ? maxPages : 50; + const depth = + typeof maxDepth === "number" && maxDepth > 0 ? 
maxDepth : 2; + + try { + const result = await crawlWebsite(normalizedUrl, pages, depth); + + if ("error" in result) { + return void res.status(500).json({ error: result.error }); + } + + return void res.status(200).json(result); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + return void res.status(500).json({ error: message }); + } +}); + +// POST /api/research/map +// Map/index URLs on a website +researchRouter.post("/map", requireAuth, async (req, res) => { + const { url, topN, searchQuery } = req.body as { + url?: unknown; + topN?: unknown; + searchQuery?: unknown; + }; + + if (typeof url !== "string" || !url.trim()) { + return void res.status(400).json({ error: "url is required" }); + } + + const normalizedUrl = url.trim(); + if ( + !normalizedUrl.startsWith("http://") && + !normalizedUrl.startsWith("https://") + ) { + return void res + .status(400) + .json({ error: "url must start with http:// or https://" }); + } + + const n = typeof topN === "number" && topN > 0 ? topN : 50; + const query = typeof searchQuery === "string" ? searchQuery : undefined; + + try { + const result = await mapSiteUrls(normalizedUrl, n, query); + + if ("error" in result) { + return void res.status(500).json({ error: result.error }); + } + + return void res.status(200).json({ urls: result }); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : String(err); + return void res.status(500).json({ error: message }); + } +});