From 4dba651237af7f214cf35d296d4b19b78007c8bf Mon Sep 17 00:00:00 2001 From: "novgorodcev.k" Date: Sun, 29 Mar 2026 20:06:41 +0300 Subject: [PATCH 1/2] front --- .env.example | 11 ++ .gitattributes | 6 ++ .gitignore | 6 +- Dockerfile | 33 ++++++ docker-compose.yml | 43 ++++++++ front/app/app.meta.tree | 1 + front/app/app.view.css.ts | 42 ++++++++ front/app/app.view.tree | 94 ++++++++++++++++ front/app/app.view.ts | 149 +++++++++++++++++++++++++ front/app/index.html | 14 +++ server/main.py | 221 ++++++++++++++++++++++++++++++++++++++ server/requirements.txt | 4 + 12 files changed, 623 insertions(+), 1 deletion(-) create mode 100644 .env.example create mode 100644 .gitattributes create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 front/app/app.meta.tree create mode 100644 front/app/app.view.css.ts create mode 100644 front/app/app.view.tree create mode 100644 front/app/app.view.ts create mode 100644 front/app/index.html create mode 100644 server/main.py create mode 100644 server/requirements.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..117d133 --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +LLM_API_KEY=github_pat_... +LLM_BASE_URL=https://models.github.ai/inference +LLM_MODEL=openai/gpt-4.1-mini +LLM_RPM=60 + +EMBEDDER_API_KEY=github_pat_... 
+EMBEDDER_BASE_URL=https://models.github.ai/inference +EMBEDDER_MODEL=text-embedding-3-large +EMBEDDER_DIM=3072 + +RAGU_STORAGE=ragu_data diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a2da00f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +* -text +*.woff binary +*.woff2 binary +*.ttf binary +*.eot binary +*.otf binary diff --git a/.gitignore b/.gitignore index c3e69b5..d7c3f2f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# $mol +-* +.DS_Store + checkpoints/ benchmark/*.json ragu_working_dir/ @@ -175,4 +179,4 @@ cython_debug/ .ruff_cache/ # PyPI configuration file -.pypirc \ No newline at end of file +.pypirc diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c7b74e0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +# --- Frontend --- +FROM node:20-alpine AS frontend + +RUN apk add --no-cache git + +WORKDIR /app +RUN git clone --depth 1 https://github.com/hyoo-ru/mam.git . \ + && npm install + +COPY front/ bog/RAGU/front/ + +RUN npx mam bog/RAGU/front/app + +EXPOSE 9080 + +CMD ["npm", "start"] + + +# --- API --- +FROM python:3.12-slim AS api + +WORKDIR /app + +COPY pyproject.toml ./ +COPY ragu/ ./ragu/ +RUN pip install --no-cache-dir . + +COPY server/ ./server/ +RUN pip install --no-cache-dir -r server/requirements.txt + +EXPOSE 8000 + +CMD ["uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3b18626 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,43 @@ +services: + web: + build: + context: . + dockerfile: Dockerfile + target: frontend + ports: + - "9081:9080" + restart: unless-stopped + tty: true + stdin_open: true + develop: + watch: + - action: sync + path: ./front + target: /app/bog/RAGU/front + + # http://localhost:9081/bog/RAGU/front/app/-/test.html + + api: + platform: linux/arm64 + build: + context: . 
+ dockerfile: Dockerfile + target: api + ports: + - "8100:8000" + restart: unless-stopped + env_file: .env + environment: + - NUMBA_CPU_NAME=generic + volumes: + - ragu_data:/app/ragu_data + develop: + watch: + - action: sync+restart + path: ./server + target: /app/server + + # http://localhost:8100/api/status + +volumes: + ragu_data: diff --git a/front/app/app.meta.tree b/front/app/app.meta.tree new file mode 100644 index 0000000..49489eb --- /dev/null +++ b/front/app/app.meta.tree @@ -0,0 +1 @@ +include \/mol/offline/install diff --git a/front/app/app.view.css.ts b/front/app/app.view.css.ts new file mode 100644 index 0000000..712a8f9 --- /dev/null +++ b/front/app/app.view.css.ts @@ -0,0 +1,42 @@ +namespace $.$$ { + + $mol_style_define( $bog_RAGU_front_app, { + + Documents: { + flex: { + basis: '30rem', + grow: 1, + }, + Body: { + flex: { + grow: 1, + }, + }, + }, + + Doc_text: { + flex: { + grow: 1, + }, + minHeight: '20rem', + }, + + Doc_file: { + alignItems: 'center', + gap: '.5rem', + }, + + Index_record: { + alignItems: 'center', + gap: '.5rem', + }, + + Settings_page: { + flex: { + basis: '25rem', + }, + }, + + } ) + +} diff --git a/front/app/app.view.tree b/front/app/app.view.tree new file mode 100644 index 0000000..38f13d3 --- /dev/null +++ b/front/app/app.view.tree @@ -0,0 +1,94 @@ +$bog_RAGU_front_app $giper_bot + dialog_title @ \RAGU + api_url \http://localhost:8100 + history? / + doc_text? \ + doc_files? / + index_message? \ + index_records? / + config_message? \ + llm_api_key? \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 + llm_base_url? \https://models.github.ai/inference + llm_model? \openai/gpt-4.1-mini + llm_rpm? \60 + embedder_api_key? \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 + embedder_base_url? \https://models.github.ai/inference + embedder_model? \text-embedding-3-large + embedder_dim? 
\3072 + Doc_file* $mol_row + sub / + <= Doc_file_icon* $mol_icon_file + <= Doc_file_name* $mol_view + sub / + <= doc_file_name* \ + <= Doc_file_remove* $mol_button_minor + click? <=> doc_file_remove*? null + sub / + <= Doc_file_remove_icon* $mol_icon_close + Index_record* $mol_row + sub / + <= Index_record_icon* $mol_icon_check + <= Index_record_info* $mol_view + sub / + <= index_record_text* \ + Documents $mol_page + title @ \Documents + body / + <= Doc_text $mol_textarea + hint @ \Paste text to build knowledge graph... + value? <=> doc_text? + <= Doc_open $mol_button_open + title @ \Add Files + accept \.txt,.md,.csv,.json,.xml,.html,.docx + files? <=> doc_files_add? null + <= Doc_file_list $mol_list + rows <= doc_file_rows / + <= Index_submit $mol_button_major + title @ \Build Knowledge Graph + click? <=> index_submit? null + <= Index_message $mol_text + text <= index_message? + <= Index_record_list $mol_list + rows <= index_record_rows / + Settings_page $mol_page + title @ \Settings + body / + <= Llm_api_key $mol_form_field + name @ \LLM API Key + Content <= Llm_api_key_input $mol_string + hint \sk-... + value? <=> llm_api_key? + <= Llm_base_url $mol_form_field + name @ \LLM Base URL + Content <= Llm_base_url_input $mol_string + value? <=> llm_base_url? + <= Llm_model $mol_form_field + name @ \LLM Model + Content <= Llm_model_input $mol_string + value? <=> llm_model? + <= Llm_rpm $mol_form_field + name @ \LLM RPM + Content <= Llm_rpm_input $mol_string + value? <=> llm_rpm? + <= Embedder_api_key $mol_form_field + name @ \Embedder API Key + Content <= Embedder_api_key_input $mol_string + hint \sk-... + value? <=> embedder_api_key? + <= Embedder_base_url $mol_form_field + name @ \Embedder Base URL + Content <= Embedder_base_url_input $mol_string + value? <=> embedder_base_url? + <= Embedder_model $mol_form_field + name @ \Embedder Model + Content <= Embedder_model_input $mol_string + value? <=> embedder_model? 
+ <= Embedder_dim $mol_form_field + name @ \Embedder Dim + Content <= Embedder_dim_input $mol_string + value? <=> embedder_dim? + <= Config_save $mol_button_major + title @ \Save + click? <=> config_save? null + <= Config_message $mol_text + text <= config_message? diff --git a/front/app/app.view.ts b/front/app/app.view.ts new file mode 100644 index 0000000..c413099 --- /dev/null +++ b/front/app/app.view.ts @@ -0,0 +1,149 @@ +namespace $.$$ { + + type Request = { + message: string + files: string[] + } + + type IndexRecord = { + count: number + names: string[] + } + + export class $bog_RAGU_front_app extends $.$bog_RAGU_front_app { + + @ $mol_mem + config_synced() { + this.push_config() + return true + } + + @ $mol_mem + override pages() { + this.config_synced() + return [ + this.Settings_page(), + this.Documents(), + this.Dialog(), + ... this.result() ? [ this.Result_page( this.version() ) ] : [], + ] + } + + @ $mol_action + override doc_files_add( next: readonly File[] ) { + if( !next?.length ) return + this.doc_files([ ... this.doc_files(), ... next ]) + } + + override doc_file_rows() { + return this.doc_files().map( ( _, i ) => this.Doc_file( i ) ) + } + + override doc_file_name( index: number ) { + return ( this.doc_files()[ index ] as File ).name + } + + @ $mol_action + override doc_file_remove( index: number ) { + const files = [ ... 
this.doc_files() ] + files.splice( index, 1 ) + this.doc_files( files ) + } + + override index_record_rows() { + return this.index_records().map( ( _, i ) => this.Index_record( i ) ) + } + + override index_record_text( index: number ) { + const rec = this.index_records()[ index ] as IndexRecord + return `${ rec.count } doc(s): ${ rec.names.join( ', ' ) }` + } + + @ $mol_mem + override communication() { + + const history = this.history() + if( history.length % 2 === 0 ) return + + const last = history[ history.length - 1 ] as Request + + try { + const resp = $mol_fetch.json( + this.api_url() + '/api/query', + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: last.message }), + }, + ) + this.history([ ... history, resp ]) + } catch( error: any ) { + if( $mol_promise_like( error ) ) $mol_fail_hidden( error ) + if( $mol_fail_log( error ) ) { + this.history([ ... history, { message: '\u{1F6D1}' + error.message, files: [] } ]) + } + } + + } + + @ $mol_action + override index_submit() { + const text = this.doc_text() + const files = this.doc_files() as File[] + + if( !text && !files.length ) return + + const form = new FormData() + if( text ) form.append( 'text', text ) + for( const file of files ) { + form.append( 'files', file ) + } + + const resp = $mol_fetch.json( + this.api_url() + '/api/index', + { + method: 'POST', + body: form, + }, + ) as { status: string; documents_count: number; names: string[]; total_documents: number } + + this.index_records([ + ... this.index_records(), + { count: resp.documents_count, names: resp.names } as IndexRecord, + ]) + + this.index_message( `Indexed ${ resp.documents_count } doc(s). 
Total: ${ resp.total_documents }` ) + this.doc_text( '' ) + this.doc_files( [] ) + } + + @ $mol_action + push_config() { + $mol_fetch.json( + this.api_url() + '/api/config', + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ env: { + LLM_API_KEY: this.llm_api_key(), + LLM_BASE_URL: this.llm_base_url(), + LLM_MODEL: this.llm_model(), + LLM_RPM: this.llm_rpm(), + EMBEDDER_API_KEY: this.embedder_api_key(), + EMBEDDER_BASE_URL: this.embedder_base_url(), + EMBEDDER_MODEL: this.embedder_model(), + EMBEDDER_DIM: this.embedder_dim(), + } }), + }, + ) + } + + @ $mol_action + override config_save() { + this.push_config() + this.config_message( 'Saved' ) + } + + } + +} diff --git a/front/app/index.html b/front/app/index.html new file mode 100644 index 0000000..26dd154 --- /dev/null +++ b/front/app/index.html @@ -0,0 +1,14 @@ + + + + + RAGU + + + + + +
+ + + diff --git a/server/main.py b/server/main.py new file mode 100644 index 0000000..5a00fa3 --- /dev/null +++ b/server/main.py @@ -0,0 +1,221 @@ +import io +import os +from contextlib import asynccontextmanager + +from fastapi import FastAPI, UploadFile, File, Form +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from pydantic import BaseModel + +from ragu import ( + KnowledgeGraph, + SimpleChunker, + BuilderArguments, + LocalSearchEngine, + GlobalSearchEngine, + NaiveSearchEngine, + Settings, +) +from ragu.llm import OpenAIClient +from ragu.embedder import OpenAIEmbedder +from ragu.triplet import ArtifactsExtractorLLM + + +ENV_KEYS = [ + "LLM_API_KEY", "LLM_BASE_URL", "LLM_MODEL", "LLM_RPM", + "EMBEDDER_API_KEY", "EMBEDDER_BASE_URL", "EMBEDDER_MODEL", "EMBEDDER_DIM", + "RAGU_STORAGE", +] + + +class State: + kg: KnowledgeGraph | None = None + client: OpenAIClient | None = None + embedder: OpenAIEmbedder | None = None + indexed: bool = False + all_documents: list[str] = [] + all_names: list[str] = [] + + +state = State() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + reinit_clients() + yield + if state.client: + await state.client.async_close() + if state.embedder: + await state.embedder.aclose() + + +app = FastAPI(title="RAGU API", lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + + +class QueryRequest(BaseModel): + query: str + engine: str = "local" + top_k: int = 20 + + +class BotResponse(BaseModel): + message: str + files: list[str] = [] + document: str | None = None + confidence: float = 0.0 + digest: str = "" + title: str = "" + + +class ConfigUpdate(BaseModel): + env: dict[str, str] + + +def reinit_clients(): + state.client = OpenAIClient( + model_name=os.getenv("LLM_MODEL", ""), + base_url=os.getenv("LLM_BASE_URL", ""), + api_token=os.getenv("LLM_API_KEY", ""), + 
max_requests_per_minute=int(os.getenv("LLM_RPM", "60")), + ) + state.embedder = OpenAIEmbedder( + model_name=os.getenv("EMBEDDER_MODEL", ""), + base_url=os.getenv("EMBEDDER_BASE_URL", os.getenv("LLM_BASE_URL", "")), + api_token=os.getenv("EMBEDDER_API_KEY", os.getenv("LLM_API_KEY", "")), + dim=int(os.getenv("EMBEDDER_DIM", "3072")), + ) + + +@app.get("/api/status") +async def get_status(): + return {"indexed": state.indexed} + + +@app.get("/api/config") +async def get_config(): + return {k: os.getenv(k, "") for k in ENV_KEYS} + + +@app.post("/api/config") +async def set_config(req: ConfigUpdate): + for k, v in req.env.items(): + if k in ENV_KEYS: + os.environ[k] = v + reinit_clients() + return {k: os.getenv(k, "") for k in ENV_KEYS} + + +async def _read_file(upload: UploadFile) -> str: + content = await upload.read() + ext = (upload.filename or "").rsplit(".", 1)[-1].lower() + + if ext == "docx": + from docx import Document as DocxDocument + doc = DocxDocument(io.BytesIO(content)) + return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) + + return content.decode("utf-8") + + +@app.post("/api/index") +async def index_documents( + files: list[UploadFile] = File(default=[]), + text: str = Form(default=""), + language: str = Form(default="russian"), +): + documents: list[str] = [] + names: list[str] = [] + + if text.strip(): + documents.append(text.strip()) + names.append("(text)") + + for f in files: + doc_text = await _read_file(f) + if doc_text.strip(): + documents.append(doc_text) + names.append(f.filename or "unknown") + + if not documents: + return {"status": "empty", "documents_count": 0, "names": []} + + state.all_documents.extend(documents) + state.all_names.extend(names) + + try: + Settings.storage_folder = os.getenv("RAGU_STORAGE", "ragu_data") + Settings.language = language + + chunker = SimpleChunker(max_chunk_size=1000) + extractor = ArtifactsExtractorLLM(client=state.client, do_validation=False) + + kg = KnowledgeGraph( + client=state.client, 
+ embedder=state.embedder, + chunker=chunker, + artifact_extractor=extractor, + builder_settings=BuilderArguments( + use_llm_summarization=True, + vectorize_chunks=True, + ), + ) + + await kg.build_from_docs(state.all_documents) + + state.kg = kg + state.indexed = True + + except Exception as e: + # Roll back documents added in this request + state.all_documents = state.all_documents[:-len(documents)] + state.all_names = state.all_names[:-len(names)] + return JSONResponse( + status_code=500, + content={"error": str(e), "names": names, "documents_count": 0}, + ) + + return { + "status": "indexed", + "documents_count": len(documents), + "names": names, + "total_documents": len(state.all_documents), + } + + +@app.post("/api/query", response_model=BotResponse) +async def query_graph(req: QueryRequest): + if not state.kg: + return BotResponse( + message="Knowledge graph not built yet. Please index documents first.", + ) + + try: + if req.engine == "local": + engine = LocalSearchEngine(state.client, state.kg, state.embedder) + answer = await engine.a_query(req.query, top_k=req.top_k) + elif req.engine == "global": + engine = GlobalSearchEngine(state.client, state.kg) + answer = await engine.a_query(req.query) + elif req.engine == "naive": + engine = NaiveSearchEngine(state.client, state.kg, state.embedder) + answer = await engine.a_query(req.query, top_k=req.top_k) + else: + return BotResponse(message=f"Unknown engine: {req.engine}") + except Exception as e: + return BotResponse(message=f"Error: {e}") + + return BotResponse( + message=answer, + confidence=0.8, + title=req.query[:50], + ) + + diff --git a/server/requirements.txt b/server/requirements.txt new file mode 100644 index 0000000..f6b3ebf --- /dev/null +++ b/server/requirements.txt @@ -0,0 +1,4 @@ +fastapi +uvicorn +python-docx +python-multipart From 817f517b36cf7c97a2eccb68018c4d50fc9a97b4 Mon Sep 17 00:00:00 2001 From: "novgorodcev.k" Date: Sat, 18 Apr 2026 13:17:18 +0300 Subject: [PATCH 2/2] Baltiyskiy Bereg 
service desk chatbot (hackathon AI Business SPB 2026) - ETL: extract KB articles + tickets from MSSQL, build Knowledge Graph via RAGU - Local embedder (sentence-transformers) instead of API-based - Docker Compose: MSSQL (Azure SQL Edge) + API + Frontend - Auto DB restore + ETL on first start - Frontend: branded chat with naive search engine Co-Authored-By: Claude Opus 4.6 --- .gitignore | 2 + Dockerfile | 9 +- docker-compose.yml | 58 +++-- etl/__init__.py | 0 etl/extract_and_index.py | 201 ++++++++++++++++++ etl/local_embedder.py | 36 ++++ etl/rotating_llm.py | 109 ++++++++++ front/app/app.view.css.ts | 4 + front/app/app.view.tree | 9 +- front/app/app.view.ts | 11 +- restore-db.sh | 39 ++++ server/entrypoint.sh | 68 ++++++ server/main.py | 78 ++++--- .../README.md" | 185 ++++++++++++++++ .../docker-compose.yml" | 31 +++ .../pyproject.toml" | 13 ++ .../restore-db.sh" | 38 ++++ 17 files changed, 834 insertions(+), 57 deletions(-) create mode 100644 etl/__init__.py create mode 100644 etl/extract_and_index.py create mode 100644 etl/local_embedder.py create mode 100644 etl/rotating_llm.py create mode 100644 restore-db.sh create mode 100755 server/entrypoint.sh create mode 100644 "\320\267\320\260\320\264\320\260\320\275\320\270\320\265/README.md" create mode 100644 "\320\267\320\260\320\264\320\260\320\275\320\270\320\265/docker-compose.yml" create mode 100644 "\320\267\320\260\320\264\320\260\320\275\320\270\320\265/pyproject.toml" create mode 100644 "\320\267\320\260\320\264\320\260\320\275\320\270\320\265/restore-db.sh" diff --git a/.gitignore b/.gitignore index d7c3f2f..42eea76 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ checkpoints/ benchmark/*.json ragu_working_dir/ +ragu_data/ +*.bak # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile b/Dockerfile index c7b74e0..1d9cfed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,13 @@ RUN pip install --no-cache-dir . 
COPY server/ ./server/ RUN pip install --no-cache-dir -r server/requirements.txt +COPY etl/ ./etl/ +RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \ + && pip install --no-cache-dir pymssql python-dotenv sentence-transformers + +COPY server/entrypoint.sh ./entrypoint.sh +RUN chmod +x ./entrypoint.sh + EXPOSE 8000 -CMD ["uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["./entrypoint.sh"] diff --git a/docker-compose.yml b/docker-compose.yml index 3b18626..e108215 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,24 +1,24 @@ services: - web: - build: - context: . - dockerfile: Dockerfile - target: frontend - ports: - - "9081:9080" + mssql: + image: mcr.microsoft.com/azure-sql-edge:latest + container_name: mssql-baltbereg restart: unless-stopped - tty: true - stdin_open: true - develop: - watch: - - action: sync - path: ./front - target: /app/bog/RAGU/front - - # http://localhost:9081/bog/RAGU/front/app/-/test.html + environment: + ACCEPT_EULA: "Y" + MSSQL_SA_PASSWORD: "BaltBeregHack2026!" + ports: + - "1433:1433" + volumes: + - mssql-data:/var/opt/mssql + - ./задание/cleaned.bak:/var/opt/mssql/backup/cleaned.bak:ro + healthcheck: + test: ["CMD-SHELL", "python3 -c \"import socket; s=socket.create_connection(('localhost',1433),2); s.close()\""] + interval: 10s + timeout: 5s + retries: 30 + start_period: 10s api: - platform: linux/arm64 build: context: . dockerfile: Dockerfile @@ -29,6 +29,11 @@ services: env_file: .env environment: - NUMBA_CPU_NAME=generic + - MSSQL_HOST=mssql + - MSSQL_SA_PASSWORD=BaltBeregHack2026! + depends_on: + mssql: + condition: service_healthy volumes: - ragu_data:/app/ragu_data develop: @@ -39,5 +44,24 @@ services: # http://localhost:8100/api/status + web: + build: + context: . 
+ dockerfile: Dockerfile + target: frontend + ports: + - "9081:9080" + restart: unless-stopped + tty: true + stdin_open: true + develop: + watch: + - action: sync + path: ./front + target: /app/bog/RAGU/front + + # http://localhost:9081/bog/RAGU/front/app/-/test.html + volumes: + mssql-data: ragu_data: diff --git a/etl/__init__.py b/etl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etl/extract_and_index.py b/etl/extract_and_index.py new file mode 100644 index 0000000..e6a010a --- /dev/null +++ b/etl/extract_and_index.py @@ -0,0 +1,201 @@ +""" +ETL: Extract tickets & KB from MSSQL → build Knowledge Graph via RAGU. + +50 most informative tickets + KB articles → full GraphRAG pipeline. + +Usage: + docker compose run --rm api python -m etl.extract_and_index +""" + +import asyncio +import os +import re +from html.parser import HTMLParser + +import pymssql +from dotenv import load_dotenv + +from ragu import KnowledgeGraph, SimpleChunker, BuilderArguments, Settings +from ragu.models import LLMOpenAI +from ragu.models.openai import CachedAsyncOpenAI +from ragu.triplet import ArtifactsExtractorLLM +from etl.local_embedder import LocalEmbedder + + +# --------------------------------------------------------------------------- +# HTML → plain text +# --------------------------------------------------------------------------- + +class _HTMLStripper(HTMLParser): + def __init__(self): + super().__init__() + self._parts: list[str] = [] + + def handle_data(self, data: str): + self._parts.append(data) + + def get_text(self) -> str: + return " ".join(self._parts) + + +def strip_html(html: str | None) -> str: + if not html: + return "" + s = _HTMLStripper() + s.feed(html) + text = s.get_text() + text = re.sub(r"\s+", " ", text).strip() + return text + + +# --------------------------------------------------------------------------- +# MSSQL helpers +# --------------------------------------------------------------------------- + +def get_connection(): + return 
pymssql.connect( + server=os.getenv("MSSQL_HOST", "mssql"), + user="SA", + password=os.getenv("MSSQL_SA_PASSWORD", "BaltBeregHack2026!"), + database="service_desk_tdbb", + charset="utf8", + ) + + +def fetch_lookups(conn) -> dict[str, dict[int, str]]: + lookups: dict[str, dict[int, str]] = {} + cursor = conn.cursor(as_dict=True) + for table in ("Service", "TaskType", "Status", "Priority"): + cursor.execute(f"SELECT Id, NameXml FROM {table}") + lookups[table] = {row["Id"]: strip_html(row["NameXml"]) for row in cursor} + return lookups + + +def fetch_tickets(conn, lookups: dict[str, dict[int, str]], limit: int = 50) -> list[str]: + """Fetch most informative tickets — those with longest Comment (richest Q&A).""" + cursor = conn.cursor(as_dict=True) + cursor.execute(f""" + SELECT TOP {limit} Id, Name, Description, Comment, + ServiceId, TypeId, StatusId, PriorityId + FROM Task + WHERE Comment IS NOT NULL AND LEN(Comment) > 100 + ORDER BY LEN(Comment) DESC + """) + + docs: list[str] = [] + for row in cursor: + service = lookups["Service"].get(row["ServiceId"], "") + task_type = lookups["TaskType"].get(row["TypeId"], "") + status = lookups["Status"].get(row["StatusId"], "") + priority = lookups["Priority"].get(row["PriorityId"], "") + + desc = strip_html(row["Description"]) + comment = strip_html(row["Comment"]) + + parts = [f"Тикет #{row['Id']}: {row['Name'] or ''}"] + if service: + parts.append(f"Сервис: {service}") + if task_type: + parts.append(f"Тип: {task_type}") + if status: + parts.append(f"Статус: {status}") + if priority: + parts.append(f"Приоритет: {priority}") + if desc: + parts.append(f"Описание: {desc}") + if comment: + parts.append(f"Переписка: {comment}") + + docs.append("\n".join(parts)) + + return docs + + +def fetch_kb_articles(conn, limit: int = 50) -> list[str]: + cursor = conn.cursor(as_dict=True) + cursor.execute(f""" + SELECT TOP {limit} Id, Name, Description + FROM KBDocument + WHERE IsPublished = 1 AND Description IS NOT NULL AND LEN(Description) 
> 50 + ORDER BY Rating DESC, Id DESC + """) + + docs: list[str] = [] + for row in cursor: + desc = strip_html(row["Description"]) + parts = [f"Статья базы знаний #{row['Id']}: {row['Name'] or ''}"] + if desc: + parts.append(desc) + docs.append("\n".join(parts)) + + return docs + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +async def main(): + load_dotenv() + + storage = os.getenv("RAGU_STORAGE", "ragu_data") + Settings.storage_folder = storage + Settings.language = "russian" + + llm_client = CachedAsyncOpenAI( + base_url=os.getenv("LLM_BASE_URL", "https://models.github.ai/inference"), + api_key=os.getenv("LLM_API_KEY", ""), + rate_max_per_minute=5, + rate_max_simultaneous=1, + rate_min_delay=8, + retry_times_sec=(30, 60, 120, 240), + ) + llm = LLMOpenAI(client=llm_client, model_name=os.getenv("LLM_MODEL", "openai/gpt-4.1-mini")) + + print("[ETL] Loading local embedder...") + embedder = LocalEmbedder("intfloat/multilingual-e5-small") + + # Extract from MSSQL + print("[ETL] Connecting to MSSQL...") + conn = get_connection() + + print("[ETL] Loading lookups...") + lookups = fetch_lookups(conn) + for table, data in lookups.items(): + print(f" {table}: {len(data)} entries") + + print("[ETL] Extracting top 10 KB articles (by rating)...") + kb_docs = fetch_kb_articles(conn, limit=10) + print(f" {len(kb_docs)} KB articles") + + print("[ETL] Extracting top 10 tickets (richest Q&A threads)...") + ticket_docs = fetch_tickets(conn, lookups, limit=10) + print(f" {len(ticket_docs)} tickets") + + conn.close() + + all_docs = kb_docs + ticket_docs + print(f"[ETL] Total: {len(all_docs)} documents") + + # Build full Knowledge Graph (entities, relations, communities, summaries) + print("[ETL] Building Knowledge Graph...") + extractor = ArtifactsExtractorLLM(llm=llm, do_validation=False) + + kg = KnowledgeGraph( + llm=llm, + embedder=embedder, + 
chunker=SimpleChunker(max_chunk_size=1000), + artifact_extractor=extractor, + builder_settings=BuilderArguments( + use_llm_summarization=True, + vectorize_chunks=True, + ), + ) + + await kg.build_from_docs(all_docs) + + print(f"[ETL] Done! Knowledge Graph saved to {storage}/") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/etl/local_embedder.py b/etl/local_embedder.py new file mode 100644 index 0000000..cc6cb01 --- /dev/null +++ b/etl/local_embedder.py @@ -0,0 +1,36 @@ +"""Local sentence-transformers embedder implementing RAGU's Embedder interface.""" + +from typing import Any +from ragu.models.embedder import Embedder, FLOATS + + +class LocalEmbedder(Embedder): + """Embedder using sentence-transformers locally. No API calls, no rate limits.""" + + def __init__(self, model_name: str = "intfloat/multilingual-e5-small"): + from sentence_transformers import SentenceTransformer + self._model = SentenceTransformer(model_name) + self._dim = self._model.get_sentence_embedding_dimension() + print(f"[LocalEmbedder] Loaded {model_name}, dim={self._dim}") + + @property + def dim(self) -> int: + return self._dim + + async def embed_text(self, text: str, **kwargs: Any) -> list[float]: + embedding = self._model.encode(text, normalize_embeddings=True) + return embedding.tolist() + + async def batch_embed_text( + self, + texts: list[str], + desc: str | None = None, + **kwargs: Any, + ) -> list[list[float]] | FLOATS: + embeddings = self._model.encode( + texts, + normalize_embeddings=True, + show_progress_bar=bool(desc), + batch_size=64, + ) + return embeddings.tolist() diff --git a/etl/rotating_llm.py b/etl/rotating_llm.py new file mode 100644 index 0000000..76f8760 --- /dev/null +++ b/etl/rotating_llm.py @@ -0,0 +1,109 @@ +"""LLM wrapper that rotates API keys across multiple CachedAsyncOpenAI clients. + +Keys are read from LLM_API_KEYS env var (comma-separated) or fall back to LLM_API_KEY. 
+""" + +import asyncio +import os +from ragu.models import LLMOpenAI +from ragu.models.openai import CachedAsyncOpenAI + + +def _load_keys() -> list[str]: + """Load API keys from LLM_API_KEYS (comma-separated) or LLM_API_KEY.""" + multi = os.getenv("LLM_API_KEYS", "") + if multi: + return [k.strip() for k in multi.split(",") if k.strip()] + single = os.getenv("LLM_API_KEY", "") + return [single] if single else [] + + +class RotatingLLM: + """Distributes LLM calls round-robin across multiple API keys. + + Each key gets its own CachedAsyncOpenAI with independent rate limiter. + Batches are processed in small windows to avoid overwhelming the API. + """ + + def __init__( + self, + base_url: str, + model_name: str, + keys: list[str] | None = None, + rpm_per_key: int = 5, + ): + keys = keys or _load_keys() + if not keys: + raise ValueError("No API keys found. Set LLM_API_KEYS or LLM_API_KEY env var.") + + self._llms: list[LLMOpenAI] = [] + for key in keys: + client = CachedAsyncOpenAI( + base_url=base_url, + api_key=key, + rate_max_per_minute=rpm_per_key, + rate_max_simultaneous=1, + rate_min_delay=3, + retry_times_sec=(30, 60, 120), + ) + self._llms.append(LLMOpenAI(client=client, model_name=model_name)) + + self._idx = 0 + self._batch_size = min(5, len(self._llms)) + print(f"[RotatingLLM] {len(self._llms)} keys, {rpm_per_key} RPM each, batch_size={self._batch_size}") + + def _next(self) -> LLMOpenAI: + llm = self._llms[self._idx % len(self._llms)] + self._idx += 1 + return llm + + async def chat_completion(self, conversation, output_schema=None, **kwargs): + return await self._next().chat_completion( + conversation=conversation, + output_schema=output_schema, + **kwargs, + ) + + async def batch_chat_completion(self, conversations, output_schema=None, desc=None, **kwargs): + """Process conversations in small batches to stay within rate limits.""" + results = [] + total = len(conversations) + batch_size = self._batch_size + + for start in range(0, total, batch_size): + 
batch = conversations[start:start + batch_size] + tasks = [] + for i, conv in enumerate(batch): + llm = self._llms[(start + i) % len(self._llms)] + tasks.append( + llm.chat_completion( + conversation=conv, + output_schema=output_schema, + **kwargs, + ) + ) + batch_results = await asyncio.gather(*tasks, return_exceptions=True) + + for j, res in enumerate(batch_results): + if isinstance(res, Exception): + print(f"[RotatingLLM] Retrying failed request ({type(res).__name__}), waiting 60s...") + await asyncio.sleep(60) + try: + res = await self._llms[(start + j) % len(self._llms)].chat_completion( + conversation=batch[j], + output_schema=output_schema, + **kwargs, + ) + except Exception as e2: + print(f"[RotatingLLM] Retry also failed: {e2}, skipping") + res = None + results.append(res) + + done = min(start + batch_size, total) + if desc: + print(f" {desc}: {done}/{total}") + + if start + batch_size < total: + await asyncio.sleep(15) + + return results diff --git a/front/app/app.view.css.ts b/front/app/app.view.css.ts index 712a8f9..56f3b9e 100644 --- a/front/app/app.view.css.ts +++ b/front/app/app.view.css.ts @@ -2,6 +2,10 @@ namespace $.$$ { $mol_style_define( $bog_RAGU_front_app, { + background: { + color: '#1a365d', + }, + Documents: { flex: { basis: '30rem', diff --git a/front/app/app.view.tree b/front/app/app.view.tree index 38f13d3..67188e4 100644 --- a/front/app/app.view.tree +++ b/front/app/app.view.tree @@ -1,5 +1,5 @@ $bog_RAGU_front_app $giper_bot - dialog_title @ \RAGU + dialog_title @ \Балтийский Берег — Служба поддержки api_url \http://localhost:8100 history? / doc_text? \ @@ -7,14 +7,15 @@ $bog_RAGU_front_app $giper_bot index_message? \ index_records? / config_message? \ - llm_api_key? \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 + llm_api_key? \ llm_base_url? \https://models.github.ai/inference llm_model? \openai/gpt-4.1-mini - llm_rpm? \60 - embedder_api_key? 
\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 + llm_rpm? \15 + embedder_api_key? \ embedder_base_url? \https://models.github.ai/inference embedder_model? \text-embedding-3-large embedder_dim? \3072 + placeholders / Doc_file* $mol_row sub / <= Doc_file_icon* $mol_icon_file diff --git a/front/app/app.view.ts b/front/app/app.view.ts index c413099..0b7cf45 100644 --- a/front/app/app.view.ts +++ b/front/app/app.view.ts @@ -14,7 +14,12 @@ namespace $.$$ { @ $mol_mem config_synced() { - this.push_config() + try { + this.push_config() + } catch( error: any ) { + if( $mol_promise_like( error ) ) $mol_fail_hidden( error ) + $mol_fail_log( error ) + } return true } @@ -22,8 +27,6 @@ namespace $.$$ { override pages() { this.config_synced() return [ - this.Settings_page(), - this.Documents(), this.Dialog(), ... this.result() ? [ this.Result_page( this.version() ) ] : [], ] @@ -73,7 +76,7 @@ namespace $.$$ { { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ query: last.message }), + body: JSON.stringify({ query: last.message + '\n\nОтвечай на том же языке, на котором задан вопрос.', engine: 'naive' }), }, ) this.history([ ... history, resp ]) diff --git a/restore-db.sh b/restore-db.sh new file mode 100644 index 0000000..403e020 --- /dev/null +++ b/restore-db.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Восстанавливает базу данных из backup при первом запуске +# Увеличен таймаут для ARM эмуляции +set -euo pipefail + +BACKUP_PATH="/var/opt/mssql/backup/cleaned.bak" +DB_NAME="service_desk_tdbb" +SA_PASSWORD="${MSSQL_SA_PASSWORD:?}" +SQLCMD="/opt/mssql-tools/bin/sqlcmd" + +echo "Waiting for SQL Server to start..." 
+for i in $(seq 1 180); do  # poll until SQL Server answers a trivial query
+  if $SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "SELECT 1" -b -o /dev/null 2>/dev/null; then
+    echo "SQL Server is ready"
+    break
+  fi
+  if [ "$i" -eq 180 ]; then  # last of 180 attempts (2 s apart) failed: give up
+    echo "ERROR: SQL Server did not start in time"
+    exit 1
+  fi
+  sleep 2
+done
+
+# Check if DB already exists; report a failed probe explicitly (stderr is discarded, so "set -e" alone would exit without any message)
+EXISTS=$($SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "SET NOCOUNT ON; SELECT COUNT(*) FROM sys.databases WHERE name='$DB_NAME'" -h -1 -b 2>/dev/null | tr -d '[:space:]') || { echo "ERROR: could not query sys.databases" >&2; exit 1; }
+
+if [ "$EXISTS" = "0" ]; then  # NOTE(review): MOVE logical names below differ from server/entrypoint.sh (IntraService3*, ftrow_*) - confirm with RESTORE FILELISTONLY
+  echo "Restoring $DB_NAME from backup..."
+  $SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "
+    RESTORE DATABASE [$DB_NAME]
+    FROM DISK = '$BACKUP_PATH'
+    WITH MOVE 'service_desk_tdbb' TO '/var/opt/mssql/data/${DB_NAME}.mdf',
+         MOVE 'service_desk_tdbb_log' TO '/var/opt/mssql/data/${DB_NAME}_log.ldf',
+         REPLACE
+  " -b
+  echo "Database restored successfully"
+else
+  echo "Database $DB_NAME already exists, skipping restore"
+fi
diff --git a/server/entrypoint.sh b/server/entrypoint.sh
new file mode 100755
index 0000000..fbf71e9
--- /dev/null
+++ b/server/entrypoint.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+set -e
+
+INDEX_FILE="${RAGU_STORAGE:-ragu_data}/kv_chunks.json"
+
+# Remove empty index files from failed builds
+if [ -f "$INDEX_FILE" ] && [ "$(wc -c < "$INDEX_FILE")" -le 4 ]; then
+    echo "[entrypoint] Removing empty index files from failed build..."
+    rm -f "${RAGU_STORAGE:-ragu_data}"/*.json
+fi
+
+if [ ! -f "$INDEX_FILE" ]; then
+    echo "[entrypoint] Index not found. Restoring DB + running ETL..."
+
+    python -c "
+import pymssql, os, time
+
+host = os.getenv('MSSQL_HOST', 'mssql')
+pwd = os.getenv('MSSQL_SA_PASSWORD', 'BaltBeregHack2026!')  # NOTE(review): fallback SA password is hard-coded in the repo - prefer failing when the variable is unset
+
+# Wait for SQL Server to accept connections (up to 90 attempts, 2 s apart)
+for i in range(90):
+    try:
+        c = pymssql.connect(server=host, user='SA', password=pwd)
+        c.close()
+        print(f'[restore] SQL Server ready after {i*2}s')
+        break
+    except Exception:  # not a bare except: KeyboardInterrupt and SystemExit must still stop the wait loop
+        time.sleep(2)
+else:  # for/else: runs only when no attempt reached the break above
+    print('[restore] ERROR: SQL Server did not start')
+    raise SystemExit(1)  # explicit non-zero exit; does not rely on the site-provided exit() helper
+
+# Check if DB exists
+conn = pymssql.connect(server=host, user='SA', password=pwd)
+cur = conn.cursor()
+cur.execute(\"SELECT COUNT(*) FROM sys.databases WHERE name='service_desk_tdbb'\")
+exists = cur.fetchone()[0]
+
+if exists == 0:
+    print('[restore] Restoring service_desk_tdbb...')
+    conn.autocommit(True)  # presumably needed because RESTORE cannot run inside a transaction - TODO confirm
+    cur.execute('''
+        RESTORE DATABASE [service_desk_tdbb]
+        FROM DISK = '/var/opt/mssql/backup/cleaned.bak'
+        WITH MOVE 'IntraService3' TO '/var/opt/mssql/data/service_desk_tdbb.mdf',
+             MOVE 'ftrow_TaskSearchingCatalog' TO '/var/opt/mssql/data/service_desk_tdbb_search_cat.ndf',
+             MOVE 'ftrow_KBDocumentFTC' TO '/var/opt/mssql/data/service_desk_tdbb_document.ndf',
+             MOVE 'ftrow_TaskParentSearchCatalog' TO '/var/opt/mssql/data/service_desk_tdbb_parent_cat.ndf',
+             MOVE 'IntraService3_log' TO '/var/opt/mssql/data/service_desk_tdbb_log.ldf',
+             REPLACE
+    ''')
+    print('[restore] Database restored!')
+else:
+    print('[restore] Database already exists')
+
+conn.close()
+"
+
+    echo "[entrypoint] Running ETL..."
+    python -m etl.extract_and_index
+    echo "[entrypoint] ETL done."
+else
+    echo "[entrypoint] Index exists, skipping ETL."
+fi
+
+echo "[entrypoint] Starting API server..."
+exec uvicorn server.main:app --host 0.0.0.0 --port 8000
diff --git a/server/main.py b/server/main.py
index 5a00fa3..c7d9c49 100644
--- a/server/main.py
+++ b/server/main.py
@@ -16,9 +16,10 @@
     NaiveSearchEngine,
     Settings,
 )
-from ragu.llm import OpenAIClient
-from ragu.embedder import OpenAIEmbedder
+from ragu.models import LLMOpenAI
+from ragu.models.openai import CachedAsyncOpenAI
 from ragu.triplet import ArtifactsExtractorLLM
+from etl.local_embedder import LocalEmbedder
 
 
 ENV_KEYS = [
@@ -30,8 +31,8 @@
 
 class State:
     kg: KnowledgeGraph | None = None
-    client: OpenAIClient | None = None
-    embedder: OpenAIEmbedder | None = None
+    llm: LLMOpenAI | None = None
+    embedder: LocalEmbedder | None = None
     indexed: bool = False
     all_documents: list[str] = []
     all_names: list[str] = []
@@ -40,14 +41,43 @@ class State:
 state = State()
 
 
+def reinit_clients():  # (re)builds the LLM client and the embedder stored on the module-level state
+    api_key = os.getenv("LLM_API_KEY_RUNTIME", os.getenv("LLM_API_KEY", ""))  # runtime override wins over the static key
+
+    client = CachedAsyncOpenAI(
+        base_url=os.getenv("LLM_BASE_URL", "https://models.github.ai/inference"),
+        api_key=api_key,
+        rate_max_per_minute=int(os.getenv("LLM_RPM", "10")),  # honour LLM_RPM again (the replaced client read it); default stays 10
+        retry_times_sec=(15, 30, 60),
+    )
+    state.llm = LLMOpenAI(
+        client=client,
+        model_name=os.getenv("LLM_MODEL", "openai/gpt-4.1-mini"),
+    )
+
+    state.embedder = LocalEmbedder("intfloat/multilingual-e5-small")  # fixed local model; EMBEDDER_* variables are not read here
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     reinit_clients()
+
+    # Auto-load pre-built index if exists
+    storage = os.getenv("RAGU_STORAGE", "ragu_data")
+    chunks_path = os.path.join(storage, "kv_chunks.json")  # same marker file that server/entrypoint.sh checks
+    if os.path.exists(chunks_path):
+        try:
+            Settings.storage_folder = storage
+            state.kg = KnowledgeGraph(
+                llm=state.llm,
+                embedder=state.embedder,
+            )
+            state.indexed = True
+            print(f"[RAGU] Auto-loaded index from {storage}")
+        except Exception as e:  # best effort: the API still starts, just without an index
+            print(f"[RAGU] Failed to auto-load index: {e}")
+
     yield
-    if state.client:
-        await state.client.async_close()
-    if state.embedder:
-        await state.embedder.aclose()
 
 
 app = FastAPI(title="RAGU API", lifespan=lifespan)
@@ -79,21
+109,6 @@ class ConfigUpdate(BaseModel): env: dict[str, str] -def reinit_clients(): - state.client = OpenAIClient( - model_name=os.getenv("LLM_MODEL", ""), - base_url=os.getenv("LLM_BASE_URL", ""), - api_token=os.getenv("LLM_API_KEY", ""), - max_requests_per_minute=int(os.getenv("LLM_RPM", "60")), - ) - state.embedder = OpenAIEmbedder( - model_name=os.getenv("EMBEDDER_MODEL", ""), - base_url=os.getenv("EMBEDDER_BASE_URL", os.getenv("LLM_BASE_URL", "")), - api_token=os.getenv("EMBEDDER_API_KEY", os.getenv("LLM_API_KEY", "")), - dim=int(os.getenv("EMBEDDER_DIM", "3072")), - ) - - @app.get("/api/status") async def get_status(): return {"indexed": state.indexed} @@ -155,10 +170,10 @@ async def index_documents( Settings.language = language chunker = SimpleChunker(max_chunk_size=1000) - extractor = ArtifactsExtractorLLM(client=state.client, do_validation=False) + extractor = ArtifactsExtractorLLM(llm=state.llm, do_validation=False) kg = KnowledgeGraph( - client=state.client, + llm=state.llm, embedder=state.embedder, chunker=chunker, artifact_extractor=extractor, @@ -199,23 +214,24 @@ async def query_graph(req: QueryRequest): try: if req.engine == "local": - engine = LocalSearchEngine(state.client, state.kg, state.embedder) + engine = LocalSearchEngine(state.llm, state.kg, state.embedder) answer = await engine.a_query(req.query, top_k=req.top_k) elif req.engine == "global": - engine = GlobalSearchEngine(state.client, state.kg) + engine = GlobalSearchEngine(state.llm, state.kg) answer = await engine.a_query(req.query) elif req.engine == "naive": - engine = NaiveSearchEngine(state.client, state.kg, state.embedder) + engine = NaiveSearchEngine(state.llm, state.kg, state.embedder) answer = await engine.a_query(req.query, top_k=req.top_k) else: return BotResponse(message=f"Unknown engine: {req.engine}") except Exception as e: return BotResponse(message=f"Error: {e}") + # a_query() may return a string or a Pydantic model with .response + text = answer.response if 
hasattr(answer, "response") else str(answer) + return BotResponse( - message=answer, + message=text, confidence=0.8, title=req.query[:50], ) - - diff --git "a/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/README.md" "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/README.md" new file mode 100644 index 0000000..b165fc4 --- /dev/null +++ "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/README.md" @@ -0,0 +1,185 @@ +# Baltiyskiy Bereg — AI Business SPB Hackathon 2026 + +LLM-чатбот для сервис-деска «Балтийский Берег». Постройте ассистента, который помогает сотрудникам создавать, описывать и решать заявки, опираясь на историческую базу тикетов и статьи базы знаний из реальной системы IntraService. + +Данные — дамп MSSQL базы `service_desk_tdbb` (18 таблиц, ~104 000 тикетов, ~1 000 KB-статей). Автоматического скоринга нет — оценивает жюри. + +**Платформа:** https://app.ai-business-spb.ru + +--- + +## Описание задачи + +**О компании:** ООО «ТД «Балтийский Берег» — один из крупнейших производителей рыбной продукции в России. Более 350 наименований, производственные площадки в Петербурге и Ленобласти, поставки по всей стране и на экспорт. + +**Проблема:** Отсутствует структурированная база знаний на портале ITSM. Вся информация о решении инцидентов представлена в исторических тикетах. + +**Роль ИИ:** Обработать весь массив имеющихся тикетов и статей базы знаний (1 060 документов) из реальной системы IntraService, чтобы отвечать на новые запросы сотрудников. + +## Цель и видение результата на хакатон + +Разработать LLM-чатбота-помощника (для последующей интеграции с Telegram, Max, Битрикс), который: + +* подключается к MSSQL-базе сервис-деска (read-only); +* анализирует историю тикетов (Task, TaskFieldValues, TaskExpenses) и KB-статьи (KBDocument); +* отвечает на вопросы сотрудников, помогает создавать/описывать/решать заявки. + +## Потенциал бизнес-внедрения + +* Аудитория: ~1000 сотрудников компании. 
+* Эффект: снижение нагрузки на 1-ю линию техподдержки → сокращение расходов на ФОТ. +* Интеграция в существующий ITSM: IntraService 4.50.2 (вне скоупа хакатона, но решение должно быть готово к встраиванию). + +## Требования к решению + +| Категория | Требование | +|-----------|-----------| +| Юзкейсы | Пример 1: «Не подключается удаленка» → ответ: «Проверьте UniVPN и 2MFA Контур.Коннект». Пример 2: «Неправильный отчет в 1С» → ответ: «Проверьте период и список контрагентов» | +| Нагрузка | 30–50 запросов/сутки | +| Скорость | Ответ ≤ 30 секунд | +| Качество | ≤50% запросов должны уходить к живому консультанту. Лучше не ответить, чем ошибиться. Доля автоматизации — показатель бизнес-эффекта | +| Интеграция | Telegram / Max / Битрикс | +| Развертывание | Пилот — SaaS, продуктив — on-premise на сервере компании | +| Дашборд | Желателен | +| Админ-панель | Желательна для управления настройками | +| 152-ФЗ | Учитывать обработку персональных данных (ПДн из БД удалены, но архитектурно учесть для внедрения) | + +## Особенности задачи (важно учесть) + +Нет структурированной базы знаний на портале ITSM. +Вся экспертиза извлекается из исторических тикетов (особенно поле Task.Comment — HTML Q&A-диалоги) и неструктурированных статей KBDocument. +При переходе в продуктивный режим требуется регулярная обработка БД сервис-деска, чтобы в поиске участвовали как исторические, так и новые данные. + +## Критерии оценки решения + +| Критерий | Что проверяет жюри | +|----------|-------------------| +| Техническая реализация | Бот работает без ошибок, отвечает ≤30 секунд, корректно обрабатывает неструктурированный ввод пользователя. Бот удовлетворяет требованию развертывания на сервере компании. | +| Бизнес-ценность | Бот решает реальные проблемы сотрудников, снижает потребность в живом консультанте (≤50% эскалаций). | +| Готовность к внедрению | Код запускается по инструкции, есть README. Команда предлагает прототип на самостоятельный тест. 
| +| Презентация и питч | Команда показывает живые диалоги из БД, объясняет логику работы и проверки качества, аргументирует выбор решений. | +| Инновационность | Нестандартный подход к извлечению знаний из 104 тысяч тикетов при отсутствии готовой базы знаний. | + +--- + +## Оценка + +Жюри вручную оценивает: + +1. Качество ответов (точность, польза, естественный язык) +2. Поиск релевантных KB-статей +3. Классификацию заявок (предложение Service / TaskType / Priority) +4. Общий UX + +## Быстрый старт + +```bash +cp .env.example .env +# заполните переменные в .env +``` + +## Загрузка данных + +### Через API + +```bash +curl -H "X-API-Key: YOUR_API_KEY" \ + https://data.ai-business-spb.ru/data/baltiyskiy-bereg/cleaned.bak \ + -o data/cleaned.bak +``` + +Или через Python: + +```python +import requests + +headers = {"X-API-Key": "YOUR_API_KEY"} +r = requests.get( + "https://data.ai-business-spb.ru/data/baltiyskiy-bereg/cleaned.bak", + headers=headers, + stream=True, +) +with open("data/cleaned.bak", "wb") as f: + for chunk in r.iter_content(chunk_size=1 << 16): + f.write(chunk) +``` + +## Настройка базы данных в Docker + +1. Положите скачанный `cleaned.bak` в `data/`. + +2. Запустите MSSQL: + +```bash +docker compose up -d +``` + +3. Дождитесь восстановления базы (~1-2 минуты). Проверьте подключение: + +```bash +docker exec -it mssql-baltbereg /opt/mssql-tools/bin/sqlcmd \ + -S localhost -U SA -P "$MSSQL_SA_PASSWORD" \ + -Q "SELECT TOP 1 Name FROM service_desk_tdbb.dbo.Task" +``` + +### Ключевые таблицы + +| Таблица | Назначение | +|---|---| +| `Task` | ~104 000 тикетов. 
`Name`, `Description`, `Comment` (HTML Q&A thread), `StatusId`, `ServiceId`, `TypeId` | +| `TaskFieldValues` | Значения custom-полей | +| `TaskExpenses` | Записи работ: `Comments`, `Minutes`, `Date` | +| `KBDocument` | ~1 000 статей базы знаний: `Name`, `Description` (HTML), `IsPublished`, `Rating` | +| `KBDocumentTag`, `KBTag`, `KBFolder` | Теги и иерархия папок KB | +| `Service`, `TaskType`, `Status`, `Priority` | Lookup-таблицы | + +Персональные данные удалены из схемы. `Task.Comment` — главный источник Q&A-диалогов для RAG. + +## Подключение YandexGPT + +YandexGPT совместим с OpenAI API. Настройте переменные в `.env`: + +``` +YANDEX_GPT_API_KEY=your-api-key +YANDEX_GPT_FOLDER_ID=your-folder-id +YANDEX_GPT_MODEL=yandexgpt/latest +``` + +Endpoint: `https://llm.api.cloud.yandex.net/foundationModels/v1/completion` + +Для OpenAI-совместимого интерфейса используйте: + +``` +YANDEX_GPT_BASE_URL=https://llm.api.cloud.yandex.net/foundationModels/v1 +``` + +### Получение API-ключа + +1. Зайдите в [Yandex Cloud Console](https://console.yandex.cloud/) +2. Создайте сервисный аккаунт с ролью `ai.languageModels.user` +3. Создайте API-ключ для этого аккаунта +4. Скопируйте `folder_id` каталога и API-ключ в `.env` + +## Формат сдачи + +``` +Формат сдачи: +– Презентации должны открываться по ссылке +– Код загружен в публичный Git-репозиторий и открывается по ссылке (коммиты после дедлайна не принимаются) +– В ReadMe — минимальная документация: структура кода, зависимости, инструкция по деплою +– По кейсам с лидербордом — загружены и выбраны итоговые решения +``` + +## Активация промокода Яндекс + +``` +Активация промокода Яндекс: +Для активации промокода необходимо: + +1) Перейти по ссылке https://center.yandex.cloud/ +2) Перейти в раздел Billing +3) Нажать кнопку Активировать промокод +4) Активировать промокод + ++ будет ссылка на инструкцию по активации биллинг аккаунта. 
+```
diff --git "a/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/docker-compose.yml" "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/docker-compose.yml"
new file mode 100644
index 0000000..4195daf
--- /dev/null
+++ "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/docker-compose.yml"
@@ -0,0 +1,31 @@
+services:
+  mssql:
+    image: mcr.microsoft.com/mssql/server:2019-CU18-ubuntu-20.04
+    container_name: mssql-baltbereg
+    restart: unless-stopped
+    environment:
+      ACCEPT_EULA: "Y"
+      MSSQL_SA_PASSWORD: "${MSSQL_SA_PASSWORD:?Set MSSQL_SA_PASSWORD in .env}"
+      MSSQL_PID: "Developer"
+    ports:
+      - "1433:1433"
+    volumes:
+      - mssql-data:/var/opt/mssql
+      - ./data/cleaned.bak:/var/opt/mssql/backup/cleaned.bak:ro
+      - ./restore-db.sh:/usr/local/bin/restore-db.sh:ro
+    entrypoint: /bin/bash
+    command: >
+      -c "
+      /opt/mssql/bin/sqlservr &
+      /usr/local/bin/restore-db.sh &
+      wait
+      "
+    healthcheck:  # the password expansion is quoted so spaces or glob characters in it cannot split the command
+      test: ["CMD-SHELL", "/opt/mssql-tools/bin/sqlcmd -S localhost -U SA -P \"$${MSSQL_SA_PASSWORD}\" -Q 'SELECT 1' -b -o /dev/null"]
+      interval: 15s
+      timeout: 5s
+      retries: 10
+      start_period: 30s
+
+volumes:
+  mssql-data:
diff --git "a/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/pyproject.toml" "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/pyproject.toml"
new file mode 100644
index 0000000..ae00c28
--- /dev/null
+++ "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/pyproject.toml"
@@ -0,0 +1,13 @@
+[project]
+name = "baltiyskiy-bereg-starter"
+version = "0.1.0"
+description = "Baltiyskiy Bereg service-desk chatbot starter — AI Business SPB Hackathon 2026"
+requires-python = ">=3.11"
+dependencies = [
+    "pymssql>=2.3",
+    "requests>=2.31",
+    "python-dotenv>=1.0",
+]
+
+[tool.uv]
+package = false
diff --git "a/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/restore-db.sh" "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/restore-db.sh"
new file mode 100644
index 0000000..0d93139
--- /dev/null
+++ "b/\320\267\320\260\320\264\320\260\320\275\320\270\320\265/restore-db.sh"
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Restores the database from the backup on first start
+set -euo pipefail
+
+BACKUP_PATH="/var/opt/mssql/backup/cleaned.bak"
+DB_NAME="service_desk_tdbb"
+SA_PASSWORD="${MSSQL_SA_PASSWORD:?}"  # aborts immediately when MSSQL_SA_PASSWORD is unset or empty
+SQLCMD="/opt/mssql-tools/bin/sqlcmd"
+
+echo "Waiting for SQL Server to start..."
+for i in $(seq 1 60); do  # poll until SQL Server answers a trivial query
+  if $SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "SELECT 1" -b -o /dev/null 2>/dev/null; then
+    echo "SQL Server is ready"
+    break
+  fi
+  if [ "$i" -eq 60 ]; then  # last of 60 attempts (2 s apart) failed: give up
+    echo "ERROR: SQL Server did not start in time"
+    exit 1
+  fi
+  sleep 2
+done
+
+# Check if DB already exists; report a failed probe explicitly (stderr is discarded, so "set -e" alone would exit without any message)
+EXISTS=$($SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "SET NOCOUNT ON; SELECT COUNT(*) FROM sys.databases WHERE name='$DB_NAME'" -h -1 -b 2>/dev/null | tr -d '[:space:]') || { echo "ERROR: could not query sys.databases" >&2; exit 1; }
+
+if [ "$EXISTS" = "0" ]; then  # NOTE(review): MOVE logical names below differ from server/entrypoint.sh (IntraService3*, ftrow_*) - confirm with RESTORE FILELISTONLY
+  echo "Restoring $DB_NAME from backup..."
+  $SQLCMD -S localhost -U SA -P "$SA_PASSWORD" -Q "
+    RESTORE DATABASE [$DB_NAME]
+    FROM DISK = '$BACKUP_PATH'
+    WITH MOVE 'service_desk_tdbb' TO '/var/opt/mssql/data/${DB_NAME}.mdf',
+         MOVE 'service_desk_tdbb_log' TO '/var/opt/mssql/data/${DB_NAME}_log.ldf',
+         REPLACE
+  " -b
+  echo "Database restored successfully"
+else
+  echo "Database $DB_NAME already exists, skipping restore"
+fi