Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
LLM_API_KEY=github_pat_...
LLM_BASE_URL=https://models.github.ai/inference
LLM_MODEL=openai/gpt-4.1-mini
LLM_RPM=60

EMBEDDER_API_KEY=github_pat_...
EMBEDDER_BASE_URL=https://models.github.ai/inference
EMBEDDER_MODEL=text-embedding-3-large
EMBEDDER_DIM=3072

RAGU_STORAGE=ragu_data
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
* -text
*.woff binary
*.woff2 binary
*.ttf binary
*.eot binary
*.otf binary
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# $mol
-*
.DS_Store

checkpoints/
benchmark/*.json
ragu_working_dir/
ragu_data/
*.bak

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -175,4 +181,4 @@ cython_debug/
.ruff_cache/

# PyPI configuration file
.pypirc
.pypirc
40 changes: 40 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# --- Frontend ---
FROM node:20-alpine AS frontend

RUN apk add --no-cache git

WORKDIR /app
RUN git clone --depth 1 https://github.com/hyoo-ru/mam.git . \
&& npm install

COPY front/ bog/RAGU/front/

RUN npx mam bog/RAGU/front/app

EXPOSE 9080

CMD ["npm", "start"]


# --- API ---
FROM python:3.12-slim AS api

WORKDIR /app

COPY pyproject.toml ./
COPY ragu/ ./ragu/
RUN pip install --no-cache-dir .

COPY server/ ./server/
RUN pip install --no-cache-dir -r server/requirements.txt

COPY etl/ ./etl/
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
&& pip install --no-cache-dir pymssql python-dotenv sentence-transformers

COPY server/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh

EXPOSE 8000

CMD ["./entrypoint.sh"]
67 changes: 67 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
services:
mssql:
image: mcr.microsoft.com/azure-sql-edge:latest
container_name: mssql-baltbereg
restart: unless-stopped
environment:
ACCEPT_EULA: "Y"
MSSQL_SA_PASSWORD: "BaltBeregHack2026!"
ports:
- "1433:1433"
volumes:
- mssql-data:/var/opt/mssql
- ./задание/cleaned.bak:/var/opt/mssql/backup/cleaned.bak:ro
healthcheck:
test: ["CMD-SHELL", "python3 -c \"import socket; s=socket.create_connection(('localhost',1433),2); s.close()\""]
interval: 10s
timeout: 5s
retries: 30
start_period: 10s

api:
build:
context: .
dockerfile: Dockerfile
target: api
ports:
- "8100:8000"
restart: unless-stopped
env_file: .env
environment:
- NUMBA_CPU_NAME=generic
- MSSQL_HOST=mssql
- MSSQL_SA_PASSWORD=BaltBeregHack2026!
depends_on:
mssql:
condition: service_healthy
volumes:
- ragu_data:/app/ragu_data
develop:
watch:
- action: sync+restart
path: ./server
target: /app/server

# http://localhost:8100/api/status

web:
build:
context: .
dockerfile: Dockerfile
target: frontend
ports:
- "9081:9080"
restart: unless-stopped
tty: true
stdin_open: true
develop:
watch:
- action: sync
path: ./front
target: /app/bog/RAGU/front

# http://localhost:9081/bog/RAGU/front/app/-/test.html

volumes:
mssql-data:
ragu_data:
Empty file added etl/__init__.py
Empty file.
201 changes: 201 additions & 0 deletions etl/extract_and_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""
ETL: Extract tickets & KB from MSSQL → build Knowledge Graph via RAGU.

The most informative tickets plus top-rated KB articles → full GraphRAG pipeline
(counts are set in main(); fetchers default to 50 each).

Usage:
docker compose run --rm api python -m etl.extract_and_index
"""

import asyncio
import os
import re
from html.parser import HTMLParser

import pymssql
from dotenv import load_dotenv

from ragu import KnowledgeGraph, SimpleChunker, BuilderArguments, Settings
from ragu.models import LLMOpenAI
from ragu.models.openai import CachedAsyncOpenAI
from ragu.triplet import ArtifactsExtractorLLM
from etl.local_embedder import LocalEmbedder


# ---------------------------------------------------------------------------
# HTML → plain text
# ---------------------------------------------------------------------------

class _HTMLStripper(HTMLParser):
def __init__(self):
super().__init__()
self._parts: list[str] = []

def handle_data(self, data: str):
self._parts.append(data)

def get_text(self) -> str:
return " ".join(self._parts)


def strip_html(html: str | None) -> str:
    """Convert an HTML fragment to collapsed plain text.

    Returns "" for None or empty input. All runs of whitespace in the
    extracted text are collapsed to single spaces and the ends trimmed.
    """
    if not html:
        return ""
    stripper = _HTMLStripper()
    stripper.feed(html)
    # Normalize whitespace left behind by removed tags and entities.
    return re.sub(r"\s+", " ", stripper.get_text()).strip()


# ---------------------------------------------------------------------------
# MSSQL helpers
# ---------------------------------------------------------------------------

def get_connection():
    """Open a pymssql connection to the service-desk database.

    Host and SA password come from the environment; the fallbacks match
    the docker-compose.yml defaults.
    """
    host = os.getenv("MSSQL_HOST", "mssql")
    sa_password = os.getenv("MSSQL_SA_PASSWORD", "BaltBeregHack2026!")
    return pymssql.connect(
        server=host,
        user="SA",
        password=sa_password,
        database="service_desk_tdbb",
        charset="utf8",
    )


def fetch_lookups(conn) -> dict[str, dict[int, str]]:
    """Load Id → plain-text name maps for the four reference tables."""
    cursor = conn.cursor(as_dict=True)
    result: dict[str, dict[int, str]] = {}
    # Table names come from this fixed tuple, so the f-string query is safe.
    for table in ("Service", "TaskType", "Status", "Priority"):
        cursor.execute(f"SELECT Id, NameXml FROM {table}")
        mapping: dict[int, str] = {}
        for row in cursor:
            mapping[row["Id"]] = strip_html(row["NameXml"])
        result[table] = mapping
    return result


def fetch_tickets(conn, lookups: dict[str, dict[int, str]], limit: int = 50) -> list[str]:
    """Fetch the most informative tickets — those with the longest Comment.

    Each ticket is rendered as one plain-text document combining its name,
    resolved lookup values (service / type / status / priority), and the
    HTML-stripped description and comment thread.

    Args:
        conn: Open pymssql connection (see get_connection()).
        lookups: Id → name maps as returned by fetch_lookups().
        limit: Maximum number of tickets to return.

    Returns:
        Formatted ticket documents, richest comment thread first.
    """
    # TOP cannot be bound as a query parameter here, so coerce to int to
    # guarantee the f-string interpolation below cannot inject SQL.
    top = int(limit)
    cursor = conn.cursor(as_dict=True)
    cursor.execute(f"""
        SELECT TOP {top} Id, Name, Description, Comment,
               ServiceId, TypeId, StatusId, PriorityId
        FROM Task
        WHERE Comment IS NOT NULL AND LEN(Comment) > 100
        ORDER BY LEN(Comment) DESC
    """)

    docs: list[str] = []
    for row in cursor:
        service = lookups["Service"].get(row["ServiceId"], "")
        task_type = lookups["TaskType"].get(row["TypeId"], "")
        status = lookups["Status"].get(row["StatusId"], "")
        priority = lookups["Priority"].get(row["PriorityId"], "")

        desc = strip_html(row["Description"])
        comment = strip_html(row["Comment"])

        # Only non-empty fields are included, keeping documents compact.
        parts = [f"Тикет #{row['Id']}: {row['Name'] or ''}"]
        if service:
            parts.append(f"Сервис: {service}")
        if task_type:
            parts.append(f"Тип: {task_type}")
        if status:
            parts.append(f"Статус: {status}")
        if priority:
            parts.append(f"Приоритет: {priority}")
        if desc:
            parts.append(f"Описание: {desc}")
        if comment:
            parts.append(f"Переписка: {comment}")

        docs.append("\n".join(parts))

    return docs


def fetch_kb_articles(conn, limit: int = 50) -> list[str]:
    """Fetch published knowledge-base articles, best-rated first.

    Args:
        conn: Open pymssql connection (see get_connection()).
        limit: Maximum number of articles to return.

    Returns:
        Plain-text documents: a header line with the article id and name,
        followed by the HTML-stripped body when non-empty.
    """
    # TOP cannot be bound as a query parameter, so coerce to int to
    # guarantee the f-string interpolation below cannot inject SQL.
    top = int(limit)
    cursor = conn.cursor(as_dict=True)
    cursor.execute(f"""
        SELECT TOP {top} Id, Name, Description
        FROM KBDocument
        WHERE IsPublished = 1 AND Description IS NOT NULL AND LEN(Description) > 50
        ORDER BY Rating DESC, Id DESC
    """)

    docs: list[str] = []
    for row in cursor:
        desc = strip_html(row["Description"])
        parts = [f"Статья базы знаний #{row['Id']}: {row['Name'] or ''}"]
        if desc:
            parts.append(desc)
        docs.append("\n".join(parts))

    return docs


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

async def main():
    """Run the full ETL: extract from MSSQL, then build the RAGU knowledge graph.

    Steps: load .env config, set up the (rate-limited) LLM client and a local
    embedder, pull the top KB articles and tickets from MSSQL, and feed all
    documents through RAGU's GraphRAG build pipeline. The resulting graph is
    persisted under the RAGU_STORAGE folder.
    """
    load_dotenv()

    # RAGU persists all artifacts under this folder; summaries are in Russian.
    storage = os.getenv("RAGU_STORAGE", "ragu_data")
    Settings.storage_folder = storage
    Settings.language = "russian"

    # Conservative rate limits: 5 requests/min, one at a time, 8 s between
    # calls, with long exponential-style retry backoffs — presumably tuned
    # for the GitHub Models free tier (TODO confirm against provider limits).
    llm_client = CachedAsyncOpenAI(
        base_url=os.getenv("LLM_BASE_URL", "https://models.github.ai/inference"),
        api_key=os.getenv("LLM_API_KEY", ""),
        rate_max_per_minute=5,
        rate_max_simultaneous=1,
        rate_min_delay=8,
        retry_times_sec=(30, 60, 120, 240),
    )
    llm = LLMOpenAI(client=llm_client, model_name=os.getenv("LLM_MODEL", "openai/gpt-4.1-mini"))

    # Embeddings are computed locally (no API rate limits) — see LocalEmbedder.
    print("[ETL] Loading local embedder...")
    embedder = LocalEmbedder("intfloat/multilingual-e5-small")

    # Extract from MSSQL
    print("[ETL] Connecting to MSSQL...")
    conn = get_connection()

    print("[ETL] Loading lookups...")
    lookups = fetch_lookups(conn)
    for table, data in lookups.items():
        print(f" {table}: {len(data)} entries")

    # limit=10 overrides the fetchers' default of 50 — keeps LLM cost low.
    print("[ETL] Extracting top 10 KB articles (by rating)...")
    kb_docs = fetch_kb_articles(conn, limit=10)
    print(f" {len(kb_docs)} KB articles")

    print("[ETL] Extracting top 10 tickets (richest Q&A threads)...")
    ticket_docs = fetch_tickets(conn, lookups, limit=10)
    print(f" {len(ticket_docs)} tickets")

    conn.close()

    all_docs = kb_docs + ticket_docs
    print(f"[ETL] Total: {len(all_docs)} documents")

    # Build full Knowledge Graph (entities, relations, communities, summaries)
    print("[ETL] Building Knowledge Graph...")
    extractor = ArtifactsExtractorLLM(llm=llm, do_validation=False)

    kg = KnowledgeGraph(
        llm=llm,
        embedder=embedder,
        chunker=SimpleChunker(max_chunk_size=1000),
        artifact_extractor=extractor,
        builder_settings=BuilderArguments(
            use_llm_summarization=True,
            vectorize_chunks=True,
        ),
    )

    await kg.build_from_docs(all_docs)

    print(f"[ETL] Done! Knowledge Graph saved to {storage}/")


# Script entry point: run the async ETL pipeline end to end.
if __name__ == "__main__":
    asyncio.run(main())
36 changes: 36 additions & 0 deletions etl/local_embedder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Local sentence-transformers embedder implementing RAGU's Embedder interface."""

from typing import Any
from ragu.models.embedder import Embedder, FLOATS


class LocalEmbedder(Embedder):
    """Embedder backed by a local sentence-transformers model.

    Runs entirely in-process: no API calls and no rate limits.
    """

    def __init__(self, model_name: str = "intfloat/multilingual-e5-small"):
        # Imported here so merely importing this module does not require
        # the sentence-transformers package.
        from sentence_transformers import SentenceTransformer

        self._model = SentenceTransformer(model_name)
        self._dim = self._model.get_sentence_embedding_dimension()
        print(f"[LocalEmbedder] Loaded {model_name}, dim={self._dim}")

    @property
    def dim(self) -> int:
        """Dimensionality of the vectors this embedder produces."""
        return self._dim

    async def embed_text(self, text: str, **kwargs: Any) -> list[float]:
        """Embed one text and return its L2-normalized vector as a list."""
        vector = self._model.encode(text, normalize_embeddings=True)
        return vector.tolist()

    async def batch_embed_text(
        self,
        texts: list[str],
        desc: str | None = None,
        **kwargs: Any,
    ) -> list[list[float]] | FLOATS:
        """Embed many texts in batches of 64.

        A progress bar is shown only when a non-empty ``desc`` is supplied.
        """
        matrix = self._model.encode(
            texts,
            batch_size=64,
            normalize_embeddings=True,
            show_progress_bar=bool(desc),
        )
        return matrix.tolist()
Loading