Python client library for the Cudara inference server (Ollama-compatible API).
This package provides a small, synchronous httpx-based client for:
- Chat (/api/chat)
- Text generation (/api/generate)
- Embeddings (/api/embeddings)
- Vision helpers (send images via /api/generate)
- Audio transcription (/api/transcribe)
- Prompt building + output parsing utilities
Installation:

uv add cudara-client
# or
pip install cudara-client

You need a running Cudara server to use this client.
# Pull and run
docker run --gpus all -p 8000:8000 ghcr.io/juliog922/cudara:latest
# With persistent models
docker run --gpus all -p 8000:8000 \
    -v cudara_models:/app/models \
    ghcr.io/juliog922/cudara:latest

Run from source:

git clone https://github.com/juliog922/cudara
cd cudara
uv sync
# Run server
uv run cudara serve

Health check:

curl http://localhost:8000/health

Quickstart:

from cudara_client import CudaraClient

with CudaraClient("http://localhost:8000") as client:
    # Optionally pull/download a model (if enabled on your server)
    client.pull("Qwen/Qwen2.5-3B-Instruct")
    resp = client.chat("Qwen/Qwen2.5-3B-Instruct", "Hello!")
    print(resp.content)

Chat with explicit messages and generation options:

from cudara_client import CudaraClient, Message, GenerationOptions
with CudaraClient("http://localhost:8000") as client:
    resp = client.chat(
        "Qwen/Qwen2.5-3B-Instruct",
        [
            Message(role="system", content="You are concise."),
            Message(role="user", content="Explain embeddings in 2 bullets."),
        ],
        options=GenerationOptions(temperature=0.2, max_tokens=128),
    )
    print(resp.content)

Text generation:

from cudara_client import CudaraClient, GenerationOptions
with CudaraClient("http://localhost:8000") as client:
    resp = client.generate(
        "Qwen/Qwen2.5-3B-Instruct",
        "Write a haiku about GPUs.",
        system="You are a poet.",
        options=GenerationOptions(max_tokens=64, temperature=0.8),
    )
    print(resp.content)

Embeddings:

from cudara_client import CudaraClient
with CudaraClient("http://localhost:8000") as client:
    out = client.embed("sentence-transformers/all-MiniLM-L6-v2", ["hello", "world"])
    print(len(out.embeddings), len(out.embeddings[0]))

Single string convenience:
from cudara_client import CudaraClient

with CudaraClient("http://localhost:8000") as client:
    out = client.embed("sentence-transformers/all-MiniLM-L6-v2", "hello")
    vec = out.embedding
    print(len(vec))

Vision: if your server is running a vision-language model, you can send images.
vision() reads an image and sends it as base64 via /api/generate.
from cudara_client import CudaraClient

with CudaraClient("http://localhost:8000") as client:
    resp = client.vision(
        "your-vision-model",
        "Describe the image.",
        image_path="cat.jpg",
    )
    print(resp.content)

OCR helper:

from cudara_client import CudaraClient
with CudaraClient("http://localhost:8000") as client:
    text = client.ocr("your-vision-model", "receipt.png")
    print(text)

Audio transcription: Cudara supports multipart transcription at /api/transcribe.
from cudara_client import CudaraClient

with CudaraClient("http://localhost:8000") as client:
    out = client.transcribe("openai/whisper-small", "audio.wav", language="en")
    print(out.text)

Error handling: any 4xx/5xx response raises CudaraError.
from cudara_client import CudaraClient, CudaraError

try:
    with CudaraClient("http://localhost:8000") as client:
        client.generate("unknown-model", "hello")
except CudaraError as e:
    print("Request failed:", e)

Development:

uv sync --dev --locked
uv run pytest -v

License: MIT