start.sh
#!/usr/bin/env bash
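# Fail fast: exit on first error, on use of unset variables, and on pipeline failures.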
set -euo pipefail
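#
# Example invocations (port values are illustrative):
#   ./start.sh                                  # defaults: Gemma-4 model, FastAPI on :8012
#   WEB_PORT=9000 LLAMA_PORT=8010 ./start.sh    # custom ports
#   SKIP_LLAMACPP=1 ./start.sh                  # reuse an already-running llama-server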
# Configurables (can be overridden via environment variables)
WEB_PORT=${WEB_PORT:-8012}
LLAMA_PORT=${LLAMA_PORT:-8009}
LLAMA_CTX=${LLAMA_CTX:-8192}
LLAMA_BIN=${LLAMA_BIN:-./llama.cpp/build/bin/llama-server}
LLAMA_PARALLEL=${LLAMA_PARALLEL:-1}
SKIP_LLAMACPP=${SKIP_LLAMACPP:-0}
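# Default paths and names for the two known model configurations (Gemma-4 and Qwen3.5).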
DEFAULT_MODEL_GEMMA4="models/gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf"
DEFAULT_MMPROJ_GEMMA4="models/mmproj-F16.gguf"
DEFAULT_MODEL_NAME_GEMMA4="Gemma-4-26B-A4B-It"
DEFAULT_MODEL_QWEN35="models/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
DEFAULT_CHAT_TEMPLATE_QWEN35="app/chat_templates/qwen3.5-35b-a3b.chat_template.jinja"
LLAMA_MODEL="${LLAMA_MODEL:-$DEFAULT_MODEL_GEMMA4}"
LLAMA_MMPROJ="${LLAMA_MMPROJ:-$DEFAULT_MMPROJ_GEMMA4}"
LLAMA_MODEL_NAME="${LLAMA_MODEL_NAME:-$DEFAULT_MODEL_NAME_GEMMA4}"
LLAMA_CHAT_TEMPLATE_FILE="${LLAMA_CHAT_TEMPLATE_FILE:-}"
LLAMA_CHAT_TEMPLATE_KWARGS="${LLAMA_CHAT_TEMPLATE_KWARGS:-}"
LLAMA_REASONING="${LLAMA_REASONING:-}"
LLAMA_THINK_BUDGET="${LLAMA_THINK_BUDGET:-}"
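# Detect a Gemma-4 model from its filename; used below to pick reasoning defaults.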
is_gemma4_model=0
case "$LLAMA_MODEL" in
  *gemma-4*|*Gemma-4*)
    is_gemma4_model=1
    ;;
esac
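# Fall back to the LLAMA_ARG_* variable names when the primary ones are unset.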
if [ -z "$LLAMA_CHAT_TEMPLATE_FILE" ] && [ -n "${LLAMA_ARG_CHAT_TEMPLATE_FILE:-}" ]; then
LLAMA_CHAT_TEMPLATE_FILE="$LLAMA_ARG_CHAT_TEMPLATE_FILE"
fi
if [ -z "$LLAMA_THINK_BUDGET" ] && [ -n "${LLAMA_ARG_THINK_BUDGET:-}" ]; then
LLAMA_THINK_BUDGET="$LLAMA_ARG_THINK_BUDGET"
fi
if [ -z "$LLAMA_THINK_BUDGET" ] && [ -n "${LAMA_ARG_THINK_BUDGET:-}" ]; then
LLAMA_THINK_BUDGET="$LAMA_ARG_THINK_BUDGET"
echo "[WARN] LAMA_ARG_THINK_BUDGET is a typo; use LLAMA_THINK_BUDGET instead."
fi
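# Gemma-4 defaults: reasoning off and a zero thinking budget unless overridden.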
if [ "$is_gemma4_model" = "1" ]; then
if [ -z "$LLAMA_REASONING" ]; then
LLAMA_REASONING=off
fi
if [ -z "$LLAMA_THINK_BUDGET" ]; then
LLAMA_THINK_BUDGET=0
fi
fi
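# Qwen3.5 defaults: the bundled chat template (if present) and a zero thinking budget.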
if [ "$LLAMA_MODEL" = "$DEFAULT_MODEL_QWEN35" ]; then
if [ -z "$LLAMA_CHAT_TEMPLATE_FILE" ] && [ -f "$DEFAULT_CHAT_TEMPLATE_QWEN35" ]; then
LLAMA_CHAT_TEMPLATE_FILE="$DEFAULT_CHAT_TEMPLATE_QWEN35"
fi
if [ -z "$LLAMA_THINK_BUDGET" ]; then
LLAMA_THINK_BUDGET=0
fi
fi
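# uv provides the project-local Python environment; refuse to run without it.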
if ! command -v uv >/dev/null 2>&1; then
  echo "[ERROR] uv is required. Install via: pip install uv" >&2
  exit 1
fi
# Install Python deps inside .venv via uv (no global installs)
uv sync
# Initialize TTS models and dictionary
uv run app/scripts/setup_tts.py
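# Launch llama.cpp locally unless the caller opts out with SKIP_LLAMACPP=1.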
if [ "$SKIP_LLAMACPP" != "1" ]; then
if [ ! -x "$LLAMA_BIN" ]; then
echo "[ERROR] llama-server binary not found at $LLAMA_BIN"
echo "Run scripts/build_llama.sh to clone & build llama.cpp with CUDA."
exit 1
fi
if [ ! -f "$LLAMA_MODEL" ]; then
echo "[ERROR] model file not found at $LLAMA_MODEL"
exit 1
fi
if [ ! -f "$LLAMA_MMPROJ" ]; then
echo "[ERROR] mmproj file not found at $LLAMA_MMPROJ"
exit 1
fi
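  # Assemble the llama-server command line.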
  LLAMA_ARGS=(
    --host 127.0.0.1
    --port "$LLAMA_PORT"
    --parallel "$LLAMA_PARALLEL"
    -m "$LLAMA_MODEL"
    -c "$LLAMA_CTX"
    -ngl 999
    --jinja
    --flash-attn on
    --mmproj "$LLAMA_MMPROJ"
  )
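  # Optional flags, appended only when the corresponding variable is set.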
if [ -n "$LLAMA_CHAT_TEMPLATE_FILE" ]; then
LLAMA_ARGS+=(--chat-template-file "$LLAMA_CHAT_TEMPLATE_FILE")
fi
if [ -n "$LLAMA_CHAT_TEMPLATE_KWARGS" ]; then
LLAMA_ARGS+=(--chat-template-kwargs "$LLAMA_CHAT_TEMPLATE_KWARGS")
fi
if [ -n "$LLAMA_REASONING" ]; then
LLAMA_ARGS+=(--reasoning "$LLAMA_REASONING")
fi
if [ -n "$LLAMA_THINK_BUDGET" ]; then
LLAMA_ARGS+=(--reasoning-budget "$LLAMA_THINK_BUDGET")
fi
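  # Log the effective configuration before launch.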
echo "[INFO] starting llama.cpp server on port $LLAMA_PORT"
echo "[INFO] model: $LLAMA_MODEL"
echo "[INFO] model name: $LLAMA_MODEL_NAME"
echo "[INFO] mmproj: $LLAMA_MMPROJ"
echo "[INFO] parallel: $LLAMA_PARALLEL"
if [ -n "$LLAMA_CHAT_TEMPLATE_FILE" ]; then
echo "[INFO] chat template: $LLAMA_CHAT_TEMPLATE_FILE"
fi
if [ -n "$LLAMA_CHAT_TEMPLATE_KWARGS" ]; then
echo "[INFO] chat template kwargs: $LLAMA_CHAT_TEMPLATE_KWARGS"
fi
if [ -n "$LLAMA_REASONING" ]; then
echo "[INFO] reasoning: $LLAMA_REASONING"
fi
if [ -n "$LLAMA_THINK_BUDGET" ]; then
echo "[INFO] reasoning budget: $LLAMA_THINK_BUDGET"
fi
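  # Start llama-server in the background; stdout/stderr go to llama-server.log.
  # The EXIT trap kills it when this script terminates.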
  set +e
  "$LLAMA_BIN" "${LLAMA_ARGS[@]}" > llama-server.log 2>&1 &
  LLAMA_PID=$!
  set -e
  trap 'echo "[INFO] stopping llama.cpp"; kill $LLAMA_PID 2>/dev/null || true' EXIT
else
  echo "[INFO] SKIP_LLAMACPP=1 -> assuming llama-server is already running on port $LLAMA_PORT"
fi
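# Hand the server location and model settings to the FastAPI app via the environment.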
export LLAMA_SERVER_URL="${LLAMA_SERVER_URL:-http://127.0.0.1:${LLAMA_PORT}}"
export LLAMA_CTX
export LLAMA_MODEL_NAME
echo "[INFO] starting FastAPI on port $WEB_PORT"
uv run uvicorn app.main:app --host 127.0.0.1 --port "$WEB_PORT"