forked from mgerstgrasser/tacheles
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.dev.yaml
More file actions
93 lines (90 loc) · 2.64 KB
/
docker-compose.dev.yaml
File metadata and controls
93 lines (90 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Development stack: an LLM inference server, the backend API and the
# frontend dev server. ${...} references (MODEL, HF_TOKEN, ...) are
# substituted by Compose when the file is loaded.
version: "3"

services:
  # ---------------------------------------------------------------------
  # Alternative inference service using vllm.
  # It uses the same service name (`inference`) as the sglang service
  # further down, so only one of the two may be uncommented at a time.
  # ---------------------------------------------------------------------
  # inference:
  #   image: vllm/vllm-openai:latest
  #   # build:
  #   #   context: ./inference
  #   #   dockerfile: Dockerfile.vllm
  #   ports:
  #     - "8000:8000"
  #     # - "5679:5679"
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             device_ids: ["0"]
  #             # count: 1
  #             capabilities: [gpu]
  #   volumes:
  #     - ~/.cache/huggingface:/root/.cache/huggingface
  #     # Uncomment to use local model by mounting it inside the container
  #     # - ${MODELPATH}:/model
  #   environment:
  #     - HF_TOKEN=${HF_TOKEN}
  #     - MODEL=${MODEL}
  #   # entrypoint:
  #   #   [
  #   #     "sh",
  #   #     "-c",
  #   #     "pip install debugpy -t /tmp && python3 /tmp/debugpy --listen 0.0.0.0:5679 -m vllm.entrypoints.openai.api_server --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}"
  #   #   ]
  #   command: --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}
  #   shm_size: '2gb'

  # ---------------------------------------------------------------------
  # Inference service using sglang, built from ./inference/Dockerfile.sglang.
  # ---------------------------------------------------------------------
  inference:
    build:
      context: ./inference
      dockerfile: Dockerfile.sglang
    ports:
      # Same port the backend targets via INFERENCE_API_URI.
      - "8000:8000"
      # Port the optional debugpy entrypoint (commented out below) listens on.
      - "5679:5679"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              # Alternatively specify specific GPU device IDs
              # device_ids: ["2"]
              capabilities: [gpu]
    volumes:
      # Bind-mount the host's Hugging Face cache directory into the container.
      - ~/.cache/huggingface:/root/.cache/huggingface
      # Uncomment to use local model by mounting it inside the container
      # - ${MODELPATH}:/model
    environment:
      # `:-` keeps the value empty when HF_TOKEN is unset, without Compose
      # printing an "variable is not set" warning on every invocation.
      HF_TOKEN: "${HF_TOKEN:-}"
      MODEL: "${MODEL}"
    # Uncomment to start sglang under debugpy (listening on 5679):
    # entrypoint:
    #   [
    #     "sh",
    #     "-c",
    #     "pip install debugpy -t /tmp && python3 /tmp/debugpy --listen 0.0.0.0:5679 -m sglang.launch_server --port 8000 --host 0.0.0.0 --model-path ${MODEL} --tp ${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}",
    #   ]
    # NOTE(review): the flags on the next line are the ones used by the vllm
    # service above; the sglang entrypoint above uses --model-path / --tp
    # instead. Confirm against Dockerfile.sglang before uncommenting.
    # command: --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}
    shm_size: "2gb"

  # ---------------------------------------------------------------------
  # Backend, built from ./backend/Dockerfile.dev.
  # ---------------------------------------------------------------------
  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile.dev
    environment:
      # Reaches the `inference` service by its Compose service name.
      INFERENCE_API_URI: "http://inference:8000/v1"
      MODEL: "${MODEL}"
    ports:
      # Port referenced by the frontend's REACT_APP_BACKEND_URL.
      - "8001:8001"
      # NOTE(review): presumably a debugger port (cf. 5679 on inference) - confirm.
      - "5678:5678"
    volumes:
      # Bind-mount the backend sources over /app in the container.
      - ./backend:/app

  # ---------------------------------------------------------------------
  # Frontend, built from the default Dockerfile in ./frontend.
  # ---------------------------------------------------------------------
  frontend:
    build:
      context: ./frontend
    ports:
      - "3000:3000"
    volumes:
      # Bind-mount only the source and static asset directories.
      - ./frontend/src:/app/src
      - ./frontend/public:/app/public
    depends_on:
      - backend
    environment:
      # Host-side address of the backend's published 8001 port.
      REACT_APP_BACKEND_URL: "http://localhost:8001"