# tacheles/docker-compose.dev.yaml (CBMM/tacheles, branch main)
# Development Compose stack with three services: inference, backend, frontend.

# NOTE(review): the Compose Specification marks the top-level `version` key as
# obsolete; confirm whether any tooling in use still needs it before removing.
version: "3"
# One entry per container of the stack.
services:
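  # # Using vllm (disabled alternative to the sglang service below; keep only
  # # one `inference` service uncommented at a time)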
  # inference:
  #   image: vllm/vllm-openai:latest
  #   # build:
  #   #   context: ./inference
  #   #   dockerfile: Dockerfile.vllm
  #   ports:
  #     - "8000:8000"
  #     # - "5679:5679"
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             device_ids: [ '0' ]
  #             # count: 1
  #             capabilities: [ gpu ]
  #   volumes:
  #     - ~/.cache/huggingface:/root/.cache/huggingface
  #     # Uncomment to use local model by mounting it inside the container
  #     # - ${MODELPATH}:/model
  #   environment:
  #     - HF_TOKEN=${HF_TOKEN}
  #     - MODEL=${MODEL}
  #   # entrypoint:
  #   #   [
  #   #     "sh",
  #   #     "-c",
  #   #     "pip install debugpy -t /tmp && python3 /tmp/debugpy --listen 0.0.0.0:5679 -m vllm.entrypoints.openai.api_server --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}"
  #   #   ]
  #   command: --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}
  #   shm_size: '2gb'

  # Using sglang:
  inference:
    # sglang-based inference server, built from ./inference/Dockerfile.sglang.
    build:
      context: ./inference
      dockerfile: Dockerfile.sglang
    ports:
      # API port; the backend service targets http://inference:8000/v1.
      - "8000:8000"
      # Port used by debugpy when the debug entrypoint further down is enabled.
      - "5679:5679"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            # NOTE(review): count is fixed at 1 while the debug entrypoint
            # below passes --tp ${NUM_GPUS:-1}; keep the two in sync when
            # running with NUM_GPUS > 1.
            - driver: nvidia
              count: 1
              # Alternatively specify specific GPU device IDs
              # device_ids: [ '2' ]
              capabilities: [gpu]
    volumes:
      # Bind-mount the host's Hugging Face cache directory into the container.
      - ~/.cache/huggingface:/root/.cache/huggingface
      # Uncomment to use local model by mounting it inside the container
      # - ${MODELPATH}:/model
    environment:
      # Mapping syntax, consistent with the backend and frontend services.
      # Values are substituted by Compose from the invoking environment and
      # quoted so they always stay strings.
      HF_TOKEN: "${HF_TOKEN}"
      MODEL: "${MODEL}"
    # Debug variant: installs debugpy and starts sglang.launch_server under it,
    # listening on 0.0.0.0:5679. Uncomment to use.
    # entrypoint:
    #   [
    #     "sh",
    #     "-c",
    #     "pip install debugpy -t /tmp && python3 /tmp/debugpy --listen 0.0.0.0:5679 -m sglang.launch_server --port 8000 --host 0.0.0.0 --model-path ${MODEL} --tp ${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}",
    #   ]
    # NOTE(review): the command below uses the same flag spelling as the
    # disabled vllm service (--model / --tensor-parallel-size), whereas the
    # sglang entrypoint above uses --model-path / --tp; verify before enabling.
    # command: --model=${MODEL} --port=8000 --tensor-parallel-size=${NUM_GPUS:-1} ${EXTRA_INFERENCE_ARGS:-}
    shm_size: "2gb"

  backend:
    # Backend service image, built from ./backend/Dockerfile.dev.
    build:
      dockerfile: Dockerfile.dev
      context: ./backend
    volumes:
      # Bind-mount the backend sources over /app inside the container.
      - ./backend:/app
    ports:
      - "8001:8001"
      # NOTE(review): presumably a debugger port (the inference service uses
      # 5679 for debugpy) — confirm against backend/Dockerfile.dev.
      - "5678:5678"
    environment:
      # Address of the inference service on the Compose network.
      INFERENCE_API_URI: "http://inference:8000/v1"
      MODEL: "${MODEL}"
    # NOTE(review): no depends_on for `inference` is declared even though
    # INFERENCE_API_URI points at it; confirm whether that is intentional.

  frontend:
    # Frontend image, built from ./frontend (no explicit dockerfile is set, so
    # Compose uses the default Dockerfile in that context).
    build:
      context: ./frontend
    # Started after the backend service.
    depends_on:
      - backend
    ports:
      - "3000:3000"
    volumes:
      # Bind-mount the source and static asset directories into /app.
      - ./frontend/src:/app/src
      - ./frontend/public:/app/public
    environment:
      # Matches the backend's published host port 8001.
      # NOTE(review): presumably consumed by the browser-side app, hence
      # localhost rather than the `backend` service name — confirm.
      REACT_APP_BACKEND_URL: "http://localhost:8001"