From 6342c11260c6664bbbcfc507ebd5f9fb46781ff2 Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:17:40 +0200 Subject: [PATCH 01/19] more parameters for openai api --- src/engine.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/engine.py b/src/engine.py index ddf5257..64fd412 100644 --- a/src/engine.py +++ b/src/engine.py @@ -100,7 +100,7 @@ class OpenAIRequest: def __init__(self, base_url="http://0.0.0.0:30000/v1", api_key="EMPTY"): self.client = openai.Client(base_url=base_url, api_key=api_key) - async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False): + async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0): if messages is None: messages = [ {"role": "system", "content": "You are a helpful AI assistant"}, @@ -111,7 +111,12 @@ async def request_chat_completions(self, model="default", messages=None, max_tok model=model, messages=messages, max_tokens=max_tokens, - stream=stream + stream=stream, + frequency_penalty=frequency_penalty, + n=n, + stop=stop, + temperature=temperature, + top_p=top_p ) if stream: @@ -120,12 +125,17 @@ async def request_chat_completions(self, model="default", messages=None, max_tok else: yield response.to_dict() - async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False): + async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0): response = self.client.completions.create( model=model, prompt=prompt, max_tokens=max_tokens, - stream=stream + stream=stream, + frequency_penalty=frequency_penalty, + n=n, + stop=stop, + temperature=temperature, + top_p=top_p ) if stream: From db5e61a7863d5890fea3ce119dcab10fe2d0c9e3 Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:23:42 +0200 Subject: [PATCH 02/19] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index be9f922..c33ac1d 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ ## 📖 | Getting Started 1. Clone this repository. -2. Build a docker image - ```docker build -t :worker-sglang:v1 .``` -3. ```docker push :worker-sglang:v1``` +2. Build a docker image - ```docker build -t /worker-sglang:v1 .``` +3. ```docker push /worker-sglang:v1``` ***Once you have built the Docker image and deployed the endpoint, you can use the code below to interact with the endpoint***: From 06fd157de40f8c384e562070509cff29f31d91df Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:16:28 +0200 Subject: [PATCH 03/19] Add files via upload --- .github/workflows/docker-build-push.yml | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/docker-build-push.yml diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 0000000..a2d4ace --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,28 @@ +name: Docker Build and Push + +on: + push: + branches: [ main ] # Adjust this if your main branch has a different name + +jobs: + build-and-push: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/your-repo-name:latest From 54185142e0c53f48d0141cdd7103e0959d55bf11 Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:16:59 +0200 Subject: [PATCH 04/19] Update docker-build-push.yml --- .github/workflows/docker-build-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index a2d4ace..3d91639 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -25,4 +25,4 @@ jobs: with: context: . push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/your-repo-name:latest + tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest From 0c3c59de663d5af38f012fd438a07717861dcdd1 Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:17:22 +0200 Subject: [PATCH 05/19] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c33ac1d..cfac1c0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
-

SgLang Worker

+

SgLang Worker

🚀 | SGLang is yet another fast serving framework for large language models and vision language models.
From 5223e056ac0a5656261aa749275b0a9a8666b51a Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Thu, 22 Aug 2024 13:17:32 +0000 Subject: [PATCH 06/19] Update package version --- builder/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builder/requirements.txt b/builder/requirements.txt index db07106..056b1ea 100644 --- a/builder/requirements.txt +++ b/builder/requirements.txt @@ -1,7 +1,7 @@ ray pandas pyarrow -runpod==1.6.2 +runpod==1.7.0 huggingface-hub packaging typing-extensions==4.7.1 From e0483fd4c867b0c3bdc5fc71bfe8b491ced55661 Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 16:52:31 +0200 Subject: [PATCH 07/19] Update Dockerfile Cuda / flashinfer new versions --- Dockerfile | 59 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2c994ed..d90213a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,52 @@ -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 +ARG CUDA_VERSION=12.1.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +ARG BUILD_TYPE=all +ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update -y \ - && apt-get install -y python3-pip +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt update -y \ + && apt install software-properties-common -y \ + && add-apt-repository ppa:deadsnakes/ppa -y && apt update \ + && apt install python3.10 python3.10-dev -y \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \ + && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \ + && apt install curl git sudo -y \ + && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \ + && python3 --version \ + && python3 -m pip --version \ + && rm -rf /var/lib/apt/lists/* \ + && apt clean -RUN ldconfig /usr/local/cuda-12.1/compat/ +WORKDIR /sgl-workspace -# Install Python dependencies -COPY builder/requirements.txt /requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install --upgrade -r /requirements.txt +RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ + && git clone --depth=1 https://github.com/sgl-project/sglang.git \ + && cd sglang \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + python3 -m pip --no-cache-dir install -e "python[srt]"; \ + else \ + python3 -m pip --no-cache-dir install -e "python[all]"; \ + fi -# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer -RUN python3 -m pip install "sglang[all]" && \ - python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 +ARG CUDA_VERSION +RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \ + export CUDA_IDENTIFIER=cu121 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \ + elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ + export CUDA_IDENTIFIER=cu124 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \ + elif [ "$CUDA_VERSION" = "11.8.0" ]; then \ + export CUDA_IDENTIFIER=cu118 && \ + python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \ + else \ + echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ + fi + +RUN python3 -m pip cache purge + +ENV DEBIAN_FRONTEND=interactive # Setup for Option 2: Building the Image with the Model included ARG MODEL_NAME="" @@ -47,4 +80,4 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \ fi # Start the handler -CMD ["python3", "/src/handler.py"] \ No newline at end of file +CMD ["python3", "/src/handler.py"] From 8cdebaa2fce8558390f59863b3745db2cff54bef Mon Sep 17 00:00:00 2001 From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com> Date: Thu, 22 Aug 2024 17:12:46 +0200 Subject: [PATCH 08/19] Update docker-build-push.yml --- .github/workflows/docker-build-push.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index 3d91639..e94c3a7 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -6,7 +6,7 @@ on: jobs: build-and-push: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-xl steps: - name: Checkout code uses: actions/checkout@v2 @@ -26,3 +26,11 @@ jobs: context: . push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Cleanup + if: always() + run: | + docker system prune -af + df -h From cdfb17682139b696aa9d6d2281b0f96939a80fff Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Thu, 22 Aug 2024 18:14:13 +0200 Subject: [PATCH 09/19] . --- .github/workflows/docker-build-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml index e94c3a7..41d30e1 100644 --- a/.github/workflows/docker-build-push.yml +++ b/.github/workflows/docker-build-push.yml @@ -6,7 +6,7 @@ on: jobs: build-and-push: - runs-on: ubuntu-latest-xl + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v2 From f9866a1f1c3fca364a874f57e1fab7746fc2d147 Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Thu, 22 Aug 2024 18:24:37 +0200 Subject: [PATCH 10/19] more recent version of cuda and flashinfer --- Dockerfile | 52 ++++++++++++---------------------------------------- 1 file changed, 12 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index d90213a..2152fa3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,53 +1,25 @@ ARG CUDA_VERSION=12.1.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -ARG BUILD_TYPE=all ENV DEBIAN_FRONTEND=noninteractive -RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ - && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ - && apt update -y \ - && apt install software-properties-common -y \ - && add-apt-repository ppa:deadsnakes/ppa -y && apt update \ - && apt install python3.10 python3.10-dev -y \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \ - && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \ - && apt install curl git sudo -y \ - && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \ - && python3 --version \ - && python3 -m pip --version \ - && rm -rf /var/lib/apt/lists/* \ +RUN apt-get update -y \ + && apt-get install -y python3-pip \ && apt clean -WORKDIR /sgl-workspace +RUN ldconfig /usr/local/cuda-12.1/compat/ -RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ - && git clone --depth=1 https://github.com/sgl-project/sglang.git \ - && cd sglang \ - && if [ "$BUILD_TYPE" = "srt" ]; then \ - python3 -m pip --no-cache-dir install -e "python[srt]"; \ - else \ - python3 -m pip --no-cache-dir install -e "python[all]"; \ - fi +# Install Python dependencies +COPY builder/requirements.txt /requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install --upgrade -r /requirements.txt -ARG CUDA_VERSION -RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \ - export CUDA_IDENTIFIER=cu121 && \ - python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \ - elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ - export CUDA_IDENTIFIER=cu124 && \ - python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \ - elif [ "$CUDA_VERSION" = "11.8.0" ]; then \ - export CUDA_IDENTIFIER=cu118 && \ - python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \ - python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \ - else \ - echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ - fi +# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer +RUN python3 -m pip install "sglang[all]" && \ + python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4 RUN python3 -m pip cache purge -ENV DEBIAN_FRONTEND=interactive - # Setup for Option 2: Building the Image with the Model included ARG MODEL_NAME="" ARG TOKENIZER_NAME="" @@ -80,4 +52,4 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \ fi # Start the handler -CMD ["python3", "/src/handler.py"] +CMD ["python3", "/src/handler.py"] \ No newline at end of file From 2def68446a50771f8220ee5a8a933785543d390b Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Thu, 22 Aug 2024 18:55:46 +0200 Subject: [PATCH 11/19] flashinfer / cuda updated --- Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2152fa3..0d53b21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG CUDA_VERSION=12.1.1 +ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive @@ -15,8 +15,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade -r /requirements.txt # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer -RUN python3 -m pip install "sglang[all]" && \ - python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4 + +RUN export CUDA_IDENTIFIER=cu124 && \ + python3 -m pip install "sglang[all]" && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; RUN python3 -m pip cache purge From 22523507111558d46e7d3aa5e19b392b6d906483 Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 08:09:49 +0200 Subject: [PATCH 12/19] python updated --- Dockerfile | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0d53b21..b526eee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,19 @@ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update -y \ - && apt-get install -y python3-pip \ +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt update -y \ + && apt install software-properties-common -y \ + && add-apt-repository ppa:deadsnakes/ppa -y && apt update \ + && apt install python3.10 python3.10-dev -y \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \ + && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \ + && apt install curl git sudo -y \ + && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \ + && python3 --version \ + && python3 -m pip --version \ + && rm -rf /var/lib/apt/lists/* \ && apt clean RUN ldconfig /usr/local/cuda-12.1/compat/ @@ -17,7 +28,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer RUN export CUDA_IDENTIFIER=cu124 && \ - python3 -m pip install "sglang[all]" && \ + python3 -m pip install --upgrade pip setuptools wheel html5lib six \ + python3 -m pip install --upgrade "sglang[all]" && \ python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; RUN python3 -m pip cache purge From 721c951bb8c9eb2917bca9be7a36af46c9ef4a8b Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 08:42:00 +0200 Subject: [PATCH 13/19] python updated --- Dockerfile | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index b526eee..ffecb52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,18 +19,35 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN ldconfig /usr/local/cuda-12.1/compat/ +WORKDIR /sgl-workspace + +RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ + && git clone --depth=1 https://github.com/sgl-project/sglang.git \ + && cd sglang \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + python3 -m pip --no-cache-dir install -e "python[srt]"; \ + else \ + python3 -m pip --no-cache-dir install -e "python[all]"; \ + fi + +ARG CUDA_VERSION +RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \ + export CUDA_IDENTIFIER=cu121 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \ + elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ + export CUDA_IDENTIFIER=cu124 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \ + elif [ "$CUDA_VERSION" = "11.8.0" ]; then \ + export CUDA_IDENTIFIER=cu118 && \ + python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \ + else \ + echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ + fi + # Install Python dependencies COPY builder/requirements.txt /requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install --upgrade -r /requirements.txt - -# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer - -RUN export CUDA_IDENTIFIER=cu124 && \ - python3 -m pip install --upgrade pip setuptools wheel html5lib six \ - python3 -m pip install --upgrade "sglang[all]" && \ - python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; +RUN python3 -m pip install --upgrade -r /requirements.txt RUN python3 -m pip cache purge From 648ae6bd0bacd0faf533f6a5b3e7622a32366087 Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 09:04:07 +0200 Subject: [PATCH 14/19] typing extensions updated --- builder/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builder/requirements.txt b/builder/requirements.txt index 056b1ea..751628d 100644 --- a/builder/requirements.txt +++ b/builder/requirements.txt @@ -4,7 +4,7 @@ pyarrow runpod==1.7.0 huggingface-hub packaging -typing-extensions==4.7.1 +typing-extensions==4.11.0 pydantic pydantic-settings hf-transfer \ No newline at end of file From 4643d01fd51782f49bdb65c59f31b3acb33802bb Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 09:35:17 +0200 Subject: [PATCH 15/19] some trace to debug --- src/engine.py | 2 ++ src/handler.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/engine.py b/src/engine.py index 64fd412..63e68a5 100644 --- a/src/engine.py +++ b/src/engine.py @@ -74,6 +74,8 @@ def start_server(self): if os.getenv(flag, '').lower() in ('true', '1', 'yes'): command.append(f"--{flag.lower().replace('_', '-')}") + print("LAUNCH SERVER COMMAND:") + print(command) self.process = subprocess.Popen(command, stdout=None, stderr=None) print(f"Server started with PID: {self.process.pid}") diff --git a/src/handler.py b/src/handler.py index ccf925e..4ae6ad2 100644 --- a/src/handler.py +++ b/src/handler.py @@ -8,6 +8,7 @@ engine.start_server() engine.wait_for_server() +print(f" ==== start_server") async def async_handler(job): """Handle the requests asynchronously.""" From 3e52cbdad6ddfb3f642958009c0b9fb0a255bd38 Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 09:47:04 +0200 Subject: [PATCH 16/19] some trace to debug + max_concurrency --- src/handler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/handler.py b/src/handler.py index 4ae6ad2..69a4c06 100644 --- a/src/handler.py +++ b/src/handler.py @@ -2,6 +2,7 @@ import requests from engine import SGlangEngine, OpenAIRequest import runpod +import os # Initialize the engine engine = SGlangEngine() @@ -41,7 +42,10 @@ async def async_handler(job): else: yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text} -runpod.serverless.start({"handler": async_handler, "return_aggregate_stream": True}) +max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100)) +print(f"MAX_CONCURRENCY {max_concurrency}") + +runpod.serverless.start({"handler": async_handler, "concurrency_modifier": max_concurrency, "return_aggregate_stream": True}) # # Ensure the server is shut down when the serverless function is terminated # import atexit From 9829d0e6309a189fc2cc94f89868f7dd1565d69d Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 09:52:11 +0200 Subject: [PATCH 17/19] some trace to debug + max_concurrency --- src/handler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/handler.py b/src/handler.py index 69a4c06..71ffcc0 100644 --- a/src/handler.py +++ b/src/handler.py @@ -45,7 +45,9 @@ async def async_handler(job): max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100)) print(f"MAX_CONCURRENCY {max_concurrency}") -runpod.serverless.start({"handler": async_handler, "concurrency_modifier": max_concurrency, "return_aggregate_stream": True}) +runpod.serverless.start({"handler": async_handler, + "concurrency_modifier": lambda x: max_concurrency, + "return_aggregate_stream": True}) # # Ensure the server is shut down when the serverless function is terminated # import atexit From 605957349b63b9cac1f2972cbb1481e93ebf2469 Mon Sep 17 00:00:00 2001 From: supa-thibaud Date: Fri, 23 Aug 2024 13:16:33 +0200 Subject: [PATCH 18/19] some trace to debug + max concurrency --- src/engine.py | 2 +- src/handler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine.py b/src/engine.py index 63e68a5..28e3861 100644 --- a/src/engine.py +++ b/src/engine.py @@ -95,7 +95,7 @@ def wait_for_server(self, timeout=300, interval=5): def shutdown(self): if self.process: self.process.terminate() - self.process.wait() + self.process.wait() print("Server shut down.") class OpenAIRequest: diff --git a/src/handler.py b/src/handler.py index 71ffcc0..3aa0693 100644 --- a/src/handler.py +++ b/src/handler.py @@ -6,7 +6,7 @@ # Initialize the engine engine = SGlangEngine() -engine.start_server() +engine.start_server() engine.wait_for_server() print(f" ==== start_server") From 856bbd6fb2e7794a0422b4e87f268b6eccbb5ba8 Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Wed, 28 Aug 2024 13:45:31 -0700 Subject: [PATCH 19/19] Delete src/handler.py --- src/handler.py | 54 -------------------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 src/handler.py diff --git a/src/handler.py b/src/handler.py deleted file mode 100644 index 3aa0693..0000000 --- a/src/handler.py +++ /dev/null @@ -1,54 +0,0 @@ -import asyncio -import requests -from engine import SGlangEngine, OpenAIRequest -import runpod -import os - -# Initialize the engine -engine = SGlangEngine() -engine.start_server() -engine.wait_for_server() - -print(f" ==== start_server") - -async def async_handler(job): - """Handle the requests asynchronously.""" - job_input = job["input"] - print(f"JOB_INPUT: {job_input}") - - if job_input.get("openai_route"): - openai_route, openai_input = job_input.get("openai_route"), job_input.get("openai_input") - openai_request = OpenAIRequest() - - if openai_route == "/v1/chat/completions": - async for chunk in openai_request.request_chat_completions(**openai_input): - yield chunk - elif openai_route == "/v1/completions": - async for chunk in openai_request.request_completions(**openai_input): - yield chunk - elif openai_route == "/v1/models": - models = await openai_request.get_models() - yield models - else: - generate_url = f"{engine.base_url}/generate" - headers = {"Content-Type": "application/json"} - generate_data = { - "text": job_input.get("prompt", ""), - "sampling_params": job_input.get("sampling_params", {}) - } - response = requests.post(generate_url, json=generate_data, headers=headers) - if response.status_code == 200: - yield response.json() - else: - yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text} - -max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100)) -print(f"MAX_CONCURRENCY {max_concurrency}") - -runpod.serverless.start({"handler": async_handler, - "concurrency_modifier": lambda x: max_concurrency, - "return_aggregate_stream": True}) - -# # Ensure the server is shut down when the serverless function is terminated -# import atexit -# atexit.register(engine.shutdown) \ No newline at end of file