From 6342c11260c6664bbbcfc507ebd5f9fb46781ff2 Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 14:17:40 +0200
Subject: [PATCH 01/19] more parameters for openai api

---
 src/engine.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/engine.py b/src/engine.py
index ddf5257..64fd412 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -100,7 +100,7 @@ class OpenAIRequest:
     def __init__(self, base_url="http://0.0.0.0:30000/v1", api_key="EMPTY"):
         self.client = openai.Client(base_url=base_url, api_key=api_key)
     
-    async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False):
+    async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
         if messages is None:
             messages = [
                 {"role": "system", "content": "You are a helpful AI assistant"},
@@ -111,7 +111,12 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
             model=model,
             messages=messages,
             max_tokens=max_tokens,
-            stream=stream
+            stream=stream,
+            frequency_penalty=frequency_penalty,
+            n=n,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p
         )
         
         if stream:
@@ -120,12 +125,17 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
         else:
             yield response.to_dict()
     
-    async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False):
+    async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
         response = self.client.completions.create(
             model=model,
             prompt=prompt,
             max_tokens=max_tokens,
-            stream=stream
+            stream=stream,
+            frequency_penalty=frequency_penalty,
+            n=n,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p
         )
         
         if stream:

From db5e61a7863d5890fea3ce119dcab10fe2d0c9e3 Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 14:23:42 +0200
Subject: [PATCH 02/19] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index be9f922..c33ac1d 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,8 @@
 ## 📖 | Getting Started
 
 1. Clone this repository.
-2. Build a docker image - ```docker build -t <your_username>:worker-sglang:v1 .```
-3. ```docker push <your_username>:worker-sglang:v1```
+2. Build a docker image - ```docker build -t <your_username>/worker-sglang:v1 .```
+3. ```docker push <your_username>/worker-sglang:v1```
 
 
 ***Once you have built the Docker image and deployed the endpoint, you can use the code below to interact with the endpoint***: 

From 06fd157de40f8c384e562070509cff29f31d91df Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:16:28 +0200
Subject: [PATCH 03/19] Add GitHub Actions workflow to build and push the Docker image

---
 .github/workflows/docker-build-push.yml | 28 +++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 .github/workflows/docker-build-push.yml

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
new file mode 100644
index 0000000..a2d4ace
--- /dev/null
+++ b/.github/workflows/docker-build-push.yml
@@ -0,0 +1,28 @@
+name: Docker Build and Push
+
+on:
+  push:
+    branches: [ main ]  # Adjust this if your main branch has a different name
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v2
+    
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v1
+    
+    - name: Login to DockerHub
+      uses: docker/login-action@v1
+      with:
+        username: ${{ secrets.DOCKERHUB_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_TOKEN }}
+    
+    - name: Build and push
+      uses: docker/build-push-action@v2
+      with:
+        context: .
+        push: true
+        tags: ${{ secrets.DOCKERHUB_USERNAME }}/your-repo-name:latest

From 54185142e0c53f48d0141cdd7103e0959d55bf11 Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:16:59 +0200
Subject: [PATCH 04/19] Update docker-build-push.yml

---
 .github/workflows/docker-build-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
index a2d4ace..3d91639 100644
--- a/.github/workflows/docker-build-push.yml
+++ b/.github/workflows/docker-build-push.yml
@@ -25,4 +25,4 @@ jobs:
       with:
         context: .
         push: true
-        tags: ${{ secrets.DOCKERHUB_USERNAME }}/your-repo-name:latest
+        tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest

From 0c3c59de663d5af38f012fd438a07717861dcdd1 Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:17:22 +0200
Subject: [PATCH 05/19] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c33ac1d..cfac1c0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <div align="center">
 
-<h1> SgLang Worker</h1>
+<h1> SgLang Worker</h1> 
 
 🚀 | SGLang is yet another fast serving framework for large language models and vision language models.
 </div>

From 5223e056ac0a5656261aa749275b0a9a8666b51a Mon Sep 17 00:00:00 2001
From: supa-thibaud <supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 13:17:32 +0000
Subject: [PATCH 06/19] Bump runpod from 1.6.2 to 1.7.0

---
 builder/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builder/requirements.txt b/builder/requirements.txt
index db07106..056b1ea 100644
--- a/builder/requirements.txt
+++ b/builder/requirements.txt
@@ -1,7 +1,7 @@
 ray
 pandas
 pyarrow
-runpod==1.6.2
+runpod==1.7.0
 huggingface-hub
 packaging
 typing-extensions==4.7.1

From e0483fd4c867b0c3bdc5fc71bfe8b491ced55661 Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 16:52:31 +0200
Subject: [PATCH 07/19] Update Dockerfile Cuda / flashinfer new versions

---
 Dockerfile | 59 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2c994ed..d90213a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,52 @@
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 
+ARG CUDA_VERSION=12.1.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ARG BUILD_TYPE=all
+ENV DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt update -y \
+    && apt install software-properties-common -y \
+    && add-apt-repository ppa:deadsnakes/ppa -y && apt update \
+    && apt install python3.10 python3.10-dev -y \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \
+    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
+    && apt install curl git sudo -y \
+    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
+    && python3 --version \
+    && python3 -m pip --version \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt clean
 
-RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /sgl-workspace
 
-# Install Python dependencies
-COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
+    && cd sglang \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+         python3 -m pip --no-cache-dir install -e "python[srt]"; \
+       else \
+         python3 -m pip --no-cache-dir install -e "python[all]"; \
+       fi
 
-# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 
-RUN python3 -m pip install "sglang[all]" && \
-    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
+ARG CUDA_VERSION
+RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \
+        export CUDA_IDENTIFIER=cu121 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
+        export CUDA_IDENTIFIER=cu124 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
+        export CUDA_IDENTIFIER=cu118 && \
+        python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \
+    else \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
+    fi
+
+RUN python3 -m pip cache purge
+
+ENV DEBIAN_FRONTEND=interactive
 
 # Setup for Option 2: Building the Image with the Model included
 ARG MODEL_NAME=""
@@ -47,4 +80,4 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \
     fi
 
 # Start the handler
-CMD ["python3", "/src/handler.py"]
\ No newline at end of file
+CMD ["python3", "/src/handler.py"]

From 8cdebaa2fce8558390f59863b3745db2cff54bef Mon Sep 17 00:00:00 2001
From: Thibaud Zamora <166740648+supa-thibaud@users.noreply.github.com>
Date: Thu, 22 Aug 2024 17:12:46 +0200
Subject: [PATCH 08/19] Use ubuntu-latest-xl runner, enable GHA build cache, add cleanup step

---
 .github/workflows/docker-build-push.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
index 3d91639..e94c3a7 100644
--- a/.github/workflows/docker-build-push.yml
+++ b/.github/workflows/docker-build-push.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build-and-push:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-xl
     steps:
     - name: Checkout code
       uses: actions/checkout@v2
@@ -26,3 +26,11 @@ jobs:
         context: .
         push: true
         tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest
+        cache-from: type=gha
+        cache-to: type=gha,mode=max
+
+    - name: Cleanup
+      if: always()
+      run: |
+        docker system prune -af
+        df -h

From cdfb17682139b696aa9d6d2281b0f96939a80fff Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Thu, 22 Aug 2024 18:14:13 +0200
Subject: [PATCH 09/19] Revert CI runner from ubuntu-latest-xl to ubuntu-latest

---
 .github/workflows/docker-build-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
index e94c3a7..41d30e1 100644
--- a/.github/workflows/docker-build-push.yml
+++ b/.github/workflows/docker-build-push.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   build-and-push:
-    runs-on: ubuntu-latest-xl
+    runs-on: ubuntu-latest
     steps:
     - name: Checkout code
       uses: actions/checkout@v2

From f9866a1f1c3fca364a874f57e1fab7746fc2d147 Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Thu, 22 Aug 2024 18:24:37 +0200
Subject: [PATCH 10/19] Revert Dockerfile to pip-based sglang install; use flashinfer torch2.4 wheels

---
 Dockerfile | 52 ++++++++++++----------------------------------------
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index d90213a..2152fa3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,53 +1,25 @@
 ARG CUDA_VERSION=12.1.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-ARG BUILD_TYPE=all
 ENV DEBIAN_FRONTEND=noninteractive
 
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt update -y \
-    && apt install software-properties-common -y \
-    && add-apt-repository ppa:deadsnakes/ppa -y && apt update \
-    && apt install python3.10 python3.10-dev -y \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \
-    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
-    && apt install curl git sudo -y \
-    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
-    && python3 --version \
-    && python3 -m pip --version \
-    && rm -rf /var/lib/apt/lists/* \
+RUN apt-get update -y \
+    && apt-get install -y python3-pip \
     && apt clean
 
-WORKDIR /sgl-workspace
+RUN ldconfig /usr/local/cuda-12.1/compat/
 
-RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
-    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
-    && cd sglang \
-    && if [ "$BUILD_TYPE" = "srt" ]; then \
-         python3 -m pip --no-cache-dir install -e "python[srt]"; \
-       else \
-         python3 -m pip --no-cache-dir install -e "python[all]"; \
-       fi
+# Install Python dependencies
+COPY builder/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install --upgrade -r /requirements.txt
 
-ARG CUDA_VERSION
-RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \
-        export CUDA_IDENTIFIER=cu121 && \
-        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \
-    elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
-        export CUDA_IDENTIFIER=cu124 && \
-        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \
-    elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
-        export CUDA_IDENTIFIER=cu118 && \
-        python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \
-        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \
-    else \
-        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
-    fi
+# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 
+RUN python3 -m pip install "sglang[all]" && \
+    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4
 
 RUN python3 -m pip cache purge
 
-ENV DEBIAN_FRONTEND=interactive
-
 # Setup for Option 2: Building the Image with the Model included
 ARG MODEL_NAME=""
 ARG TOKENIZER_NAME=""
@@ -80,4 +52,4 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \
     fi
 
 # Start the handler
-CMD ["python3", "/src/handler.py"]
+CMD ["python3", "/src/handler.py"]
\ No newline at end of file

From 2def68446a50771f8220ee5a8a933785543d390b Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Thu, 22 Aug 2024 18:55:46 +0200
Subject: [PATCH 11/19] flashinfer / cuda updated

---
 Dockerfile | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2152fa3..0d53b21 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.1.1
+ARG CUDA_VERSION=12.4.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -15,8 +15,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade -r /requirements.txt
 
 # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 
-RUN python3 -m pip install "sglang[all]" && \
-    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4
+ 
+RUN export CUDA_IDENTIFIER=cu124 && \
+    python3 -m pip install "sglang[all]" && \
+    python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/;
 
 RUN python3 -m pip cache purge
 

From 22523507111558d46e7d3aa5e19b392b6d906483 Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 08:09:49 +0200
Subject: [PATCH 12/19] Install Python 3.10 from the deadsnakes PPA in the Dockerfile

---
 Dockerfile | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0d53b21..b526eee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,19 @@ ARG CUDA_VERSION=12.4.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip \
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt update -y \
+    && apt install software-properties-common -y \
+    && add-apt-repository ppa:deadsnakes/ppa -y && apt update \
+    && apt install python3.10 python3.10-dev -y \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \
+    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
+    && apt install curl git sudo -y \
+    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
+    && python3 --version \
+    && python3 -m pip --version \
+    && rm -rf /var/lib/apt/lists/* \
     && apt clean
 
 RUN ldconfig /usr/local/cuda-12.1/compat/
@@ -17,7 +28,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 
  
 RUN export CUDA_IDENTIFIER=cu124 && \
-    python3 -m pip install "sglang[all]" && \
+    python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    python3 -m pip install --upgrade "sglang[all]" && \
     python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/;
 
 RUN python3 -m pip cache purge

From 721c951bb8c9eb2917bca9be7a36af46c9ef4a8b Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 08:42:00 +0200
Subject: [PATCH 13/19] Build sglang from source and pick flashinfer wheel by CUDA version

---
 Dockerfile | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b526eee..ffecb52 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,18 +19,35 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 
 RUN ldconfig /usr/local/cuda-12.1/compat/
 
+WORKDIR /sgl-workspace
+
+RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
+    && cd sglang \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+         python3 -m pip --no-cache-dir install -e "python[srt]"; \
+       else \
+         python3 -m pip --no-cache-dir install -e "python[all]"; \
+       fi
+
+ARG CUDA_VERSION
+RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \
+        export CUDA_IDENTIFIER=cu121 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
+        export CUDA_IDENTIFIER=cu124 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
+        export CUDA_IDENTIFIER=cu118 && \
+        python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \
+    else \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
+    fi
+
 # Install Python dependencies
 COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
-
-# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer 
- 
-RUN export CUDA_IDENTIFIER=cu124 && \
-    python3 -m pip install --upgrade pip setuptools wheel html5lib six \
-    python3 -m pip install --upgrade "sglang[all]" && \
-    python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/;
+RUN python3 -m pip install --upgrade -r /requirements.txt
 
 RUN python3 -m pip cache purge
 

From 648ae6bd0bacd0faf533f6a5b3e7622a32366087 Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 09:04:07 +0200
Subject: [PATCH 14/19] typing extensions updated

---
 builder/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builder/requirements.txt b/builder/requirements.txt
index 056b1ea..751628d 100644
--- a/builder/requirements.txt
+++ b/builder/requirements.txt
@@ -4,7 +4,7 @@ pyarrow
 runpod==1.7.0
 huggingface-hub
 packaging
-typing-extensions==4.7.1
+typing-extensions==4.11.0
 pydantic
 pydantic-settings
 hf-transfer
\ No newline at end of file

From 4643d01fd51782f49bdb65c59f31b3acb33802bb Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 09:35:17 +0200
Subject: [PATCH 15/19] some trace to debug

---
 src/engine.py  | 2 ++
 src/handler.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/engine.py b/src/engine.py
index 64fd412..63e68a5 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -74,6 +74,8 @@ def start_server(self):
             if os.getenv(flag, '').lower() in ('true', '1', 'yes'):
                 command.append(f"--{flag.lower().replace('_', '-')}")
 
+        print("LAUNCH SERVER COMMAND:")
+        print(command)
         self.process = subprocess.Popen(command, stdout=None, stderr=None)
         print(f"Server started with PID: {self.process.pid}")
     
diff --git a/src/handler.py b/src/handler.py
index ccf925e..4ae6ad2 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -8,6 +8,7 @@
 engine.start_server()
 engine.wait_for_server()
 
+print(f" ==== start_server")
 
 async def async_handler(job):
     """Handle the requests asynchronously."""

From 3e52cbdad6ddfb3f642958009c0b9fb0a255bd38 Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 09:47:04 +0200
Subject: [PATCH 16/19] Read MAX_CONCURRENCY from env and pass it as concurrency_modifier

---
 src/handler.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/handler.py b/src/handler.py
index 4ae6ad2..69a4c06 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -2,6 +2,7 @@
 import requests
 from engine import SGlangEngine, OpenAIRequest
 import runpod
+import os
 
 # Initialize the engine
 engine = SGlangEngine()
@@ -41,7 +42,10 @@ async def async_handler(job):
         else:
             yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text}
 
-runpod.serverless.start({"handler": async_handler, "return_aggregate_stream": True})
+max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100))
+print(f"MAX_CONCURRENCY {max_concurrency}")
+
+runpod.serverless.start({"handler": async_handler, "concurrency_modifier": max_concurrency, "return_aggregate_stream": True})
 
 # # Ensure the server is shut down when the serverless function is terminated
 # import atexit

From 9829d0e6309a189fc2cc94f89868f7dd1565d69d Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 09:52:11 +0200
Subject: [PATCH 17/19] Pass concurrency_modifier as a callable instead of an int

---
 src/handler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/handler.py b/src/handler.py
index 69a4c06..71ffcc0 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -45,7 +45,9 @@ async def async_handler(job):
 max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100))
 print(f"MAX_CONCURRENCY {max_concurrency}")
 
-runpod.serverless.start({"handler": async_handler, "concurrency_modifier": max_concurrency, "return_aggregate_stream": True})
+runpod.serverless.start({"handler": async_handler, 
+                         "concurrency_modifier": lambda x: max_concurrency, 
+                         "return_aggregate_stream": True})
 
 # # Ensure the server is shut down when the serverless function is terminated
 # import atexit

From 605957349b63b9cac1f2972cbb1481e93ebf2469 Mon Sep 17 00:00:00 2001
From: supa-thibaud <thibaud@supafriends.com>
Date: Fri, 23 Aug 2024 13:16:33 +0200
Subject: [PATCH 18/19] Whitespace-only changes in engine.py and handler.py

---
 src/engine.py  | 2 +-
 src/handler.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/engine.py b/src/engine.py
index 63e68a5..28e3861 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -95,7 +95,7 @@ def wait_for_server(self, timeout=300, interval=5):
     def shutdown(self):
         if self.process:
             self.process.terminate()
-            self.process.wait()
+            self.process.wait() 
             print("Server shut down.")
 
 class OpenAIRequest:
diff --git a/src/handler.py b/src/handler.py
index 71ffcc0..3aa0693 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -6,7 +6,7 @@
 
 # Initialize the engine
 engine = SGlangEngine()
-engine.start_server()
+engine.start_server() 
 engine.wait_for_server()
 
 print(f" ==== start_server")

From 856bbd6fb2e7794a0422b4e87f268b6eccbb5ba8 Mon Sep 17 00:00:00 2001
From: Marut Pandya <pandyamarut@gmail.com>
Date: Wed, 28 Aug 2024 13:45:31 -0700
Subject: [PATCH 19/19] Delete src/handler.py

---
 src/handler.py | 54 --------------------------------------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 src/handler.py

diff --git a/src/handler.py b/src/handler.py
deleted file mode 100644
index 3aa0693..0000000
--- a/src/handler.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import asyncio
-import requests
-from engine import SGlangEngine, OpenAIRequest
-import runpod
-import os
-
-# Initialize the engine
-engine = SGlangEngine()
-engine.start_server() 
-engine.wait_for_server()
-
-print(f" ==== start_server")
-
-async def async_handler(job):
-    """Handle the requests asynchronously."""
-    job_input = job["input"]
-    print(f"JOB_INPUT: {job_input}")
-    
-    if job_input.get("openai_route"):
-        openai_route, openai_input = job_input.get("openai_route"), job_input.get("openai_input")
-        openai_request = OpenAIRequest()
-        
-        if openai_route == "/v1/chat/completions":
-            async for chunk in openai_request.request_chat_completions(**openai_input):
-                yield chunk
-        elif openai_route == "/v1/completions":
-            async for chunk in openai_request.request_completions(**openai_input):
-                yield chunk
-        elif openai_route == "/v1/models":
-            models = await openai_request.get_models()
-            yield models
-    else:
-        generate_url = f"{engine.base_url}/generate"
-        headers = {"Content-Type": "application/json"}
-        generate_data = {
-            "text": job_input.get("prompt", ""),
-            "sampling_params": job_input.get("sampling_params", {})
-        }
-        response = requests.post(generate_url, json=generate_data, headers=headers)
-        if response.status_code == 200:
-            yield response.json()
-        else:
-            yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text}
-
-max_concurrency = int(os.getenv("MAX_CONCURRENCY", 100))
-print(f"MAX_CONCURRENCY {max_concurrency}")
-
-runpod.serverless.start({"handler": async_handler, 
-                         "concurrency_modifier": lambda x: max_concurrency, 
-                         "return_aggregate_stream": True})
-
-# # Ensure the server is shut down when the serverless function is terminated
-# import atexit
-# atexit.register(engine.shutdown)
\ No newline at end of file