diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml
new file mode 100644
index 0000000..41d30e1
--- /dev/null
+++ b/.github/workflows/docker-build-push.yml
@@ -0,0 +1,36 @@
+name: Docker Build and Push
+
+on:
+  push:
+    branches: [main]  # Adjust this if your main branch has a different name
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx  # Buildx is required for the gha cache below
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Cleanup
+        if: always()  # run even when an earlier step failed
+        run: |
+          docker system prune -af
+          df -h
diff --git a/Dockerfile b/Dockerfile
index 2c994ed..ffecb52 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,55 @@
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04
+ARG CUDA_VERSION=12.4.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update -y \
- && apt-get install -y python3-pip
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install software-properties-common -y \
+    && add-apt-repository ppa:deadsnakes/ppa -y && apt-get update \
+    && apt-get install python3.10 python3.10-dev -y \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \
+    && update-alternatives --set python3 /usr/bin/python3.10 && apt-get install python3.10-distutils -y \
+    && apt-get install curl git sudo -y \
+    && curl -fsSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && rm get-pip.py \
+    && python3 --version \
+    && python3 -m pip --version \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
RUN ldconfig /usr/local/cuda-12.1/compat/
+WORKDIR /sgl-workspace
+ARG BUILD_TYPE=all
+RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
+    && cd sglang \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+        python3 -m pip --no-cache-dir install -e "python[srt]"; \
+    else \
+        python3 -m pip --no-cache-dir install -e "python[all]"; \
+    fi
+
+ARG CUDA_VERSION
+RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \
+        export CUDA_IDENTIFIER=cu121 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
+        export CUDA_IDENTIFIER=cu124 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
+        export CUDA_IDENTIFIER=cu118 && \
+        python3 -m pip --no-cache-dir install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \
+    else \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
+    fi
+
# Install Python dependencies
COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install --upgrade pip && \
- python3 -m pip install --upgrade -r /requirements.txt
+RUN python3 -m pip install --no-cache-dir --upgrade -r /requirements.txt
-# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install "sglang[all]" && \
- python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
+RUN python3 -m pip cache purge
# Setup for Option 2: Building the Image with the Model included
ARG MODEL_NAME=""
diff --git a/README.md b/README.md
index be9f922..cfac1c0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
SgLang Worker
+ SgLang Worker
🚀 | SGLang is yet another fast serving framework for large language models and vision language models.
@@ -8,8 +8,8 @@
## 📖 | Getting Started
1. Clone this repository.
-2. Build a docker image - ```docker build -t :worker-sglang:v1 .```
-3. ```docker push :worker-sglang:v1```
+2. Build a docker image - ```docker build -t <your-dockerhub-username>/worker-sglang:v1 .```
+3. ```docker push <your-dockerhub-username>/worker-sglang:v1```
***Once you have built the Docker image and deployed the endpoint, you can use the code below to interact with the endpoint***:
diff --git a/builder/requirements.txt b/builder/requirements.txt
index db07106..751628d 100644
--- a/builder/requirements.txt
+++ b/builder/requirements.txt
@@ -1,10 +1,10 @@
ray
pandas
pyarrow
-runpod==1.6.2
+runpod==1.7.0
huggingface-hub
packaging
-typing-extensions==4.7.1
+typing-extensions==4.11.0
pydantic
pydantic-settings
hf-transfer
\ No newline at end of file
diff --git a/src/engine.py b/src/engine.py
index ddf5257..28e3861 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -74,6 +74,8 @@ def start_server(self):
if os.getenv(flag, '').lower() in ('true', '1', 'yes'):
command.append(f"--{flag.lower().replace('_', '-')}")
+ print("LAUNCH SERVER COMMAND:")
+ print(command)
self.process = subprocess.Popen(command, stdout=None, stderr=None)
print(f"Server started with PID: {self.process.pid}")
@@ -93,14 +95,14 @@ def wait_for_server(self, timeout=300, interval=5):
def shutdown(self):
if self.process:
self.process.terminate()
- self.process.wait()
+ self.process.wait()
print("Server shut down.")
class OpenAIRequest:
def __init__(self, base_url="http://0.0.0.0:30000/v1", api_key="EMPTY"):
self.client = openai.Client(base_url=base_url, api_key=api_key)
- async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False):
+ async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
if messages is None:
messages = [
{"role": "system", "content": "You are a helpful AI assistant"},
@@ -111,7 +113,12 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
model=model,
messages=messages,
max_tokens=max_tokens,
- stream=stream
+ stream=stream,
+ frequency_penalty=frequency_penalty,
+ n=n,
+ stop=stop,
+ temperature=temperature,
+ top_p=top_p
)
if stream:
@@ -120,12 +127,17 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
else:
yield response.to_dict()
- async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False):
+ async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
response = self.client.completions.create(
model=model,
prompt=prompt,
max_tokens=max_tokens,
- stream=stream
+ stream=stream,
+ frequency_penalty=frequency_penalty,
+ n=n,
+ stop=stop,
+ temperature=temperature,
+ top_p=top_p
)
if stream:
diff --git a/src/handler.py b/src/handler.py
deleted file mode 100644
index ccf925e..0000000
--- a/src/handler.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import asyncio
-import requests
-from engine import SGlangEngine, OpenAIRequest
-import runpod
-
-# Initialize the engine
-engine = SGlangEngine()
-engine.start_server()
-engine.wait_for_server()
-
-
-async def async_handler(job):
- """Handle the requests asynchronously."""
- job_input = job["input"]
- print(f"JOB_INPUT: {job_input}")
-
- if job_input.get("openai_route"):
- openai_route, openai_input = job_input.get("openai_route"), job_input.get("openai_input")
- openai_request = OpenAIRequest()
-
- if openai_route == "/v1/chat/completions":
- async for chunk in openai_request.request_chat_completions(**openai_input):
- yield chunk
- elif openai_route == "/v1/completions":
- async for chunk in openai_request.request_completions(**openai_input):
- yield chunk
- elif openai_route == "/v1/models":
- models = await openai_request.get_models()
- yield models
- else:
- generate_url = f"{engine.base_url}/generate"
- headers = {"Content-Type": "application/json"}
- generate_data = {
- "text": job_input.get("prompt", ""),
- "sampling_params": job_input.get("sampling_params", {})
- }
- response = requests.post(generate_url, json=generate_data, headers=headers)
- if response.status_code == 200:
- yield response.json()
- else:
- yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text}
-
-runpod.serverless.start({"handler": async_handler, "return_aggregate_stream": True})
-
-# # Ensure the server is shut down when the serverless function is terminated
-# import atexit
-# atexit.register(engine.shutdown)
\ No newline at end of file