diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 0000000..41d30e1 --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,36 @@ +name: Docker Build and Push + +on: + push: + branches: [ main ] # Adjust this if your main branch has a different name + +jobs: + build-and-push: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Cleanup + if: always() + run: | + docker system prune -af + df -h diff --git a/Dockerfile b/Dockerfile index 2c994ed..ffecb52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,55 @@ -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 +ARG CUDA_VERSION=12.4.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update -y \ - && apt-get install -y python3-pip +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt update -y \ + && apt install software-properties-common -y \ + && add-apt-repository ppa:deadsnakes/ppa -y && apt update \ + && apt install python3.10 python3.10-dev -y \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \ + && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \ + && apt install curl git sudo -y \ + && curl https://bootstrap.pypa.io/get-pip.py -o 
get-pip.py && python3 get-pip.py \ + && python3 --version \ + && python3 -m pip --version \ + && rm -rf /var/lib/apt/lists/* \ + && apt clean RUN ldconfig /usr/local/cuda-12.1/compat/ +WORKDIR /sgl-workspace + +RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \ + && git clone --depth=1 https://github.com/sgl-project/sglang.git \ + && cd sglang \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + python3 -m pip --no-cache-dir install -e "python[srt]"; \ + else \ + python3 -m pip --no-cache-dir install -e "python[all]"; \ + fi + +ARG CUDA_VERSION +RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \ + export CUDA_IDENTIFIER=cu121 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \ + elif [ "$CUDA_VERSION" = "12.4.1" ]; then \ + export CUDA_IDENTIFIER=cu124 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \ + elif [ "$CUDA_VERSION" = "11.8.0" ]; then \ + export CUDA_IDENTIFIER=cu118 && \ + python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \ + python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \ + else \ + echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \ + fi + # Install Python dependencies COPY builder/requirements.txt /requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade pip && \ - python3 -m pip install --upgrade -r /requirements.txt +RUN python3 -m pip install --upgrade -r /requirements.txt -# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer -RUN python3 -m pip install "sglang[all]" && \ - python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 +RUN python3 -m pip cache purge # Setup for Option 2: Building the Image with the Model included ARG MODEL_NAME="" diff --git 
a/README.md b/README.md index be9f922..cfac1c0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
-

SgLang Worker

+

SgLang Worker

🚀 | SGLang is yet another fast serving framework for large language models and vision language models.
@@ -8,8 +8,8 @@ ## 📖 | Getting Started 1. Clone this repository. -2. Build a docker image - ```docker build -t :worker-sglang:v1 .``` -3. ```docker push :worker-sglang:v1``` +2. Build a docker image - ```docker build -t DOCKERHUB_USERNAME/worker-sglang:v1 .``` +3. ```docker push DOCKERHUB_USERNAME/worker-sglang:v1``` ***Once you have built the Docker image and deployed the endpoint, you can use the code below to interact with the endpoint***: diff --git a/builder/requirements.txt b/builder/requirements.txt index db07106..751628d 100644 --- a/builder/requirements.txt +++ b/builder/requirements.txt @@ -1,10 +1,10 @@ ray pandas pyarrow -runpod==1.6.2 +runpod==1.7.0 huggingface-hub packaging -typing-extensions==4.7.1 +typing-extensions==4.11.0 pydantic pydantic-settings hf-transfer \ No newline at end of file diff --git a/src/engine.py b/src/engine.py index ddf5257..28e3861 100644 --- a/src/engine.py +++ b/src/engine.py @@ -74,6 +74,8 @@ def start_server(self): if os.getenv(flag, '').lower() in ('true', '1', 'yes'): command.append(f"--{flag.lower().replace('_', '-')}") + print("LAUNCH SERVER COMMAND:") + print(command) self.process = subprocess.Popen(command, stdout=None, stderr=None) print(f"Server started with PID: {self.process.pid}") @@ -93,14 +95,14 @@ def wait_for_server(self, timeout=300, interval=5): def shutdown(self): if self.process: self.process.terminate() - self.process.wait() + self.process.wait() print("Server shut down.") class OpenAIRequest: def __init__(self, base_url="http://0.0.0.0:30000/v1", api_key="EMPTY"): self.client = openai.Client(base_url=base_url, api_key=api_key) - async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False): + async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0): if messages is None: messages = [ {"role": "system", "content": "You are a helpful AI assistant"}, @@ -111,7 +113,12 @@ async def 
request_chat_completions(self, model="default", messages=None, max_tok model=model, messages=messages, max_tokens=max_tokens, - stream=stream + stream=stream, + frequency_penalty=frequency_penalty, + n=n, + stop=stop, + temperature=temperature, + top_p=top_p ) if stream: @@ -120,12 +127,17 @@ async def request_chat_completions(self, model="default", messages=None, max_tok else: yield response.to_dict() - async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False): + async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0): response = self.client.completions.create( model=model, prompt=prompt, max_tokens=max_tokens, - stream=stream + stream=stream, + frequency_penalty=frequency_penalty, + n=n, + stop=stop, + temperature=temperature, + top_p=top_p ) if stream: diff --git a/src/handler.py b/src/handler.py deleted file mode 100644 index ccf925e..0000000 --- a/src/handler.py +++ /dev/null @@ -1,47 +0,0 @@ -import asyncio -import requests -from engine import SGlangEngine, OpenAIRequest -import runpod - -# Initialize the engine -engine = SGlangEngine() -engine.start_server() -engine.wait_for_server() - - -async def async_handler(job): - """Handle the requests asynchronously.""" - job_input = job["input"] - print(f"JOB_INPUT: {job_input}") - - if job_input.get("openai_route"): - openai_route, openai_input = job_input.get("openai_route"), job_input.get("openai_input") - openai_request = OpenAIRequest() - - if openai_route == "/v1/chat/completions": - async for chunk in openai_request.request_chat_completions(**openai_input): - yield chunk - elif openai_route == "/v1/completions": - async for chunk in openai_request.request_completions(**openai_input): - yield chunk - elif openai_route == "/v1/models": - models = await openai_request.get_models() - yield models - else: - generate_url = 
f"{engine.base_url}/generate" - headers = {"Content-Type": "application/json"} - generate_data = { - "text": job_input.get("prompt", ""), - "sampling_params": job_input.get("sampling_params", {}) - } - response = requests.post(generate_url, json=generate_data, headers=headers) - if response.status_code == 200: - yield response.json() - else: - yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text} - -runpod.serverless.start({"handler": async_handler, "return_aggregate_stream": True}) - -# # Ensure the server is shut down when the serverless function is terminated -# import atexit -# atexit.register(engine.shutdown) \ No newline at end of file