IBM · dependabot · Dec 19, 2024 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
@@ -12,7 +12,7 @@ jobs:
           fetch-depth: 0
 
       - name: Set up Helm
-        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+        uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1
         with:
           version: v3.14.4
 

diff --git a/.github/workflows/test-spyre.yml b/.github/workflows/test-spyre.yml
@@ -0,0 +1,31 @@
+# CI workflow: builds the image from Dockerfile.spyre and runs tests/spyre
+# inside the resulting container on every pull request.
+name: test-spyre
+
+on: pull_request
+
+jobs:
+  test-spyre:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Build docker image
+      run: docker build . -t vllm-spyre -f Dockerfile.spyre
+    # Each model is loaded once so that its snapshot exists in the Hugging Face
+    # cache, then the snapshot directory is symlinked under /models for pytest.
+    - name: Run Spyre tests within docker container
+      run: |
+        docker run -i --rm --entrypoint /bin/bash vllm-spyre -c '''
+          pip install pytest sentence-transformers && \
+          python3.12 -c "from transformers import pipeline; pipeline(\"text-generation\", model=\"JackFram/llama-160m\")" && \
+          export VARIANT=$(ls /root/.cache/huggingface/hub/models--JackFram--llama-160m/snapshots/) && \
+          mkdir -p /models && \
+          ln -s /root/.cache/huggingface/hub/models--JackFram--llama-160m/snapshots/${VARIANT} /models/llama-194m && \
+          python3.12 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer(\"sentence-transformers/all-roberta-large-v1\")" && \
+          export VARIANT=$(ls /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/) && \
+          ln -s /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/${VARIANT} /models/all-roberta-large-v1 && \
+          export MASTER_PORT=12355 && \
+          export MASTER_ADDR=localhost && \
+          export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding && \
+          python3.12 -m pytest tests/spyre -v
+        '''
diff --git a/.yapfignore b/.yapfignore
@@ -1 +1,3 @@
 collect_env.py
+
+vllm/model_executor/model_loader/spyre_setup.py
diff --git a/Dockerfile.spyre b/Dockerfile.spyre
@@ -0,0 +1,31 @@
+# Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4
+ARG PYTHON_VERSION=3.12
+
+# Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
+# Re-declare the global ARG so it is visible inside this build stage.
+ARG PYTHON_VERSION
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+WORKDIR /workspace/vllm
+
+# Install some basic utilities ##################################################################
+RUN microdnf update -y && microdnf install -y \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel git vim gcc g++\
+    && microdnf clean all
+
+# Install build dependencies ##################################################################
+# Always invoke pip through the interpreter selected by PYTHON_VERSION so the
+# packages land in the same Python that was installed above.
+RUN --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    python${PYTHON_VERSION} -m pip install --upgrade pip && \
+    python${PYTHON_VERSION} -m pip install -r requirements-build.txt
+
+# Build vLLM ##################################################################
+COPY . .
+
+ENV VLLM_TARGET_DEVICE=spyre
+RUN --mount=type=bind,source=.git,target=.git \
+    python${PYTHON_VERSION} -m pip install --no-build-isolation -v -e .
+
+CMD ["/bin/bash"]
diff --git a/README.md b/README.md
@@ -16,9 +16,13 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 
 The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
+## What is the purpose of this fork?
 
----
+This is a fork of vLLM that we are using to develop support for IBM's AI accelerator (Spyre). 
+The idea is that the main branch of this repo should not diverge significantly from upstream beyond changes required to enable Spyre.
+We will try to rebase against upstream frequently and we plan to contribute these changes to the upstream repository in the future. 
 
+---
 *Latest News* 🔥
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).

diff --git a/examples/offline_inference_multi_spyre.py b/examples/offline_inference_multi_spyre.py
@@ -0,0 +1,68 @@
+"""Offline inference example for vLLM on Spyre with tensor parallelism.
+
+Generates a few tokens for a single prompt with ``tensor_parallel_size=2``
+and prints the elapsed time and the generated text.
+"""
+import gc
+import os
+import time
+
+from vllm import LLM, SamplingParams
+
+max_tokens = 3
+
+# Warmup settings; exported before the LLM is constructed below.
+os.environ["VLLM_SPYRE_WARMUP_PROMPT_LENS"] = '64'
+os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)
+os.environ['VLLM_SPYRE_WARMUP_BATCH_SIZES'] = '1'
+
+# stuff for multi-spyre
+os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+os.environ["DISTRIBUTED_STRATEGY_IGNORE_MODULES"] = "WordEmbedding"
+os.environ["MASTER_ADDR"] = "localhost"
+os.environ["MASTER_PORT"] = "12355"
+
+# Sample prompts.
+template = (
+    "Below is an instruction that describes a task. Write a response that "
+    "appropriately completes the request. Be polite in your response to the "
+    "user.\n\n### Instruction:\n{}\n\n### Response:")
+prompt1 = template.format(
+    "Provide a list of instructions for preparing chicken soup for a family "
+    "of four.")
+prompts = [
+    prompt1,
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(max_tokens=max_tokens,
+                                 temperature=0.0,
+                                 ignore_eos=True)
+# Create an LLM.
+llm = LLM(
+    model="/models/llama-194m",
+    tokenizer="/models/llama-194m",
+    max_model_len=2048,
+    block_size=2048,
+    device="spyre",
+    tensor_parallel_size=2,
+)
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+print("=============== GENERATE")
+t0 = time.time()
+outputs = llm.generate(prompts, sampling_params)
+elapsed = time.time() - t0
+print("Time elapsed for %d tokens is %.2f sec" %
+      (len(outputs[0].outputs[0].token_ids), elapsed))
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+# Dump the full completion object of the last request for inspection.
+print(outputs[-1].outputs[0])
+
+# needed to prevent ugly stackdump caused by sigterm
+del llm
+gc.collect()
Original file line number	Diff line number	Diff line change
		@@ -1 +1,3 @@
		collect_env.py

		vllm/model_executor/model_loader/spyre_setup.py