From a1572ae1936608df27b188713a0e49f7cb3eee4b Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Mon, 26 May 2025 08:54:20 +0200
Subject: [PATCH] added vLLM examples

---
 README.md                                     |   3 +-
 machine-and-deep-learning/vllm/README.md      |  54 +++++++++
 machine-and-deep-learning/vllm/basic.py       |  34 ++++++
 machine-and-deep-learning/vllm/chat.py        | 104 ++++++++++++++++++
 .../vllm/submit_job_container.sh              |  39 +++++++
 .../vllm/submit_job_venv.sh                   |  36 ++++++
 6 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 machine-and-deep-learning/vllm/README.md
 create mode 100644 machine-and-deep-learning/vllm/basic.py
 create mode 100644 machine-and-deep-learning/vllm/chat.py
 create mode 100644 machine-and-deep-learning/vllm/submit_job_container.sh
 create mode 100644 machine-and-deep-learning/vllm/submit_job_venv.sh

diff --git a/README.md b/README.md
index 74aba94..2acb7c3 100644
--- a/README.md
+++ b/README.md
@@ -18,4 +18,5 @@ For general help, documentation, and trainings please refer to the following pag
 | | [ollama](machine-and-deep-learning/ollama) | Examples how to run and use LLMs with Ollama. |
 | | [pytorch](machine-and-deep-learning/pytorch) | Example scripts and best practices for running PyTorch workloads on an HPC cluster, including distributed training and GPU utilization. |
 | | [scikit-learn](machine-and-deep-learning/scikit-learn) | HPC-friendly examples of using Scikit-Learn, including job submission scripts for machine learning model training. |
-| | [tensorflow](machine-and-deep-learning/tensorflow) | TensorFlow job scripts and performance optimization techniques for running deep learning models on CPUs and GPUs in an HPC environment. |
\ No newline at end of file
+| | [tensorflow](machine-and-deep-learning/tensorflow) | TensorFlow job scripts and performance optimization techniques for running deep learning models on CPUs and GPUs in an HPC environment. |
+| | [vllm](machine-and-deep-learning/vllm) | Examples how to run and use LLMs with vLLM. |
\ No newline at end of file
diff --git a/machine-and-deep-learning/vllm/README.md b/machine-and-deep-learning/vllm/README.md
new file mode 100644
index 0000000..fba8867
--- /dev/null
+++ b/machine-and-deep-learning/vllm/README.md
@@ -0,0 +1,54 @@
+# Running temporary Large Language Model (LLM) instances with vLLM
+
+This directory outlines how to run LLMs via vLLM, either with a predefined Apptainer container image or with a virtual environment where vLLM is installed. Interaction with LLMs happens through the `vllm` Python package.
+
+You can find additional information and examples on vLLM at https://docs.vllm.ai/en/latest/
+
+The Python examples contained in this directory have been taken from https://github.com/vllm-project/vllm/tree/main/examples, which is licensed under the Apache License 2.0.
+
+## 1. Running vLLM
+
+### 1.1. Running vLLM using Apptainer (recommended)
+
+A vLLM container will be provided centrally on our HPC system **very soon**. For now, let's assume you have built one yourself with the following commands:
+```bash
+# Specify the desired vLLM Apptainer image path
+export VLLM_CONTAINER_IMAGE=${HOME}/vllm.sif
+
+# Build the Apptainer container from the official vLLM Docker image
+apptainer build ${VLLM_CONTAINER_IMAGE} docker://vllm/vllm-openai
+```
+
+Then you can start using the examples right away, either in your current shell or by submitting a batch job to run them on a backend node:
+```bash
+# run in the currently active shell
+zsh submit_job_container.sh
+
+# submit batch job
+sbatch submit_job_container.sh
+```
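+
+The job script runs the basic text completion example (`basic.py`) by default; the chat example (`chat.py`) is included as a commented-out line. `chat.py` additionally accepts vLLM engine and sampling arguments on the command line (see `create_parser` in `chat.py`). As a sketch with purely illustrative parameter values, it could be launched inside the container like this:
+
+```bash
+apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+    bash -c "python3 -W ignore chat.py --max-tokens 256 --temperature 0.7"
+```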
+
+### 1.2. Running vLLM in a custom Python environment
+
+This option assumes that you already have a working virtual environment with vLLM installed, e.g., created via the following instructions:
+```bash
+# Specify the desired vLLM environment directory
+export VLLM_ROOT_DIR=${HOME}/vllm/
+export VLLM_VENV_DIR=${VLLM_ROOT_DIR}/venv_vllm/
+mkdir -p ${VLLM_ROOT_DIR}
+
+# create the venv
+module load Python
+python -m venv ${VLLM_VENV_DIR}
+source ${VLLM_VENV_DIR}/bin/activate
+pip install vllm
+```
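+
+In later sessions, the environment only needs to be reactivated, for example (using the default paths from above):
+
+```bash
+module load Python
+source ${HOME}/vllm/venv_vllm/bin/activate
+```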
+
+Now you can execute the examples, either in the currently active shell or by submitting a batch job that runs them on a backend node:
+```bash
+# run in the currently active shell
+zsh submit_job_venv.sh
+
+# submit batch job
+sbatch submit_job_venv.sh
+```
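+
+The same applies to `submit_job_venv.sh`: it runs `basic.py` by default and contains a commented-out line for `chat.py`. Note that `chat.py` defaults to `meta-llama/Llama-3.2-1B-Instruct` (a gated model on Hugging Face, so access may have to be requested first); the model can be changed via the usual vLLM engine arguments. A sketch of a direct invocation in the activated environment, with purely illustrative parameter values:
+
+```bash
+python3 chat.py --model meta-llama/Llama-3.2-1B-Instruct --max-tokens 256 --top-p 0.9
+```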
\ No newline at end of file
diff --git a/machine-and-deep-learning/vllm/basic.py b/machine-and-deep-learning/vllm/basic.py
new file mode 100644
index 0000000..ae5ae7c
--- /dev/null
+++ b/machine-and-deep-learning/vllm/basic.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m")
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/machine-and-deep-learning/vllm/chat.py b/machine-and-deep-learning/vllm/chat.py
new file mode 100644
index 0000000..8e6f78e
--- /dev/null
+++ b/machine-and-deep-learning/vllm/chat.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def create_parser():
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    # Add example params
+    parser.add_argument("--chat-template-path", type=str)
+
+    return parser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+    chat_template_path = args.pop("chat_template_path")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    def print_outputs(outputs):
+        print("\nGenerated Outputs:\n" + "-" * 80)
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}\n")
+            print(f"Generated text: {generated_text!r}")
+            print("-" * 80)
+
+    print("=" * 80)
+
+    # In this script, we demonstrate how to pass input to the chat method:
+    conversation = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "Hello"
+        },
+        {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?"
+        },
+        {
+            "role": "user",
+            "content":
+            "Write an essay about the importance of higher education.",
+        },
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    print_outputs(outputs)
+
+    # You can run batch inference with llm.chat API
+    conversations = [conversation for _ in range(10)]
+
+    # We turn on tqdm progress bar to verify it's indeed running batch inference
+    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
+    print_outputs(outputs)
+
+    # A chat template can be optionally supplied.
+    # If not, the model will use its default chat template.
+    if chat_template_path is not None:
+        with open(chat_template_path) as f:
+            chat_template = f.read()
+
+        outputs = llm.chat(
+            conversations,
+            sampling_params,
+            use_tqdm=False,
+            chat_template=chat_template,
+        )
+        print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args: dict = vars(parser.parse_args())
+    main(args)
diff --git a/machine-and-deep-learning/vllm/submit_job_container.sh b/machine-and-deep-learning/vllm/submit_job_container.sh
new file mode 100644
index 0000000..debfe19
--- /dev/null
+++ b/machine-and-deep-learning/vllm/submit_job_container.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# Specify the desired vLLM Apptainer image path
+export VLLM_CONTAINER_IMAGE=${HOME}/vllm.sif
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about the current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (LLM Inference)
+############################################################
+
+# basic text completion example
+apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+    bash -c "python3 -W ignore basic.py"
+
+# chat example
+# apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+#     bash -c "python3 -W ignore chat.py"
diff --git a/machine-and-deep-learning/vllm/submit_job_venv.sh b/machine-and-deep-learning/vllm/submit_job_venv.sh
new file mode 100644
index 0000000..fe64897
--- /dev/null
+++ b/machine-and-deep-learning/vllm/submit_job_venv.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: load/activate your desired modules and virtual environment
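+# For example (module name and paths follow the venv setup described in the README):
+# module load Python
+# source ${HOME}/vllm/venv_vllm/bin/activate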
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about the current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (LLM Inference)
+############################################################
+
+# basic text completion example
+python3 -W ignore basic.py
+
+# chat example
+# python3 -W ignore chat.py
-- 
GitLab