diff --git a/README.md b/README.md
index 74aba94b08348165086d9fe200ee9a3f3bc90e2b..2acb7c312a0d3f24aa27526cca3ee06721769997 100644
--- a/README.md
+++ b/README.md
@@ -18,4 +18,5 @@ For general help, documentation, and trainings please refer to the following pag
 | | [ollama](machine-and-deep-learning/ollama) | Examples how to run and use LLMs with Ollama. |
 | | [pytorch](machine-and-deep-learning/pytorch) | Example scripts and best practices for running PyTorch workloads on an HPC cluster, including distributed training and GPU utilization. |
 | | [scikit-learn](machine-and-deep-learning/scikit-learn) | HPC-friendly examples of using Scikit-Learn, including job submission scripts for machine learning model training. |
-| | [tensorflow](machine-and-deep-learning/tensorflow) | TensorFlow job scripts and performance optimization techniques for running deep learning models on CPUs and GPUs in an HPC environment. |
\ No newline at end of file
+| | [tensorflow](machine-and-deep-learning/tensorflow) | TensorFlow job scripts and performance optimization techniques for running deep learning models on CPUs and GPUs in an HPC environment. |
+| | [vllm](machine-and-deep-learning/vllm) | Examples of how to run and use LLMs with vLLM. |
\ No newline at end of file
diff --git a/machine-and-deep-learning/vllm/README.md b/machine-and-deep-learning/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fba8867c4c75a7e470dfbb57a6cac9ed15b9ab44
--- /dev/null
+++ b/machine-and-deep-learning/vllm/README.md
@@ -0,0 +1,54 @@
+# Running temporary Large Language Models (LLMs) with vLLM
+
+This directory outlines how to run LLMs via vLLM, either with a predefined Apptainer container image or with a virtual environment in which vLLM is installed. Interaction with the LLMs happens through the `vllm` Python package.
+
+You can find additional information and examples for vLLM at https://docs.vllm.ai/en/latest/
+
+The Python examples in this directory are taken from https://github.com/vllm-project/vllm/tree/main/examples, which is licensed under the Apache License 2.0.
+
+## 1. Running vLLM
+
+### 1.1. Running vLLM using Apptainer (recommended)
+
+A vLLM container will be centrally provided on our HPC system **very soon**. However, for now let's assume we have created one with the following commands:
+```bash
+# Specify the desired vLLM Apptainer image path
+export VLLM_CONTAINER_IMAGE=${HOME}/vllm.sif
+
+# build the Apptainer container based on the official Docker image
+apptainer build ${VLLM_CONTAINER_IMAGE} docker://vllm/vllm-openai
+```
+
+Then you can start using the examples right away, either in your current shell or by submitting a batch job that runs them on a backend node:
+```bash
+# run in the currently active shell
+zsh submit_job_container.sh
+
+# submit batch job
+sbatch submit_job_container.sh
+```
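+
+If you already have an interactive session on a GPU node, you can also run a single example directly inside the container. This is a minimal sketch using the image path defined above:
+```bash
+# run the basic text completion example inside the container (GPUs are passed through via --nv)
+apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} bash -c "python3 -W ignore basic.py"
+```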
+
+### 1.2. Running vLLM in a custom Python environment
+
+This option assumes that you already have a working custom virtual environment, e.g., created with the following instructions:
+```bash
+# Specify the desired vLLM environment directory
+export VLLM_ROOT_DIR=${HOME}/vllm/
+export VLLM_VENV_DIR=${VLLM_ROOT_DIR}/venv_vllm/
+mkdir -p ${VLLM_ROOT_DIR}
+
+# create the venv, activate it, and install vLLM
+module load Python
+python -m venv ${VLLM_VENV_DIR}
+source ${VLLM_VENV_DIR}/bin/activate
+pip install vllm
+```
+
+Now you can execute the examples, either in the current shell or by submitting a batch job that runs them on a backend node:
+```bash
+# run in the currently active shell
+zsh submit_job_venv.sh
+
+# submit batch job
+sbatch submit_job_venv.sh
+```
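+
+Before submitting jobs, you can quickly verify the installation with an import check (a minimal sanity test):
+```bash
+# should print the installed vLLM version without errors
+python3 -c "import vllm; print(vllm.__version__)"
+```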
\ No newline at end of file
diff --git a/machine-and-deep-learning/vllm/basic.py b/machine-and-deep-learning/vllm/basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae5ae7cb483461953959668a7da1d465e130cda6
--- /dev/null
+++ b/machine-and-deep-learning/vllm/basic.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m")
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}")
+        print(f"Output: {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/machine-and-deep-learning/vllm/chat.py b/machine-and-deep-learning/vllm/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e6f78ed7de21fa078981770dde2d14a2197355a
--- /dev/null
+++ b/machine-and-deep-learning/vllm/chat.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def create_parser():
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    # Add example params
+    parser.add_argument("--chat-template-path", type=str)
+
+    return parser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+    chat_template_path = args.pop("chat_template_path")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    def print_outputs(outputs):
+        print("\nGenerated Outputs:\n" + "-" * 80)
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}\n")
+            print(f"Generated text: {generated_text!r}")
+            print("-" * 80)
+
+    print("=" * 80)
+
+    # In this script, we demonstrate how to pass input to the chat method:
+    conversation = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "Hello"
+        },
+        {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?"
+        },
+        {
+            "role": "user",
+            "content":
+            "Write an essay about the importance of higher education.",
+        },
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    print_outputs(outputs)
+
+    # You can run batch inference with the llm.chat API
+    conversations = [conversation for _ in range(10)]
+
+    # We turn on the tqdm progress bar to verify it's indeed running batch inference
+    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
+    print_outputs(outputs)
+
+    # A chat template can optionally be supplied.
+    # If not, the model will use its default chat template.
+    if chat_template_path is not None:
+        with open(chat_template_path) as f:
+            chat_template = f.read()
+
+        outputs = llm.chat(
+            conversations,
+            sampling_params,
+            use_tqdm=False,
+            chat_template=chat_template,
+        )
+        print_outputs(outputs)
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args: dict = vars(parser.parse_args())
+    main(args)
diff --git a/machine-and-deep-learning/vllm/submit_job_container.sh b/machine-and-deep-learning/vllm/submit_job_container.sh
new file mode 100644
index 0000000000000000000000000000000000000000..debfe1962c96532b4a77d02931f9a290fb4c3e13
--- /dev/null
+++ b/machine-and-deep-learning/vllm/submit_job_container.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# Specify the desired vLLM Apptainer image path
+export VLLM_CONTAINER_IMAGE=${HOME}/vllm.sif
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about the current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Inference)
+############################################################
+
+# basic text completion example
+apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+    bash -c "python3 -W ignore basic.py"
+
+# chat example
+# apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+#     bash -c "python3 -W ignore chat.py"
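+
+# chat example with explicit sampling flags (a sketch; see create_parser() in chat.py
+# for the available options; note that the default model is gated on Hugging Face
+# and may require a valid access token)
+# apptainer exec -e --nv ${VLLM_CONTAINER_IMAGE} \
+#     bash -c "python3 -W ignore chat.py --max-tokens 128 --temperature 0.7"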
diff --git a/machine-and-deep-learning/vllm/submit_job_venv.sh b/machine-and-deep-learning/vllm/submit_job_venv.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fe648976d5e9c8493a3a92b16aff119c365de1e5
--- /dev/null
+++ b/machine-and-deep-learning/vllm/submit_job_venv.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: load/activate your desired modules and virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about the current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Inference)
+############################################################
+
+# basic text completion example
+python3 -W ignore basic.py
+
+# chat example
+# python3 -W ignore chat.py
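+
+# NOTE: the TODO above could be filled in, e.g., as follows
+# (a sketch assuming the venv location from the README):
+#   module load Python
+#   source ${HOME}/vllm/venv_vllm/bin/activate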