From 92acc5401620b44802535d0e798582af96dc9dd3 Mon Sep 17 00:00:00 2001 From: zgsu Date: Tue, 13 Jan 2026 14:53:36 +0800 Subject: [PATCH 1/4] chore: change to training AI-23523 --- docs/en/installation/tools.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/installation/tools.mdx b/docs/en/installation/tools.mdx index 2bd08dc..06a7a98 100644 --- a/docs/en/installation/tools.mdx +++ b/docs/en/installation/tools.mdx @@ -136,8 +136,8 @@ data: "type": "item", "link": "/model-repo/training", "i18nKey": "nav_pre_train", - "text": "预训练", - "en": "PreTraining", + "text": "训练", + "en": "Training", "icon": "" }, { From a26a8991da93240893b10ae3a5073b4d032c96e5 Mon Sep 17 00:00:00 2001 From: zgsu Date: Wed, 14 Jan 2026 16:02:54 +0800 Subject: [PATCH 2/4] feat: llm compressor AI-23582 --- .../how_to/compressor_by_workbench.mdx | 50 +++++ .../llm-compressor/how_to/evaluate_model.mdx | 79 +++++++ docs/en/llm-compressor/how_to/index.mdx | 7 + docs/en/llm-compressor/index.mdx | 7 + docs/en/llm-compressor/intro.mdx | 15 ++ docs/public/calibration-compressor.ipynb | 203 ++++++++++++++++++ docs/public/data-free-compressor.ipynb | 164 ++++++++++++++ 7 files changed, 525 insertions(+) create mode 100644 docs/en/llm-compressor/how_to/compressor_by_workbench.mdx create mode 100644 docs/en/llm-compressor/how_to/evaluate_model.mdx create mode 100644 docs/en/llm-compressor/how_to/index.mdx create mode 100644 docs/en/llm-compressor/index.mdx create mode 100644 docs/en/llm-compressor/intro.mdx create mode 100644 docs/public/calibration-compressor.ipynb create mode 100644 docs/public/data-free-compressor.ipynb diff --git a/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx new file mode 100644 index 0000000..5ab710c --- /dev/null +++ b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx @@ -0,0 +1,50 @@ +--- +weight: 30 +--- + +# LLM Compressor with Alauda AI + +This document describes how to use the LLM Compressor integration with the Alauda AI platform to perform model compression workflows. The Alauda AI integration of LLM Compressor provides two example workflows: + +- A workbench image and the [data-free compressor notebook](/data-free-compressor.ipynb) that demonstrate how to compress a model, with an optional example for evaluating the compressed model. +- A workbench image and the [calibration compressor notebook](/calibration-compressor.ipynb) that demonstrate how to compress a model using a calibration dataset, with an optional example for evaluating the model after compression. + +## Supported Model Compression Workflows + +On the Alauda AI platform, you can use the Workbench feature to run LLM Compressor on models stored in your model repository. The following workflow outlines the typical steps for compressing a model. + +### Create a Workbench + +Follow the instructions in [Create Workbench](../../workbench/how_to/create_workbench.mdx) to create a new Workbench instance. Note that model compression is currently supported only within **JupyterLab**. + +### Create a Model Repository and Upload Models + +Refer to [Upload Models Using Notebook](../../model_inference/model_management/how_to/upload_models_using_notebook.mdx) for detailed steps on creating a model repository and uploading your model files. The example notebooks in this guide use the [TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model. 
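
For reference, the following is one way to fetch the example model files locally before uploading them to your model repository. This is only a sketch: it assumes `git` and `git-lfs` are available in your environment and that huggingface.co (or an internal mirror of it) is reachable; adjust the URL to match your setup.

```bash
# Illustrative only: download the example model with git and git-lfs.
# Replace the URL with an internal mirror if huggingface.co is not reachable.
git lfs install
git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0
```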
+ +### (Optional) Prepare and Upload a Dataset + +:::note +If you plan to use the **data-free compressor notebook**, you can skip this step. +::: + +To use the **calibration compressor notebook**, you must prepare and upload a calibration dataset. Prepare your dataset using the same process described in *Upload Models Using Notebook*. The example calibration notebook uses the [ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset. + +### Clone Models and Datasets in JupyterLab + +In the JupyterLab terminal, use `git clone` to download the model repository (and dataset, if applicable) to your workspace. The data-free compressor notebook does not require a dataset. + +### Create and Run Compression Notebooks + +Download the appropriate example notebook for your use case: the [calibration compressor notebook](/calibration-compressor.ipynb) if you are using a dataset, or the [data-free compressor notebook](/data-free-compressor.ipynb) otherwise. Create a new notebook (for example, `compressor.ipynb`) in JupyterLab and paste the contents of the example notebook into it. Run the cells to perform model compression. + +### (Optional) Evaluate the Compressed Model + +After compression, you may choose to evaluate the resulting model using standard inference and performance metrics. + +### Upload the Compressed Model to the Repository + +Once compression (and optional evaluation) is complete, upload the compressed model back to the model repository using the steps outlined in *Upload Models Using Notebook*. + +### Deploy and Use the Compressed Model for Inference + +After uploading the compressed model, create a new inference service to deploy and use it. Follow the instructions in [create inference service](../../model_inference/inference_service/functions/inference_service.html#create-inference-service) to complete this step. diff --git a/docs/en/llm-compressor/how_to/evaluate_model.mdx b/docs/en/llm-compressor/how_to/evaluate_model.mdx new file mode 100644 index 0000000..e688697 --- /dev/null +++ b/docs/en/llm-compressor/how_to/evaluate_model.mdx @@ -0,0 +1,79 @@ +--- +weight: 30 +--- + +# Evaluating Models with Alauda AI + +If you followed the example described in the [LLM Compressor with Alauda AI](./compressor_by_workbench.mdx) documentation and want to run the evaluation steps demonstrated in the Notebook, you must perform several additional manual steps. + +At present, Alauda AI does not provide full, built-in support for model evaluation. As a result, these steps must be completed manually within the JupyterLab environment. + +## Installing required dependencies + +In JupyterLab, open the **Launcher** page, select the **Terminal** tile, and run the following commands to install the required dependencies: + +```bash +/.venv/bin/python -m pip install vllm==0.8.5 -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] +/.venv/bin/python -m pip install compressed_tensors==0.10.2 -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] +/.venv/bin/python -m pip install --force-reinstall "numpy<2.0" -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] +/.venv/bin/python -m pip install lm-eval -i https://pypi.tuna.tsinghua.edu.cn/simple #[!code callout] +``` + + + 1. When using GPUs, installing the `vllm` framework is recommended to accelerate evaluation. The preinstalled `torch 2.6.0` in the workbench is compatible with this version of `vllm`. + 2. To avoid incompatibilities, the `compressed_tensors` version is pinned. + 3. 
To prevent dependency conflicts, `numpy` is restricted to versions earlier than 2.0. + 4. **Required**: `lm-eval` is the core dependency used for model evaluation. + + +## Creating a custom evaluation task + +:::note +As of the latest release, the `lm_eval` library does not natively support custom evaluation tasks. To enable this capability, you must manually apply a small patch to the `lm_eval` source code. + +Edit the following file: + +`~/.venv/lib/python3.11/site-packages/lm_eval/tasks/__init__.py` + +Locate approximately line 683 and update the code as shown below. For additional context, see this upstream pull request: [PR #3436](https://github.com/EleutherAI/lm-evaluation-harness/pull/3436/files). + +```python +try: + relative_yaml_path = yaml_path.relative_to(lm_eval_tasks_path) +except ValueError: + relative_yaml_path = yaml_path +``` +::: + +In the Notebook examples, the evaluation task named **my-wikitext** is referenced. This task is not provided by default and must be defined manually by creating a `my-wikitext.yaml` file. + +The built-in evaluation tasks in `lm_eval` use hard-coded dataset definitions with relative paths. This behavior causes the framework to automatically download datasets from Hugging Face. Because Hugging Face is not accessible from mainland China, you must define a custom evaluation task that points to a local dataset. + +The following example shows a sample `my-wikitext.yaml` configuration: + +```yaml +task: my-wikitext +dataset_path: /home/jovyan/wikitext_document_level #[!code callout] +dataset_name: wikitext-2-raw-v1 +output_type: loglikelihood_rolling +training_split: train +validation_split: validation +test_split: test +doc_to_text: '''' +doc_to_target: !function preprocess_wikitext.wikitext_detokenizer +process_results: !function preprocess_wikitext-process_results +should_decontaminate: true +doc_to_decontamination_query: "{{page}}" +metric_list: + - metric: word_perplexity + - metric: byte_perplexity + - metric: bits_per_byte +metadata: + version: 1.0 +``` + + + 1. Prepare the dataset by following the **Prepare and Upload a Dataset** and **Clone Models and Datasets in JupyterLab** sections in the [LLM Compressor with Alauda AI](./compressor_by_workbench.mdx) documentation. + + +After completing these steps, you can proceed with the model evaluation sections in either the [data-free compressor notebook](/data-free-compressor.ipynb) or the [calibration compressor notebook](/calibration-compressor.ipynb). diff --git a/docs/en/llm-compressor/how_to/index.mdx b/docs/en/llm-compressor/how_to/index.mdx new file mode 100644 index 0000000..2c72779 --- /dev/null +++ b/docs/en/llm-compressor/how_to/index.mdx @@ -0,0 +1,7 @@ +--- +weight: 60 +--- + +# How To + + diff --git a/docs/en/llm-compressor/index.mdx b/docs/en/llm-compressor/index.mdx new file mode 100644 index 0000000..850aab7 --- /dev/null +++ b/docs/en/llm-compressor/index.mdx @@ -0,0 +1,7 @@ +--- +weight: 82 +--- + +# LLM Compressor + + diff --git a/docs/en/llm-compressor/intro.mdx b/docs/en/llm-compressor/intro.mdx new file mode 100644 index 0000000..56a3748 --- /dev/null +++ b/docs/en/llm-compressor/intro.mdx @@ -0,0 +1,15 @@ +--- +weight: 10 +--- + +# Introduction + +[LLM Compressor](https://github.com/vllm-project/llm-compressor), part of [the vLLM project](https://docs.vllm.ai/en/latest/) for efficient serving of LLMs, integrates the latest model compression research into a single open-source library enabling the generation of efficient, compressed models with minimal effort. 
+ +The framework allows users to apply some of the most recent research on model compression techniques to improve generative AI (gen AI) models' efficiency, scalability and performance while maintaining accuracy. With native support for Hugging Face and vLLM, the compressed models can be integrated into deployment pipelines, delivering faster and more cost-effective inference at scale. + +LLM Compressor supports a wide variety of compression techniques: + +- Weight-only quantization (W4A16) compresses model weights to 4-bit precision, valuable for AI applications with limited hardware resources or high sensitivity to latency. +- Weight and activation quantization (W8A8) compresses both weights and activations to 8-bit precision, targeting general server scenarios for integer and floating point formats. +- Weight pruning, also known as sparsification, removes certain weights from the model entirely. While this requires fine-tuning, it can be used in conjunction with quantization for further inference acceleration. diff --git a/docs/public/calibration-compressor.ipynb b/docs/public/calibration-compressor.ipynb new file mode 100644 index 0000000..cd4f12e --- /dev/null +++ b/docs/public/calibration-compressor.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LLM Compressor Workbench -- Getting Started\n", + "\n", + "This notebook will demonstrate how common [LLM Compressor](https://github.com/vllm-project/llm-compressor) flows can be run on the Alauda AI.\n", + "\n", + "We will show how a user can compress and evaluate a Large Language Model, with a calibration dataset.\n", + "\n", + "The notebook will detect if a GPU is available. If one is not available, it will demonstrate an abbreviated run, so users without GPU access can still get a feel for `llm-compressor`.\n", + "\n", + "\n", + "
\n",
    "Note: If you want to evaluate the compressed model, be sure to have lm_eval>=0.4.8 installed.\n",
    "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1\\) Calibrated Compression with a Dataset\n", + "\n", + "Some more advanced compression algorithms require a small dataset of calibration samples that are meant to be a representative random subset of the language the model will see at inference.\n", + "\n", + "We will show how the previous section can be augmented with a calibration dataset and GPTQ, one of the first published LLM compression algorithms.\n", + "\n", + "
\n", + "Note: This will take several minutes if no GPU is available\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "use_gpu = torch.cuda.is_available()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We will use a new recipe running GPTQ (https://arxiv.org/abs/2210.17323)\n", + "# to reduce error caused by quantization. GPTQ requires a calibration dataset.\n", + "from llmcompressor.modifiers.quantization import GPTQModifier\n", + "\n", + "# model to compress\n", + "model_id = \"./TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n", + "recipe = GPTQModifier(targets=\"Linear\", scheme=\"W4A16\", ignore=[\"lm_head\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Create the calibration dataset, using Huggingface datasets API\n", + "dataset_id = \"./ultrachat_200k\"\n", + "\n", + "# Select number of samples. 512 samples is a good place to start.\n", + "# Increasing the number of samples can improve accuracy.\n", + "num_calibration_samples = 512 if use_gpu else 4\n", + "max_sequence_length = 2048 if use_gpu else 16\n", + "\n", + "# Load dataset\n", + "ds = load_dataset(dataset_id, split=\"train_sft\")\n", + "# Shuffle and grab only the number of samples we need\n", + "ds = ds.shuffle(seed=42).select(range(num_calibration_samples))\n", + "\n", + "\n", + "# Preprocess and tokenize into format the model uses\n", + "def preprocess(example):\n", + " text = tokenizer.apply_chat_template(\n", + " example[\"messages\"],\n", + " tokenize=False,\n", + " )\n", + " return tokenizer(\n", + " text,\n", + " padding=False,\n", + " max_length=max_sequence_length,\n", + " truncation=True,\n", + " add_special_tokens=False,\n", + " )\n", + "\n", + "\n", + "ds = ds.map(preprocess, remove_columns=ds.column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# oneshot modifies model in-place, so reload\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_id, device_map=\"auto\", torch_dtype=\"auto\"\n", + ")\n", + "# run oneshot again, with dataset\n", + "model = oneshot(\n", + " model=model,\n", + " dataset=ds,\n", + " recipe=recipe,\n", + " max_seq_length=max_sequence_length,\n", + " num_calibration_samples=num_calibration_samples,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save model and tokenizer\n", + "model_dir = \"./\" + model_id.split(\"/\")[-1] + \"-GPTQ-W4A16\"\n", + "model.save_pretrained(model_dir)\n", + "tokenizer.save_pretrained(model_dir);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2\\) Run `lm_eval`\n", + "\n", + "Note that perplexity score has improved (lower is better) for this `TinyLlama` model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os. environ [\"VLLM_USE_V1\"] = \"0\"\n", + "\n", + "import lm_eval\n", + "from lm_eval.utils import make_table\n", + "\n", + "from lm_eval. 
tasks import TaskManager\n", + "task_manager = TaskManager (include_path=\"./my-wikitext.yaml\")\n", + "\n", + "results = lm_eval.simple_evaluate(\n", + " model=\"vllm\" if use_gpu else \"hf\",\n", + " model_args={\n", + " \"pretrained\": model_dir,\n", + " \"add_bos_token\": True,\n", + " \"device\": \"auto\",\n", + " \"gpu_memory_utilization\": 0.8,\n", + " },\n", + " tasks=[\"my-wikitext\"],\n", + " batch_size=\"auto\" if use_gpu else 4,\n", + " limit=None if use_gpu else 4,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(make_table(results))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/public/data-free-compressor.ipynb b/docs/public/data-free-compressor.ipynb new file mode 100644 index 0000000..377c025 --- /dev/null +++ b/docs/public/data-free-compressor.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LLM Compressor Workbench -- Getting Started\n", + "\n", + "This notebook will demonstrate how common [LLM Compressor](https://github.com/vllm-project/llm-compressor) flows can be run on the Alauda AI.\n", + "\n", + "We will show how a user can compress and evaluate a Large Language Model, without data.\n", + "\n", + "The notebook will detect if a GPU is available. If one is not available, it will demonstrate an abbreviated run, so users without GPU access can still get a feel for `llm-compressor`.\n", + "\n", + "\n", + "
\n",
    "Note: If you want to evaluate the compressed model, be sure to have lm_eval>=0.4.8 installed.\n",
    "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1\\) Data-Free Model Compression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "use_gpu = torch.cuda.is_available()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llmcompressor.modifiers.quantization import QuantizationModifier\n", + "\n", + "# model to compress\n", + "model_id = \"./TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n", + "\n", + "# This recipe will quantize all Linear layers except those in the `lm_head`,\n", + "# which is often sensitive to quantization. The W4A16 scheme compresses\n", + "# weights to 4-bit integers while retaining 16-bit activations.\n", + "recipe = QuantizationModifier(targets=\"Linear\", scheme=\"W4A16\", ignore=[\"lm_head\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load up model using huggingface API\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_id, device_map=\"auto\", torch_dtype=\"auto\"\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run compression using `oneshot`\n", + "from llmcompressor import oneshot\n", + "\n", + "model = oneshot(model=model, recipe=recipe, tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save model and tokenizer\n", + "model_dir = \"./\" + model_id.split(\"/\")[-1] + \"-W4A16\"\n", + "model.save_pretrained(model_dir)\n", + "tokenizer.save_pretrained(model_dir);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2\\) Evaluate compressed model using open-source `lm_eval` framework\n", + "\n", + "We will evaluate the performance of the model on the [`wikitext`](https://huggingface.co/datasets/EleutherAI/wikitext_document_level) language modeling dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os. environ [\"VLLM_USE_V1\"] = \"0\"\n", + "\n", + "import lm_eval\n", + "from lm_eval.utils import make_table\n", + "\n", + "from lm_eval. 
tasks import TaskManager\n", + "task_manager = TaskManager (include_path=\"./my-wikitext.yaml\")\n", + "\n", + "results = lm_eval.simple_evaluate(\n", + " model=\"vllm\" if use_gpu else \"hf\",\n", + " model_args={\n", + " \"pretrained\": model_dir,\n", + " \"add_bos_token\": True,\n", + " \"device\": \"auto\",\n", + " \"gpu_memory_utilization\": 0.8,\n", + " },\n", + " tasks=[\"my-wikitext\"],\n", + " batch_size=\"auto\" if use_gpu else 4,\n", + " limit=None if use_gpu else 4,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(make_table(results))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3b72714b5e27b1be35e0862d47ce5d7faa474d7c Mon Sep 17 00:00:00 2001 From: zgsu Date: Thu, 15 Jan 2026 10:51:53 +0800 Subject: [PATCH 3/4] chore: remove eval --- .../how_to/compressor_by_workbench.mdx | 10 +-- .../llm-compressor/how_to/evaluate_model.mdx | 79 ------------------- docs/public/calibration-compressor.ipynb | 58 +------------- docs/public/data-free-compressor.ipynb | 69 +--------------- 4 files changed, 8 insertions(+), 208 deletions(-) delete mode 100644 docs/en/llm-compressor/how_to/evaluate_model.mdx diff --git a/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx index 5ab710c..d51a948 100644 --- a/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx +++ b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx @@ -6,8 +6,8 @@ weight: 30 This document describes how to use the LLM Compressor integration with the Alauda AI platform to perform model compression workflows. The Alauda AI integration of LLM Compressor provides two example workflows: -- A workbench image and the [data-free compressor notebook](/data-free-compressor.ipynb) that demonstrate how to compress a model, with an optional example for evaluating the compressed model. -- A workbench image and the [calibration compressor notebook](/calibration-compressor.ipynb) that demonstrate how to compress a model using a calibration dataset, with an optional example for evaluating the model after compression. +- A workbench image and the [data-free compressor notebook](/data-free-compressor.ipynb) that demonstrate how to compress a model. +- A workbench image and the [calibration compressor notebook](/calibration-compressor.ipynb) that demonstrate how to compress a model using a calibration dataset. ## Supported Model Compression Workflows @@ -37,13 +37,9 @@ In the JupyterLab terminal, use `git clone` to download the model repository (an Download the appropriate example notebook for your use case: the [calibration compressor notebook](/calibration-compressor.ipynb) if you are using a dataset, or the [data-free compressor notebook](/data-free-compressor.ipynb) otherwise. Create a new notebook (for example, `compressor.ipynb`) in JupyterLab and paste the contents of the example notebook into it. Run the cells to perform model compression. -### (Optional) Evaluate the Compressed Model - -After compression, you may choose to evaluate the resulting model using standard inference and performance metrics. 
- ### Upload the Compressed Model to the Repository -Once compression (and optional evaluation) is complete, upload the compressed model back to the model repository using the steps outlined in *Upload Models Using Notebook*. +Once compression is complete, upload the compressed model back to the model repository using the steps outlined in *Upload Models Using Notebook*. ### Deploy and Use the Compressed Model for Inference diff --git a/docs/en/llm-compressor/how_to/evaluate_model.mdx b/docs/en/llm-compressor/how_to/evaluate_model.mdx deleted file mode 100644 index e688697..0000000 --- a/docs/en/llm-compressor/how_to/evaluate_model.mdx +++ /dev/null @@ -1,79 +0,0 @@ ---- -weight: 30 ---- - -# Evaluating Models with Alauda AI - -If you followed the example described in the [LLM Compressor with Alauda AI](./compressor_by_workbench.mdx) documentation and want to run the evaluation steps demonstrated in the Notebook, you must perform several additional manual steps. - -At present, Alauda AI does not provide full, built-in support for model evaluation. As a result, these steps must be completed manually within the JupyterLab environment. - -## Installing required dependencies - -In JupyterLab, open the **Launcher** page, select the **Terminal** tile, and run the following commands to install the required dependencies: - -```bash -/.venv/bin/python -m pip install vllm==0.8.5 -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] -/.venv/bin/python -m pip install compressed_tensors==0.10.2 -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] -/.venv/bin/python -m pip install --force-reinstall "numpy<2.0" -i https://pypi.tuna.tsinghua.edu.cn/simple && #[!code callout] -/.venv/bin/python -m pip install lm-eval -i https://pypi.tuna.tsinghua.edu.cn/simple #[!code callout] -``` - - - 1. When using GPUs, installing the `vllm` framework is recommended to accelerate evaluation. The preinstalled `torch 2.6.0` in the workbench is compatible with this version of `vllm`. - 2. To avoid incompatibilities, the `compressed_tensors` version is pinned. - 3. To prevent dependency conflicts, `numpy` is restricted to versions earlier than 2.0. - 4. **Required**: `lm-eval` is the core dependency used for model evaluation. - - -## Creating a custom evaluation task - -:::note -As of the latest release, the `lm_eval` library does not natively support custom evaluation tasks. To enable this capability, you must manually apply a small patch to the `lm_eval` source code. - -Edit the following file: - -`~/.venv/lib/python3.11/site-packages/lm_eval/tasks/__init__.py` - -Locate approximately line 683 and update the code as shown below. For additional context, see this upstream pull request: [PR #3436](https://github.com/EleutherAI/lm-evaluation-harness/pull/3436/files). - -```python -try: - relative_yaml_path = yaml_path.relative_to(lm_eval_tasks_path) -except ValueError: - relative_yaml_path = yaml_path -``` -::: - -In the Notebook examples, the evaluation task named **my-wikitext** is referenced. This task is not provided by default and must be defined manually by creating a `my-wikitext.yaml` file. - -The built-in evaluation tasks in `lm_eval` use hard-coded dataset definitions with relative paths. This behavior causes the framework to automatically download datasets from Hugging Face. Because Hugging Face is not accessible from mainland China, you must define a custom evaluation task that points to a local dataset. 
- -The following example shows a sample `my-wikitext.yaml` configuration: - -```yaml -task: my-wikitext -dataset_path: /home/jovyan/wikitext_document_level #[!code callout] -dataset_name: wikitext-2-raw-v1 -output_type: loglikelihood_rolling -training_split: train -validation_split: validation -test_split: test -doc_to_text: '''' -doc_to_target: !function preprocess_wikitext.wikitext_detokenizer -process_results: !function preprocess_wikitext-process_results -should_decontaminate: true -doc_to_decontamination_query: "{{page}}" -metric_list: - - metric: word_perplexity - - metric: byte_perplexity - - metric: bits_per_byte -metadata: - version: 1.0 -``` - - - 1. Prepare the dataset by following the **Prepare and Upload a Dataset** and **Clone Models and Datasets in JupyterLab** sections in the [LLM Compressor with Alauda AI](./compressor_by_workbench.mdx) documentation. - - -After completing these steps, you can proceed with the model evaluation sections in either the [data-free compressor notebook](/data-free-compressor.ipynb) or the [calibration compressor notebook](/calibration-compressor.ipynb). diff --git a/docs/public/calibration-compressor.ipynb b/docs/public/calibration-compressor.ipynb index cd4f12e..09eeef8 100644 --- a/docs/public/calibration-compressor.ipynb +++ b/docs/public/calibration-compressor.ipynb @@ -8,21 +8,16 @@ "\n", "This notebook will demonstrate how common [LLM Compressor](https://github.com/vllm-project/llm-compressor) flows can be run on the Alauda AI.\n", "\n", - "We will show how a user can compress and evaluate a Large Language Model, with a calibration dataset.\n", + "We will show how a user can compress a Large Language Model, with a calibration dataset.\n", "\n", - "The notebook will detect if a GPU is available. If one is not available, it will demonstrate an abbreviated run, so users without GPU access can still get a feel for `llm-compressor`.\n", - "\n", - "\n", - "
\n",
- "Note: If you want to evaluate the compressed model, be sure to have lm_eval>=0.4.8 installed.\n",
- "
" + "The notebook will detect if a GPU is available. If one is not available, it will demonstrate an abbreviated run, so users without GPU access can still get a feel for `llm-compressor`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1\\) Calibrated Compression with a Dataset\n", + "### Calibrated Compression with a Dataset\n", "\n", "Some more advanced compression algorithms require a small dataset of calibration samples that are meant to be a representative random subset of the language the model will see at inference.\n", "\n", @@ -130,53 +125,6 @@ "model.save_pretrained(model_dir)\n", "tokenizer.save_pretrained(model_dir);" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2\\) Run `lm_eval`\n", - "\n", - "Note that perplexity score has improved (lower is better) for this `TinyLlama` model. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os. environ [\"VLLM_USE_V1\"] = \"0\"\n", - "\n", - "import lm_eval\n", - "from lm_eval.utils import make_table\n", - "\n", - "from lm_eval. tasks import TaskManager\n", - "task_manager = TaskManager (include_path=\"./my-wikitext.yaml\")\n", - "\n", - "results = lm_eval.simple_evaluate(\n", - " model=\"vllm\" if use_gpu else \"hf\",\n", - " model_args={\n", - " \"pretrained\": model_dir,\n", - " \"add_bos_token\": True,\n", - " \"device\": \"auto\",\n", - " \"gpu_memory_utilization\": 0.8,\n", - " },\n", - " tasks=[\"my-wikitext\"],\n", - " batch_size=\"auto\" if use_gpu else 4,\n", - " limit=None if use_gpu else 4,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(make_table(results))" - ] } ], "metadata": { diff --git a/docs/public/data-free-compressor.ipynb b/docs/public/data-free-compressor.ipynb index 377c025..b35448c 100644 --- a/docs/public/data-free-compressor.ipynb +++ b/docs/public/data-free-compressor.ipynb @@ -8,32 +8,14 @@ "\n", "This notebook will demonstrate how common [LLM Compressor](https://github.com/vllm-project/llm-compressor) flows can be run on the Alauda AI.\n", "\n", - "We will show how a user can compress and evaluate a Large Language Model, without data.\n", - "\n", - "The notebook will detect if a GPU is available. If one is not available, it will demonstrate an abbreviated run, so users without GPU access can still get a feel for `llm-compressor`.\n", - "\n", - "\n", - "
\n",
- "Note: If you want to evaluate the compressed model, be sure to have lm_eval>=0.4.8 installed.\n",
- "
" + "We will show how a user can compress a Large Language Model, without data." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1\\) Data-Free Model Compression" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "use_gpu = torch.cuda.is_available()" + "### Data-Free Model Compression" ] }, { @@ -91,53 +73,6 @@ "model.save_pretrained(model_dir)\n", "tokenizer.save_pretrained(model_dir);" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2\\) Evaluate compressed model using open-source `lm_eval` framework\n", - "\n", - "We will evaluate the performance of the model on the [`wikitext`](https://huggingface.co/datasets/EleutherAI/wikitext_document_level) language modeling dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os. environ [\"VLLM_USE_V1\"] = \"0\"\n", - "\n", - "import lm_eval\n", - "from lm_eval.utils import make_table\n", - "\n", - "from lm_eval. tasks import TaskManager\n", - "task_manager = TaskManager (include_path=\"./my-wikitext.yaml\")\n", - "\n", - "results = lm_eval.simple_evaluate(\n", - " model=\"vllm\" if use_gpu else \"hf\",\n", - " model_args={\n", - " \"pretrained\": model_dir,\n", - " \"add_bos_token\": True,\n", - " \"device\": \"auto\",\n", - " \"gpu_memory_utilization\": 0.8,\n", - " },\n", - " tasks=[\"my-wikitext\"],\n", - " batch_size=\"auto\" if use_gpu else 4,\n", - " limit=None if use_gpu else 4,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(make_table(results))" - ] } ], "metadata": { From e602da34b195dd890a19bd806b2386abbf7ff3aa Mon Sep 17 00:00:00 2001 From: zgsu Date: Thu, 15 Jan 2026 11:18:24 +0800 Subject: [PATCH 4/4] chore: add introduction --- .cspell/compound.txt | 1 + .../how_to/compressor_by_workbench.mdx | 10 ++++---- docs/en/llm-compressor/intro.mdx | 24 +++++++++++++++++-- docs/public/calibration-compressor.ipynb | 23 ++++++++++++++---- 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/.cspell/compound.txt b/.cspell/compound.txt index efddde2..599fc9f 100644 --- a/.cspell/compound.txt +++ b/.cspell/compound.txt @@ -4,3 +4,4 @@ knative kserve xinference servicemeshv1 +ipynb diff --git a/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx index d51a948..76f1e39 100644 --- a/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx +++ b/docs/en/llm-compressor/how_to/compressor_by_workbench.mdx @@ -6,8 +6,8 @@ weight: 30 This document describes how to use the LLM Compressor integration with the Alauda AI platform to perform model compression workflows. The Alauda AI integration of LLM Compressor provides two example workflows: -- A workbench image and the [data-free compressor notebook](/data-free-compressor.ipynb) that demonstrate how to compress a model. -- A workbench image and the [calibration compressor notebook](/calibration-compressor.ipynb) that demonstrate how to compress a model using a calibration dataset. +- A workbench image and the data-free compressor notebook that demonstrate how to compress a model. +- A workbench image and the calibration compressor notebook that demonstrate how to compress a model using a calibration dataset. 
 
 ## Supported Model Compression Workflows
 
@@ -35,7 +35,7 @@ In the JupyterLab terminal, use `git clone` to download the model repository (an
 
 ### Create and Run Compression Notebooks
 
-Download the appropriate example notebook for your use case: the [calibration compressor notebook](/calibration-compressor.ipynb) if you are using a dataset, or the [data-free compressor notebook](/data-free-compressor.ipynb) otherwise. Create a new notebook (for example, `compressor.ipynb`) in JupyterLab and paste the contents of the example notebook into it. Run the cells to perform model compression.
+Download the appropriate example notebook for your use case: the **calibration compressor notebook** if you are using a dataset, or the **data-free compressor notebook** otherwise. Create a new notebook (for example, `compressor.ipynb`) in JupyterLab and paste the contents of the example notebook into it. Run the cells to perform model compression.
 
 ### Upload the Compressed Model to the Repository
 
@@ -43,4 +43,6 @@ Once compression is complete, upload the compressed model back to the model repo
 
 ### Deploy and Use the Compressed Model for Inference
 
-After uploading the compressed model, create a new inference service to deploy and use it. Follow the instructions in [create inference service](../../model_inference/inference_service/functions/inference_service.html#create-inference-service) to complete this step.
+Quantized and sparse models that you create with LLM Compressor are saved using the `compressed-tensors` library (an extension of [Safetensors](https://huggingface.co/docs/safetensors/en/index)).
+The compression format matches the model's quantization or sparsity type. These formats are natively supported in vLLM, enabling fast inference through optimized deployment kernels when the model is served with Alauda AI Inference Server.
+Follow the instructions in [create inference service](../../model_inference/inference_service/functions/inference_service.mdx#create-inference-service) to complete this step.
diff --git a/docs/en/llm-compressor/intro.mdx b/docs/en/llm-compressor/intro.mdx
index 56a3748..c8df8ef 100644
--- a/docs/en/llm-compressor/intro.mdx
+++ b/docs/en/llm-compressor/intro.mdx
@@ -4,12 +4,32 @@ weight: 10
 
 # Introduction
 
+## Preface
+
 [LLM Compressor](https://github.com/vllm-project/llm-compressor), part of [the vLLM project](https://docs.vllm.ai/en/latest/) for efficient serving of LLMs, integrates the latest model compression research into a single open-source library enabling the generation of efficient, compressed models with minimal effort.
 
 The framework allows users to apply some of the most recent research on model compression techniques to improve generative AI (gen AI) models' efficiency, scalability and performance while maintaining accuracy. With native support for Hugging Face and vLLM, the compressed models can be integrated into deployment pipelines, delivering faster and more cost-effective inference at scale.
 
-LLM Compressor supports a wide variety of compression techniques:
+LLM Compressor allows you to apply model optimization techniques such as quantization, sparsity, and compression to reduce memory use and model size and to improve inference speed without affecting the accuracy of model responses. The following compression methodologies are supported by LLM Compressor:
+
+- **Quantization**: Converts model weights and activations to lower-bit formats such as int8, reducing memory usage.
+- **Sparsity**: Sets a portion of model weights to zero, often in fixed patterns, allowing for more efficient computation. +- **Compression**: Shrinks the saved model file size, ideally with minimal impact on performance. + +Use these methods together to deploy models more efficiently on resource-limited hardware. + +## LLM Compressor supports a wide variety of compression techniques: - Weight-only quantization (W4A16) compresses model weights to 4-bit precision, valuable for AI applications with limited hardware resources or high sensitivity to latency. -- Weight and activation quantization (W8A8) compresses both weights and activations to 8-bit precision, targeting general server scenarios for integer and floating point formats. +- Weight and activation quantization (W8A8) compresses both weights and activations to 8-bit precision, targeting general server scenarios for integer and floating-point formats. - Weight pruning, also known as sparsification, removes certain weights from the model entirely. While this requires fine-tuning, it can be used in conjunction with quantization for further inference acceleration. + +## LLM Compressor supports several compression algorithms: + +- AWQ: Weight-only `INT4` quantization +- GPTQ: Weight-only `INT4` quantization +- FP8: Dynamic per-token quantization +- SparseGPT: Post-training sparsity +- SmoothQuant: Activation quantization + +Each of these compression methods computes optimal scales and zero-points for weights and activations. Optimized scales can be per tensor, channel, group, or token. The final result is a compressed model saved with all its applied quantization parameters. diff --git a/docs/public/calibration-compressor.ipynb b/docs/public/calibration-compressor.ipynb index 09eeef8..a863086 100644 --- a/docs/public/calibration-compressor.ipynb +++ b/docs/public/calibration-compressor.ipynb @@ -54,6 +54,21 @@ "recipe = GPTQModifier(targets=\"Linear\", scheme=\"W4A16\", ignore=[\"lm_head\"])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load up model using huggingface API\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_id, device_map=\"auto\", torch_dtype=\"auto\"\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -100,11 +115,9 @@ "metadata": {}, "outputs": [], "source": [ - "# oneshot modifies model in-place, so reload\n", - "model = AutoModelForCausalLM.from_pretrained(\n", - " model_id, device_map=\"auto\", torch_dtype=\"auto\"\n", - ")\n", - "# run oneshot again, with dataset\n", + "# run oneshot, with dataset\n", + "from llmcompressor import oneshot\n", + "\n", "model = oneshot(\n", " model=model,\n", " dataset=ds,\n",