Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Git submodules pinned to forks under the EPFLiGHT organisation.
# NOTE(review): SSH URLs (git@github.com:...) require an SSH key with access
# to the org — confirm contributors without one don't need an HTTPS mirror.
[submodule "third-party/verl"]
path = third-party/verl
url = git@github.com:EPFLiGHT/verl.git
[submodule "third-party/sglang"]
path = third-party/sglang
url = git@github.com:EPFLiGHT/sglang.git
30 changes: 0 additions & 30 deletions config/helper/verl_hydra_gen.yaml

This file was deleted.

43 changes: 43 additions & 0 deletions config/rl/ds/config-baai-taco.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Preprocessing config for the BAAI/TACO code-reasoning dataset.
# Notice: TACO makes use of a dataset script that is no longer supported; you
# need to downgrade datasets to datasets<=3.6.0 to load the dataset.
hydra:
  searchpath:
    - pkg://multimeditron.config

defaults:
  - preprocess-ds
  - _self_

# Where the raw dataset is loaded from.
source:
  type: hf # Supported types: 'hf', 'jsonl'
  kwargs:
    path: BAAI/TACO
    split: train

# Tokenisation is disabled: rows are written out as plain text.
tokenizer:
  enable: false
  model: null
  use_fast: true
  attachment_token: <|reserved_special_token_0|>

output: /capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/taco.parquet
num_processes: 128

processes:
  # Keep only prompt/solution/checks; every other raw column is dropped.
  - type: python
    kwargs:
      remove_columns: [
        'question', 'solutions', 'starter_code', 'input_output', 'name', 'url', 'Expected Auxiliary Space', 'Expected Time Complexity',
        'raw_tags', 'skill_types', 'tags', 'source',
        'date', 'picture_num',
      ]
      imports: ['re']
      func: |
        {
          "prompt": data["question"],
          "solution": data["solutions"],
          "checks": data["input_output"],
        }
  # Deterministic shuffle so repeated runs produce identical parquet files.
  - type: shuffle
    kwargs:
      seed: 42
43 changes: 43 additions & 0 deletions config/rl/ds/config-math-shepherd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Preprocessing config for the trl-lib/math_shepherd process-reward dataset.
hydra:
  searchpath:
    - pkg://multimeditron.config

defaults:
  - preprocess-ds
  - _self_

# Where the raw dataset is loaded from.
source:
  type: hf # Supported types: 'hf', 'jsonl'
  kwargs:
    path: trl-lib/math_shepherd
    split: train

# Tokenisation is disabled: rows are written out as plain text.
tokenizer:
  enable: false
  model: null
  use_fast: true
  attachment_token: <|reserved_special_token_0|>

output: /capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/math-shepherd.parquet
num_processes: 128

processes:
  # Keep only rows whose per-step labels are all truthy.
  - type: python-filter
    kwargs:
      func: |
        all(k for k in data["labels"])

  # Reshape into the trainer schema: chat-style prompt plus ground-truth
  # reward info, keyed by data_source for reward-function selection.
  - type: python
    kwargs:
      remove_columns: ['labels', 'completions']
      imports: ['re']
      func: |
        {
          "prompt": [{"content": data["prompt"], "role": "user"}],
          "reward_model": {"ground_truth": data["completions"]},
          "data_source": "math-shepherd",
        }

  # Deterministic shuffle so repeated runs produce identical parquet files.
  - type: shuffle
    kwargs:
      seed: 42
36 changes: 36 additions & 0 deletions config/rl/ds/config-nemotron-post-training.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Preprocessing config for the NVIDIA Llama-Nemotron post-training dataset
# (code split only).
hydra:
  searchpath:
    - pkg://multimeditron.config

defaults:
  - preprocess-ds
  - _self_

# Where the raw dataset is loaded from.
source:
  type: hf # Supported types: 'hf', 'jsonl'
  kwargs:
    path: nvidia/Llama-Nemotron-Post-Training-Dataset
    split: code

# Tokenisation is disabled: rows are written out as plain text.
tokenizer:
  enable: false
  model: null
  use_fast: true
  attachment_token: <|reserved_special_token_0|>

output: /capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/nemotron-post-training-code.parquet
num_processes: 32

processes:
  - type: python
    kwargs:
      remove_columns: ['input', 'output', 'category', 'license', 'reasoning', 'generator', 'used_in_training', 'version', 'system_prompt']
      imports: ['re']
      # func given as a list: assignment statements followed by a final
      # expression producing the new row (presumably executed in order —
      # confirm against the preprocess-ds runner).
      func:
        # Strip the <think>...</think> reasoning span from the model output.
        - output_p = re.sub(r"<think>(([^<]|<(?!\/think>))*)<\/think>", '', data["output"], flags=re.MULTILINE).strip()
        # Collect all ```python fenced code blocks from the cleaned output.
        - code_p = list(re.finditer(r"```python\n(([^`]|`(?!``))*)\n```", output_p))
        # Last code block becomes the response; fall back to the full cleaned text.
        - |
          {
            "prompt": data["input"][0]["content"],
            "response": code_p[-1].group(1).strip() if len(code_p) > 0 else output_p,
          }
18 changes: 18 additions & 0 deletions config/rl/grpo/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# GRPO smoke-test trainer config layered on top of the verl trainer defaults.
hydra:
  searchpath:
    - pkg://multimeditron.config
    - pkg://verl.trainer.config

defaults:
  - verl_trainer
  - _self_

data:
  train_files:
    - /capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/math-shepherd.parquet
    # - ./mock_dataset/mock_dataset.parquet
  val_files:
    - /capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/math-shepherd-val.parquet
    # - ./mock_dataset/mock_dataset.parquet
  prompt_key: prompt          # column holding the chat-style prompt
  reward_fn_key: data_source  # column used to pick the reward function
29 changes: 26 additions & 3 deletions docker/Dockerfile.verl
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ RUN pip3 install --upgrade pip && \
'nvidia-cudnn-frontend>=1.13.0' 'nvidia-cudnn-cu12>=9.11.0.98'
RUN conda install -n py312 -y mpi4py

# Install nsjail *runtime* dependencies in the base image, so the final stage
# only needs the binary copied in from the build stage below.
RUN apt-get update -y && \
    apt-get install -y libc6 libstdc++6 libprotobuf32 libnl-route-3-200 && \
    rm -rf /var/lib/apt/lists/*

########################################################################
# Build nsjail in a separate stage: build tools stay out of the final image.
FROM base AS build-nsjail

RUN apt-get update -y && \
    apt-get install -y \
        autoconf bison flex gcc g++ libprotobuf-dev \
        libnl-route-3-dev libtool make pkg-config protobuf-compiler git
# Pinned to the nsjail 3.4 release tag.
RUN git clone https://github.com/google/nsjail.git --depth 1 -b 3.4 /nsjail && \
    cd /nsjail && make clean && make

########################################################################
# Download Z-Shell enhancements.
FROM docker.io/alpine/git:2.40.1 AS git-pure
Expand All @@ -78,9 +94,10 @@ RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting
# This layer can be distributed so that subsequent users

FROM base AS final

ENV HYDRA_FULL_ERROR=1

COPY --from=build-nsjail /nsjail/nsjail /bin

# A final record of the dependencies from pip freeze.
# RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-final.txt
# RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-final.txt
Expand Down Expand Up @@ -125,7 +142,9 @@ RUN pip3 install --upgrade pip && pip3 install -U \
nvidia-ml-py flashinfer-python
RUN pip3 install --upgrade pip && pip3 install -v -U --no-build-isolation \
"sglang[test]==0.5.2" sgl_kernel
RUN pip3 uninstall -y pynvml
RUN pip3 uninstall -y pynvml datasets && \
pip3 install datasets


# Add code tunnel for remote code development directly on the docker image
RUN mkdir -p /tmp/code
Expand All @@ -136,6 +155,10 @@ RUN curl -Lk 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alp
WORKDIR /
RUN rm -rf /tmp/code

# Cleanup duty to make the docker image as lightweight as possible
RUN rm -rf /var/lib/apt/lists/* && \
apt-get clean && \
pip cache purge

# Entrypoint command (zsh)
CMD ["/bin/zsh"]

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies = [
"click",
"ray",
"hydra-core",
"pydantic",
"rich",
"pydanclick",
"webdataset",
"transformers",
Expand Down
61 changes: 61 additions & 0 deletions scripts/download-datasets.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
# Download/preprocess RL datasets via the multimeditron `mm` CLI.
# Usage: <script> name0 name1 ... (if no names are given, provide a list of all datasets)
set -euo pipefail

# Check that the multimeditron scripts have been installed
if ! command -v mm &> /dev/null
then
    echo "mm command could not be found, please install the multimeditron package"
    exit 1
fi

# Resolve the repo config directory relative to this script's own location.
# Quoted throughout so paths containing spaces do not word-split.
BASE_CONFIG_PATH=$(realpath "$(dirname "$0")/../config")
echo "Base config path: $BASE_CONFIG_PATH"

# List of all of the datasets: each entry is "<name> <command to run>".
DATASETS=(
    "math-shepherd mm preprocess-ds -c $BASE_CONFIG_PATH/rl/ds/config-math-shepherd.yaml"
    "math-shepherd-val mm preprocess-ds -c $BASE_CONFIG_PATH/rl/ds/config-math-shepherd.yaml source.kwargs.split=test output=/capstor/store/cscs/swissai/a127/meditron/multimediset/reasoning/math-shepherd-val.parquet"
    "baai-taco mm preprocess-ds -c $BASE_CONFIG_PATH/rl/ds/config-baai-taco.yaml"
    "nemotron mm preprocess-ds -c $BASE_CONFIG_PATH/rl/ds/config-nemotron-post-training.yaml"
)

# Extract the names of the datasets (first word of each entry).
# Parameter expansion avoids the unquoted `echo | cut` subshell of the
# original, which would word-split entries and spawn two processes per loop.
ALL_DATASET_NAMES=()
for entry in "${DATASETS[@]}"; do
    ALL_DATASET_NAMES+=("${entry%% *}")
done

# If no arguments are given, display the list of all datasets and exit
if [ "$#" -eq 0 ]; then
    echo "No dataset names provided. Available datasets are:"
    for name in "${ALL_DATASET_NAMES[@]}"; do
        echo "  - $name"
    done
    exit 0
fi

# Download the specified datasets
for name in "$@"; do
    found=false
    for entry in "${DATASETS[@]}"; do
        entry_name="${entry%% *}"
        if [ "$name" == "$entry_name" ]; then
            found=true
            echo "Downloading dataset: $name"
            # Everything after the first space is the command to run.
            cmd="${entry#* }"
            echo "Running command: $cmd"
            # Intentionally unquoted: the stored command string must undergo
            # word splitting to become a command plus its arguments.
            $cmd
            echo "Finished downloading dataset: $name"
            break
        fi
    done
    if [ "$found" = false ]; then
        echo "Dataset name '$name' not recognized. Available datasets are:"
        for valid_name in "${ALL_DATASET_NAMES[@]}"; do
            echo "  - $valid_name"
        done
        exit 1
    fi
done
10 changes: 0 additions & 10 deletions scripts/generate_hydra_verl_config.py

This file was deleted.

1 change: 1 addition & 0 deletions src/multimeditron/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ def main_cli():

from .preprocess import *
from .verl import *
from .debug import *
from .train import *
44 changes: 44 additions & 0 deletions src/multimeditron/cli/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from multimeditron.cli import EPILOG, CONFIG_PATH, main_cli
from multimeditron.utils import get_torch_dtype
from datasets import load_dataset
import ray
from ray import serve
from fastapi import Request


@serve.deployment(num_replicas=2)  # scale horizontally if needed
class PyExecService:
    """Ray Serve deployment intended to run untrusted Python code in nsjail.

    Currently a stub: the sandboxed executor is not wired up yet, so every
    request with a non-empty ``code`` field gets ``{"not": "implemented"}``.
    """

    def __init__(self):
        # One NsJailExecutor actor per replica will live here once implemented:
        # self.executor = NsJailExecutor.remote()
        pass

    async def __call__(self, request: Request):
        """
        HTTP handler:
        - expects POST with JSON body {"code": "print('hello')", "timeout": 5}
        - runs code in nsjail
        - returns JSON result
        """
        data = await request.json()
        code = data.get("code", "")
        # timeout = data.get("timeout", 5)

        # Reject empty or whitespace-only payloads early.
        if not code.strip():
            return {"error": "No code provided"}

        # TODO: execute asynchronously via Ray once the executor exists:
        # result = await self.executor.execute.remote(code, wall_timeout=timeout)

        return {"not": "implemented"}

@main_cli.command("serve")
def _serve():
    """Deploy the (stub) PyExecService on the local Ray cluster and block.

    Raises if no Ray cluster is reachable: ``address="auto"`` attaches to an
    already-running cluster rather than starting one.
    """
    # Start ray if not already running
    ray.init(address="auto", namespace="serve")

    # Print the URL *before* deploying: serve.run(..., blocking=True) does not
    # return until the server shuts down, so a print placed after it (as in
    # the original) would never be seen during normal operation.
    print("🚀 Ray Serve running at http://127.0.0.1:8000/PyExecService")

    # Deploy service
    app = PyExecService.bind()
    serve.run(app, blocking=True)
Loading