Skip to content

Commit a14c6e5

Browse files
committed
tests: add queue and begin-time check in ephemeral cluster
Fixes: #155
1 parent 0eb9ffc commit a14c6e5

File tree

6 files changed

+210
-0
lines changed

6 files changed

+210
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Workflow to test latest climate-aware-task-scheduler release against ephemeral Slurm cluster
name: cluster-tests

on:
  push:
    branches: [ main ]
    paths:
      - '.github/workflows/cluster-tests.yml'
      - 'cluster/**'
      - '!cluster/README.md'
  pull_request:
    branches: [ main ]
    paths:
      - '.github/workflows/cluster-tests.yml'
      - 'cluster/**'
      - '!cluster/README.md'
  workflow_dispatch:

# The job only checks out the repository, so the token needs read access only.
permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest
    # Fail fast instead of waiting for the 360-minute default if the image
    # build or the cluster start-up hangs.
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
      # clone.sh fetches the pinned slurm-docker-cluster checkout; build.sh
      # patches in cluster/Dockerfile, builds the images and starts the
      # containers in the background.
      - name: Build slurm container
        run: |
          ./cluster/clone.sh
          ./cluster/build.sh
      - name: Run tests
        run: |
          sleep 30 # wait for cluster to come up
          ./cluster/tests.sh

cluster/Dockerfile

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Origin: https://github.com/giovtorres/slurm-docker-cluster/tree/c9aa93c080567121c6b28913152a1cd696465985
# Modified Dockerfile to install uv, cats and jq
FROM rockylinux:8

LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docker-cluster" \
      org.opencontainers.image.title="slurm-docker-cluster" \
      org.opencontainers.image.description="Slurm Docker cluster on Rocky Linux 8" \
      org.label-schema.docker.cmd="docker-compose up -d" \
      maintainer="Giovanni Torres"

# Run all RUN steps under bash with pipefail so that a failing command on the
# left-hand side of a pipe (e.g. the curl download of the uv installer) fails
# the build instead of being masked by the exit status of the last command.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Build toolchain, munge, MariaDB and the libraries needed to compile Slurm,
# plus jq. The yum cache is removed in the same layer to keep the image small.
RUN set -ex \
    && yum makecache \
    && yum -y update \
    && yum -y install dnf-plugins-core \
    && yum config-manager --set-enabled powertools \
    && yum -y install \
       wget \
       bzip2 \
       perl \
       gcc \
       gcc-c++ \
       git \
       gnupg \
       make \
       munge \
       munge-devel \
       python3-devel \
       python3-pip \
       python3 \
       mariadb-server \
       mariadb-devel \
       psmisc \
       bash-completion \
       vim-enhanced \
       http-parser-devel \
       json-c-devel \
       jq \
    && yum clean all \
    && rm -rf /var/cache/yum

# Make `python` resolve to python3.
RUN alternatives --set python /usr/bin/python3

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir Cython pytest

# install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# The following steps call `uv` and copy `cats` from /root/.local/bin, so put
# that directory on PATH.
ENV PATH="/root/.local/bin:${PATH}"

# Install cats as a uv tool and copy its launcher to /usr/local/bin.
RUN uv tool install climate-aware-task-scheduler && cp /root/.local/bin/cats /usr/local/bin/

ARG GOSU_VERSION=1.17

# Download gosu and its detached signature, verify the binary against the key
# fetched from the keyserver, then drop the temporary keyring and signature.
# `gosu nobody true` is a final smoke test that the binary runs.
RUN set -ex \
    && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" \
    && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64.asc" \
    && export GNUPGHOME="$(mktemp -d)" \
    && gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \
    && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \
    && rm -rf "${GNUPGHOME}" /usr/local/bin/gosu.asc \
    && chmod +x /usr/local/bin/gosu \
    && gosu nobody true

# Git tag/branch of Slurm to build. No default is declared here, so it must be
# supplied at build time (presumably by the compose build configuration of
# slurm-docker-cluster -- TODO confirm).
ARG SLURM_TAG

# Build and install Slurm from source, install the example configs and shell
# completion, create the slurm user/group (uid/gid 990) and the state, spool,
# run and log directories, then generate a munge key.
RUN set -x \
    && git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
    && pushd slurm \
    && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
        --with-mysql_config=/usr/bin --libdir=/usr/lib64 \
    && make install \
    && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
    && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
    && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \
    && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \
    && popd \
    && rm -rf slurm \
    && groupadd -r --gid=990 slurm \
    && useradd -r -g slurm --uid=990 slurm \
    && mkdir /etc/sysconfig/slurm \
        /var/spool/slurmd \
        /var/run/slurmd \
        /var/run/slurmdbd \
        /var/lib/slurmd \
        /var/log/slurm \
        /data \
    && touch /var/lib/slurmd/node_state \
        /var/lib/slurmd/front_end_state \
        /var/lib/slurmd/job_state \
        /var/lib/slurmd/resv_state \
        /var/lib/slurmd/trigger_state \
        /var/lib/slurmd/assoc_mgr_state \
        /var/lib/slurmd/assoc_usage \
        /var/lib/slurmd/qos_usage \
        /var/lib/slurmd/fed_mgr_state \
    && chown -R slurm:slurm /var/*/slurm* \
    && /sbin/create-munge-key

COPY slurm.conf /etc/slurm/slurm.conf
COPY slurmdbd.conf /etc/slurm/slurmdbd.conf
# slurmdbd.conf is made owned by slurm and readable only by its owner.
RUN set -x \
    && chown slurm:slurm /etc/slurm/slurmdbd.conf \
    && chmod 600 /etc/slurm/slurmdbd.conf

COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

CMD ["slurmdbd"]

cluster/build.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
# Replace the Dockerfile inside the slurm-docker-cluster checkout with the
# local variant that installs cats, then build the images and start the
# cluster in the background.
# Expects ./slurm-docker-cluster and ./cluster/Dockerfile to exist relative to
# the current working directory.
set -e -u -o pipefail

pushd slurm-docker-cluster
echo ":: Patching Dockerfile with version that installs cats"
cp ../cluster/Dockerfile ./Dockerfile
docker compose build
docker compose up --detach
popd

cluster/cleanup.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
# Cleans up resources and shuts down containers, useful for local development of slurm-docker-cluster
set -euo pipefail

if [ -d slurm-docker-cluster ]; then
  # build.sh runs `docker compose up` from inside slurm-docker-cluster, so the
  # matching `down` has to run there as well; running it from the parent
  # directory would abort the script (set -e) before the checkout is removed.
  pushd slurm-docker-cluster
  docker compose down
  popd
  # -f: never prompt for the write-protected files inside .git.
  rm -rf slurm-docker-cluster
else
  # No checkout below the current directory: keep the original behaviour of
  # shutting down whatever compose project the current directory resolves to.
  docker compose down
fi

cluster/clone.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
# Fetch slurm-docker-cluster into the current directory and pin the working
# tree to a fixed upstream commit.
set -e -u -o pipefail

repo_url=https://github.com/giovtorres/slurm-docker-cluster
pinned_commit=c9aa93c080567121c6b28913152a1cd696465985

git clone "$repo_url"
pushd slurm-docker-cluster
git checkout "$pinned_commit"
popd

cluster/tests.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env bash
# Run tests to check if slurm picks up begin time set by CATS
# This relies on a cluster that is already set up and running; if it is not,
# run ./cluster/build.sh first.
set -euo pipefail

# Step a) Run cats inside the slurmctld container and extract start time.
# The awk filter discards everything printed before the first "{" and keeps
# the rest of the output from that point on.
raw_output=$(docker exec -i slurmctld \
  cats -d 5 --loc RG1 --scheduler=sbatch --command='ls' --format=json | \
  awk 'BEGIN{found=0} {
    if(!found){
      i=index($0,"{");
      if(i){ print substr($0,i); found=1 }
    } else { print }
  }')

# The job id is the 4th field of the line starting with "Submitted".
# `|| true` keeps a non-matching grep from aborting the script under
# `set -e`/pipefail so the explicit check below can report the problem.
job_id=$(grep '^Submitted' <<<"$raw_output" | awk '{print $4}' || true)
if [ -z "$job_id" ]; then
  echo "❌ Could not detect a job submission ID in cats output!"
  echo "$raw_output"
  exit 1
fi
echo "Detected job submission ID: $job_id"

# Everything except the "Submitted" line is treated as the JSON document.
raw_json=$(grep -v '^Submitted' <<<"$raw_output" || true)
raw_start=$(jq -r '.carbonIntensityOptimal.start' <<<"$raw_json")
# jq prints the literal string "null" when the key is missing.
if [ -z "$raw_start" ] || [ "$raw_start" = "null" ]; then
  echo "❌ Could not read .carbonIntensityOptimal.start from cats output!"
  echo "$raw_output"
  exit 1
fi

# Replace the seconds with 00 and drop fractional seconds and UTC offset.
# Example: 2025-08-28T12:43:30.156434+00:00 -> 2025-08-28T12:43:00
# The second expression covers timestamps without fractional seconds,
# e.g. 2025-08-28T12:43:30+00:00, which the first expression leaves untouched.
scheduled_start=$(sed -E \
  -e 's/:[0-9]{2}\..*/:00/' \
  -e 's/(T[0-9]{2}:[0-9]{2}):[0-9]{2}(Z|[+-][0-9]{2}:?[0-9]{2})?$/\1:00/' \
  <<<"$raw_start")

echo "Expected scheduled start time: $scheduled_start"

# Step b) Fetch job details
job_output=$(docker exec -i slurmctld scontrol show job "$job_id")

# Check condition 1: job is pending for BeginTime.
# Here-strings are used instead of `echo | grep -q` so that grep exiting early
# cannot turn into a pipeline failure under pipefail; -F matches the text
# literally.
if ! grep -qF "JobState=PENDING Reason=BeginTime Dependency=(null)" <<<"$job_output"; then
  echo "❌ Job state/Reason is not correct!"
  echo "$job_output"
  exit 1
fi

# Check condition 2: start time matches
if ! grep -qF "StartTime=$scheduled_start" <<<"$job_output"; then
  echo "❌ Start time does not match expected!"
  echo "Expected: StartTime=$scheduled_start"
  echo "Actual output:"
  echo "$job_output"
  exit 1
fi

echo "✅ Job is correctly delayed until $scheduled_start"

0 commit comments

Comments
 (0)