Skip to content

Commit 398af29

Browse files
committed
[Minor] Refactor resuming example
1 parent de57f79 commit 398af29

File tree

3 files changed

+29
-29
lines changed

3 files changed

+29
-29
lines changed

reproducibility-scripts/template-sweep.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ parameters:
1818
value: True
1919
resuming.use_commit:
2020
value: True
21-
some_number:
21+
n:
2222
values: [1, 2, 3]
2323

2424
command:

src/template_package_name/configs/template_experiment.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ defaults:
1313

1414
some_arg: "some_default_value"
1515
some_number: 10
16+
n: 10
1617
is_this_key_overridden: no

src/template_package_name/template_experiment.py

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def main(config: DictConfig) -> None:
6767
# Re-log to capture log with wandb.
6868
logger.info(f"Running command: {subprocess.list2cmdline(sys.argv)}")
6969
logger.info(f"Init directory: {config.run_dir}")
70+
logger.info(f"Run can be resumed from the directory: {resuming_dir}")
7071
logger.info(f"Working directory: {Path.cwd()}")
7172
logger.info(f"Running with config: \n{OmegaConf.to_yaml(config)}")
7273
if config.resuming.resume:
@@ -76,39 +77,37 @@ def main(config: DictConfig) -> None:
7677
utils.seeding.seed_everything(config)
7778

7879
# Example experiment
79-
n = 100
80-
# Loop from 1 to 27 and write 27 files to the disk.
81-
82-
# Attempt to resume
83-
# Find the latest checkpoint of format file_{i}.txt
84-
path = Path.cwd()
85-
files = path.glob("file_*.txt")
86-
files = sorted(files, key=lambda x: int(x.stem.split("_")[1]))
80+
files = sorted(
81+
Path.cwd().glob("file_*.txt"), key=lambda x: int(x.stem.split("_")[1])
82+
)
8783
if files:
8884
last_file = files[-1]
89-
logger.info(f"Resuming from {last_file}")
90-
j = int(last_file.stem.split("_")[1]) % (config.some_number * n)
85+
logger.info(f"Resuming from {last_file.stem}")
86+
i = int(last_file.stem.split("_")[1]) + 1
9187
else:
92-
j = 0
93-
94-
for i in range(j + 1, 28):
95-
wandb.log(
96-
{
97-
"iteration": i,
98-
"file_written": i,
99-
"some_metric": i + config.some_number * n,
100-
}
101-
)
102-
print(i)
103-
if i % 9 == 0:
88+
i = 0
89+
90+
steps = 0
91+
while i < 30:
92+
# Compute and log y = i * n.
93+
y = i * config.n
94+
logs = {"i": i, "y": y}
95+
print(logs)
96+
wandb.log(logs)
97+
98+
# Checkpoint every 5 steps.
99+
if i % 5 == 0:
104100
with open(f"file_{i}.txt", "w") as f:
105-
f.write(f"some_metric={i + config.some_number * n}")
106-
print(f"Checkpointing at {i}")
101+
f.write(f"y={y}")
102+
logger.info(f"Checkpointing at {i}")
103+
104+
i += 1
105+
steps += 1
106+
107+
# Simulate a preemption after 13 steps of the current run.
108+
if steps == 13:
109+
raise InterruptedError("Preempt after 13 steps.")
107110

108-
if j == 0 and i % 15 == 0:
109-
# Crash at first run to test resuming.
110-
raise ValueError("Crashing at i % 15 = 0")
111-
pass
112111
sleep(1)
113112

114113
logger.info("Finished writing files")

0 commit comments

Comments
 (0)