@@ -67,6 +67,7 @@ def main(config: DictConfig) -> None:
6767 # Re-log to capture log with wandb.
6868 logger .info (f"Running command: { subprocess .list2cmdline (sys .argv )} " )
6969 logger .info (f"Init directory: { config .run_dir } " )
70+ logger .info (f"Run can be resumed from the directory: { resuming_dir } " )
7071 logger .info (f"Working directory: { Path .cwd ()} " )
7172 logger .info (f"Running with config: \n { OmegaConf .to_yaml (config )} " )
7273 if config .resuming .resume :
@@ -76,39 +77,37 @@ def main(config: DictConfig) -> None:
7677 utils .seeding .seed_everything (config )
7778
7879 # Example experiment
79- n = 100
80- # Loop from 1 to 27 and write 27 files to the disk.
81-
82- # Attempt to resume
83- # Find the latest checkpoint of format file_{i}.txt
84- path = Path .cwd ()
85- files = path .glob ("file_*.txt" )
86- files = sorted (files , key = lambda x : int (x .stem .split ("_" )[1 ]))
80+ files = sorted (
81+ Path .cwd ().glob ("file_*.txt" ), key = lambda x : int (x .stem .split ("_" )[1 ])
82+ )
8783 if files :
8884 last_file = files [- 1 ]
89- logger .info (f"Resuming from { last_file } " )
90- j = int (last_file .stem .split ("_" )[1 ]) % ( config . some_number * n )
85+ logger .info (f"Resuming from { last_file . stem } " )
86+ i = int (last_file .stem .split ("_" )[1 ]) + 1
9187 else :
92- j = 0
93-
94- for i in range ( j + 1 , 28 ):
95- wandb . log (
96- {
97- "iteration" : i ,
98- "file_written " : i ,
99- "some_metric" : i + config . some_number * n ,
100- }
101- )
102- print ( i )
103- if i % 9 == 0 :
88+ i = 0
89+
90+ steps = 0
91+ while i < 30 :
92+ # Compute and log y = i * config.n.
93+ y = i * config . n
94+ logs = { "i " : i , "y" : y }
95+ print ( logs )
96+ wandb . log ( logs )
97+
98+ # Checkpoint every 5 steps.
99+ if i % 5 == 0 :
104100 with open (f"file_{ i } .txt" , "w" ) as f :
105- f .write (f"some_metric={ i + config .some_number * n } " )
106- print (f"Checkpointing at { i } " )
101+ f .write (f"y={ y } " )
102+ logger .info (f"Checkpointing at { i } " )
103+
104+ i += 1
105+ steps += 1
106+
107+ # Preempt every 13 steps.
108+ if steps == 13 :
109+ raise InterruptedError ("Preempt after 13 steps." )
107110
108- if j == 0 and i % 15 == 0 :
109- # Crash at first run to test resuming.
110- raise ValueError ("Crashing at i % 15 = 0" )
111- pass
112111 sleep (1 )
113112
114113 logger .info ("Finished writing files" )
0 commit comments