99from typing import Dict
1010
1111from experimaestro .scheduler import experiment
12- from experimaestro .scheduler .jobs import Job , JobError , JobState
12+ from experimaestro .scheduler .jobs import Job , JobState
1313from experimaestro .scheduler .services import Service
1414
1515
1616from experimaestro .utils import logger
17- from experimaestro .locking import Locks , LockError
1817from experimaestro .utils .asyncio import asyncThreadcheck
1918import concurrent .futures
2019
@@ -84,10 +83,6 @@ def start_scheduler(self):
8483 else :
8584 logger .warning ("Scheduler already started" )
8685
87- # @property
88- # def loop(self):
89- # return self.xp.loop
90-
9186 def addlistener (self , listener : Listener ):
9287 self .listeners .add (listener )
9388
@@ -262,9 +257,6 @@ async def aio_submit(self, job: Job) -> JobState: # noqa: C901
262257
263258 # Decrement the number of unfinished jobs and notify
264259 self .xp .unfinishedJobs -= 1
265- # async with self.xp.central.exitCondition:
266- # logging.debug("Updated number of unfinished jobs")
267- # self.xp.central.exitCondition.notify_all()
268260 async with self .exitCondition :
269261 logging .debug ("Updated number of unfinished jobs" )
270262 self .exitCondition .notify_all ()
@@ -282,96 +274,37 @@ async def aio_submit(self, job: Job) -> JobState: # noqa: C901
282274 return job .state
283275
284276 async def aio_start (self , job : Job ) -> Optional [JobState ]:
285- """Start a job
277+ """Start a job (scheduler coordination layer)
278+
279+ This method serves as a coordination layer that delegates the actual
280+ job starting logic to the job itself while handling scheduler-specific
281+ concerns like state notifications and providing coordination context.
286282
287- Returns None if the dependencies could not be locked after all
288- Returns DONE/ERROR depending on the process outcome
283+ :param job: The job to start
284+ :return: JobState.WAITING if dependencies could not be locked, JobState.DONE
285+ if job completed successfully, JobState.ERROR if job failed during execution,
286+ or None (should not occur in normal operation)
287+ :raises Exception: Various exceptions during scheduler coordination
289288 """
290289
291- # We first lock the job before proceeding
290+ # Assert preconditions
292291 assert job .launcher is not None
293- # assert self.xp.central is not None
294-
295- with Locks () as locks :
296- logger .debug ("[starting] Locking job %s" , job )
297- async with job .launcher .connector .lock (job .lockpath ):
298- logger .debug ("[starting] Locked job %s" , job )
299292
300- state = None
301- try :
302- logger .debug (
303- "Starting job %s with %d dependencies" ,
304- job ,
305- len (job .dependencies ),
306- )
307-
308- # async with self.xp.central.dependencyLock:
309- async with self .dependencyLock :
310- for dependency in job .dependencies :
311- try :
312- locks .append (dependency .lock ().acquire ())
313- except LockError :
314- logger .warning (
315- "Could not lock %s, aborting start for job %s" ,
316- dependency ,
317- job ,
318- )
319- dependency .check ()
320- return JobState .WAITING
321-
322- self .notify_job_state (job )
323-
324- job .starttime = time .time ()
325-
326- # Creates the main directory
327- directory = job .path
328- logger .debug ("Making directories job %s..." , directory )
329- if not directory .is_dir ():
330- directory .mkdir (parents = True , exist_ok = True )
331-
332- # Sets up the notification URL
333- if self .xp .server is not None :
334- job .add_notification_server (self .xp .server )
335-
336- except Exception :
337- logger .warning ("Error while locking job" , exc_info = True )
338- return JobState .WAITING
293+ try :
294+ # Call job's start method with scheduler context
295+ state = await job .aio_start (
296+ sched_dependency_lock = self .dependencyLock ,
297+ notification_server = self .xp .server if self .xp else None ,
298+ )
339299
340- try :
341- # Runs the job
342- process = await job .aio_run ()
343- except Exception :
344- logger .warning ("Error while starting job" , exc_info = True )
345- return JobState .ERROR
300+ if state is None :
301+ # Dependencies couldn't be locked, return WAITING state
302+ return JobState .WAITING
346303
347- try :
348- if isinstance (process , JobState ):
349- state = process
350- logger .debug ("Job %s ended (state %s)" , job , state )
351- else :
352- logger .debug ("Waiting for job %s process to end" , job )
353-
354- code = await process .aio_code ()
355- logger .debug ("Got return code %s for %s" , code , job )
356-
357- # Check the file if there is no return code
358- if code is None :
359- # Case where we cannot retrieve the code right away
360- if job .donepath .is_file ():
361- code = 0
362- else :
363- code = int (job .failedpath .read_text ())
364-
365- logger .debug ("Job %s ended with code %s" , job , code )
366- state = JobState .DONE if code == 0 else JobState .ERROR
367-
368- except JobError :
369- logger .warning ("Error while running job" )
370- state = JobState .ERROR
304+ # Notify scheduler listeners of job state after successful start
305+ self .notify_job_state (job )
306+ return state
371307
372- except Exception :
373- logger .warning (
374- "Error while running job (in experimaestro)" , exc_info = True
375- )
376- state = JobState .ERROR
377- return state
308+ except Exception :
309+ logger .warning ("Error in scheduler job coordination" , exc_info = True )
310+ return JobState .ERROR
0 commit comments