Skip to content

Commit a9bd804

Browse files
committed
cloud: Make status messages more detailed
1 parent 6cc45c3 commit a9bd804

File tree

9 files changed

+277
-69
lines changed

9 files changed

+277
-69
lines changed

cloud/blob.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,11 @@ func (c *Client) Output(ctx context.Context, job *cloudrpc.JobName) (*cloudrpc.J
6969
o := &cloudrpc.JobOutput{
7070
Files: make(map[string][]byte),
7171
}
72-
//k8sJob, err := c.getk8sJob(ctx, job)
72+
//TODO: k8sJob, err := c.getk8sJob(ctx, job)
7373
if err != nil {
7474
return nil, err
7575
}
76-
//addrs, err := c.jobOutputAddresses(ctx, job.Name, k8sJob.Spec.Template.Spec.Containers[0].Command)
76+
//TODO: addrs, err := c.jobOutputAddresses(ctx, job.Name, k8sJob.Spec.Template.Spec.Containers[0].Command)
7777
addrs, err := c.jobOutputAddresses(ctx, job.Name, []string{"inmap", "run", "steady"})
7878
if err != nil {
7979
return nil, err

cloud/client.go

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,24 @@ func NewClient(k kubernetes.Interface, root *cobra.Command, config *viper.Viper,
8787
return c, nil
8888
}
8989

90-
// Create creates (and queues) a Kubernetes job with the given name that executes
90+
// RunJob creates (and queues) a Kubernetes job with the given name that executes
9191
// the given command with the given command-line arguments on the given container
9292
// image. The job's MemoryGB field specifies the memory (in GiB) passed to the Kubernetes job as its resource requirement.
9393
func (c *Client) RunJob(ctx context.Context, job *cloudrpc.JobSpec) (*cloudrpc.JobStatus, error) {
9494
if job.Version != inmap.Version {
9595
return nil, fmt.Errorf("incorrect InMAP version: %s != %s", job.Version, inmap.Version)
9696
}
97+
98+
status, err := c.Status(ctx, &cloudrpc.JobName{Name: job.Name, Version: job.Version})
99+
if err != nil {
100+
return nil, err
101+
}
102+
if status.Status != cloudrpc.Status_Failed { //TODO: status.Status != cloudrpc.Status_Missing && {
103+
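// Only create the job if it is missing or failed; otherwise return the existing status.
// NOTE(review): the Status_Missing check is currently commented out above, and the job is deleted before returning here — confirm this is intended.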
104+
c.Delete(ctx, &cloudrpc.JobName{Name: job.Name, Version: job.Version})
105+
return status, nil
106+
}
107+
97108
if err := c.stageInputs(ctx, job); err != nil {
98109
return nil, err
99110
}
@@ -107,20 +118,11 @@ func (c *Client) RunJob(ctx context.Context, job *cloudrpc.JobSpec) (*cloudrpc.J
107118
k8sJob := createJob(userJobName(user, job.Name), job.Cmd, job.Args, c.Image, core.ResourceList{
108119
core.ResourceMemory: resource.MustParse(fmt.Sprintf("%dGi", job.MemoryGB)),
109120
})
110-
k8sJobResult, err := c.jobControl.Create(k8sJob)
121+
_, err = c.jobControl.Create(k8sJob)
111122
if err != nil {
112123
return nil, err
113124
}
114-
return c.jobStatus(k8sJobResult)
115-
}
116-
117-
// Status returns the status of the given job.
118-
func (c *Client) Status(ctx context.Context, job *cloudrpc.JobName) (*cloudrpc.JobStatus, error) {
119-
k8sJob, err := c.getk8sJob(ctx, job)
120-
if err != nil {
121-
return nil, err
122-
}
123-
return c.jobStatus(k8sJob)
125+
return c.Status(ctx, &cloudrpc.JobName{Name: job.Name, Version: job.Version})
124126
}
125127

126128
// Delete deletes the given job.
@@ -167,10 +169,36 @@ func userJobName(user, name string) string {
167169
return strings.Replace(user, "_", "-", -1) + "-" + strings.Replace(name, "_", "-", -1)
168170
}
169171

170-
func (c *Client) jobStatus(j *batch.Job) (*cloudrpc.JobStatus, error) {
171-
return &cloudrpc.JobStatus{
172-
Status: j.Status.String(),
173-
}, nil
172+
// Status returns the status of the given job.
173+
func (c *Client) Status(ctx context.Context, job *cloudrpc.JobName) (*cloudrpc.JobStatus, error) {
174+
s := new(cloudrpc.JobStatus)
175+
/*k8sJob, err := c.getk8sJob(ctx, job)
176+
if err != nil {
177+
return &cloudrpc.JobStatus{
178+
Status: cloudrpc.Status_Missing,
179+
Message: err.Error(),
180+
}, nil
181+
}
182+
for _, c := range k8sJob.Status.Conditions {
183+
if c.Type == batch.JobComplete && c.Status == core.ConditionTrue {
184+
s.Status = cloudrpc.Status_Complete
185+
s.StartTime = k8sJob.Status.StartTime.Time.Unix()
186+
s.CompletionTime = k8sJob.Status.CompletionTime.Time.Unix()
187+
} else if c.Type == batch.JobFailed && c.Status == core.ConditionTrue {
188+
s.Status = cloudrpc.Status_Failed
189+
}
190+
}
191+
if k8sJob.Status.Active > 0 {
192+
s.Status = cloudrpc.Status_Running
193+
s.StartTime = k8sJob.Status.StartTime.Time.Unix()
194+
}*/
195+
//TODO: err = c.checkOutputs(ctx, name, k8sJob.Spec.Template.Spec.Containers[0].Command)
196+
err := c.checkOutputs(ctx, job.Name, []string{"inmap", "run", "steady"})
197+
if err != nil {
198+
s.Status = cloudrpc.Status_Failed
199+
s.Message = fmt.Sprintf("job completed but the following error occurred when checking outputs: %s", err)
200+
}
201+
return s, nil
174202
}
175203

176204
// createJob creates a Kubernetes job specification with the given name that executes the

cloud/client_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func TestClient_fake(t *testing.T) {
9696
t.Fatal(err)
9797
}
9898
wantStatus := &cloudrpc.JobStatus{
99-
Status: "&JobStatus{Conditions:[],StartTime:<nil>,CompletionTime:<nil>,Active:0,Succeeded:0,Failed:0,}",
99+
// Status: "&JobStatus{Conditions:[],StartTime:<nil>,CompletionTime:<nil>,Active:0,Succeeded:0,Failed:0,}",
100100
}
101101
if !reflect.DeepEqual(wantStatus, status) {
102102
t.Errorf("status:\n%+v\n!=\n%+v", status, wantStatus)
@@ -112,7 +112,7 @@ func TestClient_fake(t *testing.T) {
112112
t.Fatal(err)
113113
}
114114
wantStatus := &cloudrpc.JobStatus{
115-
Status: "&JobStatus{Conditions:[],StartTime:<nil>,CompletionTime:<nil>,Active:0,Succeeded:0,Failed:0,}",
115+
// Status: "&JobStatus{Conditions:[],StartTime:<nil>,CompletionTime:<nil>,Active:0,Succeeded:0,Failed:0,}",
116116
}
117117
if !reflect.DeepEqual(wantStatus, status) {
118118
t.Errorf("status:\n%+v\n!=\n%+v", status, wantStatus)

cloud/cloud.proto

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ service CloudRPC {
2323
// output file(s).
2424
rpc RunJob(JobSpec) returns (JobStatus) {}
2525

26-
// OutputAddresses returns status and the addresses the output file(s) of the
27-
// requested simulation name.
26+
// Status returns the status of the simulation with the
27+
// requested name.
2828
rpc Status(JobName) returns(JobStatus) {}
2929

3030
// Output returns the output file(s) of the
@@ -61,9 +61,21 @@ message JobSpec {
6161
map<string,bytes> FileData = 7;
6262
}
6363

64+
enum Status {
65+
Complete = 0;
66+
Failed = 1;
67+
Missing = 2;
68+
Running = 3;
69+
}
70+
6471
message JobStatus {
6572
// Status holds the current status of the job.
66-
string Status = 1;
73+
Status Status = 1;
74+
string Message = 2;
75+
76+
// StartTime and CompletionTime are Unix times: the number of seconds elapsed since January 1, 1970 UTC.
77+
int64 StartTime = 3;
78+
int64 CompletionTime = 4;
6779
}
6880

6981
message JobOutput {

cloud/cloudrpc/cloud.pb.go

Lines changed: 97 additions & 36 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)