@@ -338,12 +338,10 @@ void llamaCPP::inferenceImpl(
 
         if (!pBuffer) {
           LOG_INFO << "Connection closed or buffer is null. Reset context";
-          state->instance->llama.request_cancel(state->task_id);
           state->inferenceStatus = FINISHED;
           return 0;
         }
 
-
         task_result result = state->instance->llama.next_result(state->task_id);
         if (!result.error) {
           const std::string to_send = result.result_json["content"];
@@ -367,7 +365,6 @@ void llamaCPP::inferenceImpl(
           std::size_t nRead = std::min(str.size(), nBuffSize);
           memcpy(pBuffer, str.data(), nRead);
           LOG_INFO << "reached result stop";
-          state->instance->llama.request_cancel(state->task_id);
           state->inferenceStatus = FINISHED;
         }
 
@@ -401,11 +398,13 @@ void llamaCPP::inferenceImpl(
           if (state->inferenceStatus == PENDING) {
             retries += 1;
           }
-          LOG_INFO << "Wait for task to be released:" << state->task_id;
-          std::this_thread::sleep_for(std::chrono::milliseconds(300));
+          if (state->inferenceStatus != RUNNING)
+            LOG_INFO << "Wait for task to be released:" << state->task_id;
+          std::this_thread::sleep_for(std::chrono::milliseconds(100));
         }
+        // Request completed, release it
+        state->instance->llama.request_cancel(state->task_id);
       });
-      return;
     } else {
       Json::Value respData;
       auto resp = nitro_utils::nitroHttpResponse();
@@ -424,11 +423,9 @@ void llamaCPP::inferenceImpl(
             prompt_tokens, predicted_tokens);
         resp->setBody(full_return);
       } else {
-        resp->setBody("internal error during inference");
-        return;
+        resp->setBody("Internal error during inference");
       }
       callback(resp);
-      return;
     }
   }
 }
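
Taken together, these hunks drop the `request_cancel` calls from the streaming callback and instead release the llama.cpp task once, from the worker that polls `inferenceStatus` until the stream finishes. A minimal, self-contained sketch of that pattern follows; `Engine`, `SharedState`, and the thread wiring in `main` are hypothetical stand-ins, not the project's actual API:

```cpp
// Minimal sketch of the pattern above: the stream side only flips a status
// flag, and a single waiter releases the task after its wait loop completes.
// Engine, SharedState, and the thread setup are hypothetical stand-ins.
#include <atomic>
#include <chrono>
#include <iostream>
#include <memory>
#include <thread>

enum InferenceStatus { PENDING, RUNNING, FINISHED };

struct Engine {
  // Stand-in for llama.request_cancel(task_id).
  void request_cancel(int task_id) {
    std::cout << "released task " << task_id << "\n";
  }
};

struct SharedState {
  std::atomic<InferenceStatus> inferenceStatus{PENDING};
  int task_id{0};
  Engine* engine{nullptr};
};

int main() {
  Engine engine;
  auto state = std::make_shared<SharedState>();
  state->engine = &engine;
  state->task_id = 42;

  // Simulated stream callback: it marks the stream as finished but no longer
  // cancels the task itself (the calls the diff removes).
  std::thread producer([state] {
    state->inferenceStatus = RUNNING;
    std::this_thread::sleep_for(std::chrono::milliseconds(250));
    state->inferenceStatus = FINISHED;
  });

  // Waiter that owns the task lifetime: poll until the stream is done,
  // then release the task exactly once.
  std::thread waiter([state] {
    while (state->inferenceStatus != FINISHED) {
      if (state->inferenceStatus != RUNNING)
        std::cout << "Wait for task to be released: " << state->task_id << "\n";
      std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    // Request completed, release it
    state->engine->request_cancel(state->task_id);
  });

  producer.join();
  waiter.join();
  return 0;
}
```

The point the sketch illustrates is that only the waiter owns the task's lifetime, so the release runs exactly once regardless of which stream path sets `FINISHED`.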