@@ -22,8 +22,9 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
 // --------------------------------------------
 
 // Function to check if the model is loaded
-void check_model_loaded(llama_server_context &llama, const HttpRequestPtr &req,
-                        std::function<void(const HttpResponsePtr &)> &callback) {
+void check_model_loaded(
+    llama_server_context &llama, const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
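For orientation, here is a minimal sketch of how a request handler might invoke this guard. It assumes the Drogon-style HttpRequestPtr/HttpResponsePtr aliases already used in the signature above; the endpoint name is hypothetical, and this hunk does not show whether the caller actually returns early afterwards.

    // Hypothetical caller of check_model_loaded (sketch, not from this commit).
    void llamaCPP::someEndpoint(
        const HttpRequestPtr &req,
        std::function<void(const HttpResponsePtr &)> &&callback) {
      // Sends an error response through `callback` when no model is loaded.
      check_model_loaded(llama, req, callback);
      if (!llama.model_loaded_external) {
        return;  // assumption: the handler short-circuits after the error reply
      }
      // ... normal request handling continues here ...
    }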
@@ -299,13 +300,10 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  int task_id;
 
-  LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = create_inference_state(this);
-    state->task_id = task_id;
     auto chunked_content_provider =
         [state, data](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!state->is_streaming) {
@@ -386,9 +384,12 @@ void llamaCPP::chatCompletion(
   } else {
     Json::Value respData;
     auto resp = nitro_utils::nitroHttpResponse();
+    int task_id = llama.request_completion(data, false, false, -1);
+    LOG_INFO << "sent the non stream, waiting for respone";
     if (!json_value(data, "stream", false)) {
       std::string completion_text;
       task_result result = llama.next_result(task_id);
+      LOG_INFO << "Here is the result:" << result.error;
       if (!result.error && result.stop) {
         int prompt_tokens = result.result_json["tokens_evaluated"];
         int predicted_tokens = result.result_json["tokens_predicted"];
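Taken together, the added lines make the non-streaming branch submit the completion task itself and then block on its result. A condensed sketch of that flow, using only the calls visible in this diff; the parameter labels in the comments and the "content" field lookup are my assumptions, and the response assembly is elided.

    // Sketch of the non-streaming path after this change (assumptions noted inline).
    int task_id = llama.request_completion(data, /*infill?*/ false,
                                           /*embedding?*/ false,
                                           /*multitask_id?*/ -1);
    task_result result = llama.next_result(task_id);  // waits for the task to finish
    if (!result.error && result.stop) {
      int prompt_tokens = result.result_json["tokens_evaluated"];
      int predicted_tokens = result.result_json["tokens_predicted"];
      // Assumption: the generated text lives under "content" in result_json.
      std::string completion_text = result.result_json.value("content", "");
      // ... build the JSON response from completion_text and the token counts ...
    }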