@@ -371,7 +371,7 @@ void llama_kv_cache_unified::commit() {
 bool llama_kv_cache_unified::update(llama_context & lctx) {
     bool need_reserve = false;
 
-    const auto & sched = lctx.get_sched();
+    auto * sched = lctx.get_sched();
 
     if (has_shift) {
         if (!get_can_shift()) {
@@ -382,13 +382,13 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
 
         // apply K-shift if needed
         if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
-            ggml_backend_sched_reset(sched.get());
+            ggml_backend_sched_reset(sched);
 
             auto * gf = lctx.graph_init();
 
             auto res = build_graph_shift(lctx, gf);
 
-            ggml_backend_sched_alloc_graph(sched.get(), gf);
+            ggml_backend_sched_alloc_graph(sched, gf);
 
             res->set_inputs(nullptr);
 
@@ -410,13 +410,13 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
 
         if (defrag_prepare(lctx.graph_max_nodes())) {
-            ggml_backend_sched_reset(sched.get());
+            ggml_backend_sched_reset(sched);
 
             auto * gf = lctx.graph_init();
 
             auto res = build_graph_defrag(lctx, gf);
 
-            ggml_backend_sched_alloc_graph(sched.get(), gf);
+            ggml_backend_sched_alloc_graph(sched, gf);
 
             res->set_inputs(nullptr);
 
@@ -602,7 +602,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         ggml_backend_buffer * bbuf) const {
     const auto & cparams  = lctx.get_cparams();
     const auto & backends = lctx.get_backends();
-    const auto & sched    = lctx.get_sched();
+
+    auto * sched = lctx.get_sched();
 
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
@@ -623,12 +624,12 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
 
-        // TODO: can we simplify/avoid this?
+        // TODO: can we simplify/avoid this? [TAG_BACKENDS]
         if (bbuf) {
             for (const auto & backend : backends) {
                 // Figure out which backend KV cache belongs to
                 if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
+                    ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
                     break;
                 }
             }
@@ -680,7 +681,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
         ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();
 
-    auto * ctx = lctx.get_ctx_compute().get();
+    auto * ctx = lctx.get_ctx_compute();
 
     const auto & cparams = lctx.get_cparams();
 
@@ -733,7 +734,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
         ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();
 
-    auto * ctx = lctx.get_ctx_compute().get();
+    auto * ctx = lctx.get_ctx_compute();
 
     const auto & ids = defrag_info.ids;
 
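Every hunk above makes the same substitution: llama_context::get_sched() and llama_context::get_ctx_compute() now hand out raw pointers rather than references to the owning smart pointers, so call sites pass sched / ctx straight through instead of calling .get(). A minimal, self-contained sketch of that accessor shape, using placeholder types rather than the real ggml/llama declarations, is:

// Standalone illustration of the accessor change; sched_t and context_t are
// placeholders, not the actual ggml_backend_sched / llama_context types.
#include <memory>

struct sched_t {};                                  // stands in for ggml_backend_sched
static void sched_reset(sched_t * s) { (void) s; }  // stands in for ggml_backend_sched_reset

struct context_t {
    // before: callers received the owning unique_ptr and had to write sched.get()
    // after:  the context keeps ownership and exposes only the raw handle
    sched_t * get_sched() const { return sched_owner.get(); }

private:
    std::unique_ptr<sched_t> sched_owner = std::make_unique<sched_t>();
};

int main() {
    context_t lctx;
    auto * sched = lctx.get_sched();  // mirrors the "+" lines in the hunks above
    sched_reset(sched);               // no .get() at the call site anymore
    return 0;
}

Ownership itself is unchanged in this sketch: the context still holds the smart pointer privately, so the scheduler's lifetime stays tied to the context, while callers no longer depend on the wrapper type.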