lmdeploy/messages.py (1 addition, 0 deletions)

@@ -546,6 +546,7 @@ class ScheduleMetrics:
     active_blocks: int = 0
     cached_blocks: int = 0
     free_blocks: int = 0
+    prefix_cache_hit_rate: float = 0


 @dataclass
lmdeploy/metrics/loggers.py (3 additions, 1 deletion)

@@ -118,7 +118,9 @@ def log(self):
            f'Unfinished: {scheduler_stats.num_total_reqs-scheduler_stats.num_finished_reqs} reqs, '
            f'Running: {scheduler_stats.num_running_reqs} reqs, '
            f'Waiting: {scheduler_stats.num_waiting_reqs} reqs, '
-           f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%')
+           f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, '
+           f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%')

         print(log_msg, flush=True)
         self.log_spec_msg()
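For illustration, a standalone snippet showing the format of the extended log line (the metric values here are invented, not from a real run):

```python
# Hypothetical values, only to demonstrate the new log line format.
gpu_cache_usage = 0.42
prefix_cache_hit_rate = 0.317

print(f'GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, '
      f'Prefix cache hit rate: {prefix_cache_hit_rate * 100:.1f}%')
# GPU KV cache usage: 42.0%, Prefix cache hit rate: 31.7%
```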
lmdeploy/metrics/stats.py (4 additions, 0 deletions)

@@ -20,13 +20,15 @@ class SchedulerStats:
         num_running_reqs: Currently executing requests.
         num_waiting_reqs: Requests queued waiting for execution.
         gpu_cache_usage: Fraction of GPU KV blocks utilized (0.0 to 1.0).
+        prefix_cache_hit_rate: Prefix caching hit rate.
     """

     num_total_reqs: int = 0
     num_finished_reqs: int = 0
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
     gpu_cache_usage: float = 0.0
+    prefix_cache_hit_rate: float = 0.0

     def __repr__(self):
         return ('SchedulerStats(\n'
@@ -35,12 +37,14 @@ def __repr__(self):
                 f' num_running_reqs={self.num_running_reqs},\n'
                 f' num_waiting_reqs={self.num_waiting_reqs},\n'
                 f' gpu_cache_usage={self.gpu_cache_usage:.6f},\n'
+                f' prefix_cache_hit_rate={self.prefix_cache_hit_rate:.6f},\n'
                 ')')

     def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics):
         self.num_running_reqs = scheduled_metrics.active_seqs
         self.num_waiting_reqs = scheduled_metrics.waiting_seqs
         self.gpu_cache_usage = 1.0 - (scheduled_metrics.free_blocks / scheduled_metrics.total_blocks)
+        self.prefix_cache_hit_rate = scheduled_metrics.prefix_cache_hit_rate


 class RequestStats:
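A minimal standalone sketch of how the new field flows from ScheduleMetrics into SchedulerStats. These are toy re-definitions keeping only the fields exercised below, not the real lmdeploy classes:

```python
from dataclasses import dataclass


@dataclass
class ScheduleMetrics:
    """Toy stand-in for lmdeploy.messages.ScheduleMetrics."""
    active_seqs: int = 0
    waiting_seqs: int = 0
    total_blocks: int = 0
    free_blocks: int = 0
    prefix_cache_hit_rate: float = 0.0


@dataclass
class SchedulerStats:
    """Toy stand-in for lmdeploy.metrics.stats.SchedulerStats."""
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    gpu_cache_usage: float = 0.0
    prefix_cache_hit_rate: float = 0.0

    def update_from_schedule_metrics(self, m: ScheduleMetrics):
        # Same assignments as the diff above: the hit rate is copied through.
        self.num_running_reqs = m.active_seqs
        self.num_waiting_reqs = m.waiting_seqs
        self.gpu_cache_usage = 1.0 - m.free_blocks / m.total_blocks
        self.prefix_cache_hit_rate = m.prefix_cache_hit_rate


stats = SchedulerStats()
stats.update_from_schedule_metrics(
    ScheduleMetrics(active_seqs=2, waiting_seqs=1, total_blocks=100,
                    free_blocks=60, prefix_cache_hit_rate=0.75))
print(stats.gpu_cache_usage, stats.prefix_cache_hit_rate)  # 0.4 0.75
```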
lmdeploy/pytorch/paging/block_trie.py (25 additions, 0 deletions)

@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import heapq
+from dataclasses import dataclass
 from typing import Dict, Set

 import numpy as np
@@ -10,6 +11,20 @@
 from .block_manager import BaseBlockManager


+@dataclass
+class PrefixCacheStats:
+    """Prefix caching stats."""
+    num_query_tokens: int = 0
+    num_hit_tokens: int = 0
+
+    def reset(self):
+        self.num_query_tokens = 0
+        self.num_hit_tokens = 0
+
+    def hit_rate(self):
+        return 0.0 if self.num_query_tokens <= 0 else float(self.num_hit_tokens) / self.num_query_tokens
+
+
 class Node:
     """Node of block trie."""

@@ -54,6 +69,11 @@ def __init__(self, cache_config: CacheConfig, block_manager: BaseBlockManager):
         # caches with different adapter should not be shared.
         self._roots: Dict[str, Node] = dict()
         self.leaves: Set[Node] = set()
+        self.stats = PrefixCacheStats()
+
+    def hit_rate(self):
+        """Get hit rate."""
+        return self.stats.hit_rate()

     def get_root(self, adapter_name: str):
         """Get root by adapter name."""
@@ -73,6 +93,7 @@ def match(self, seq: SchedulerSequence):
         curr: Node = getattr(logical_blocks, 'last_shared_node', None)
         if curr is None:
             curr = self.get_root(seq.adapter_name)
+        init_num_matched = curr.num_matched
         num_matched = curr.num_matched

         def __match_success(node: Node):
@@ -101,6 +122,10 @@ def __match_success(node: Node):
         seq.logical_blocks.append(matched_blocks)
         seq.set_step(num_matched)

+        # record prefix hit
+        self.stats.num_query_tokens += seq.num_all_ids - init_num_matched
+        self.stats.num_hit_tokens += num_matched - init_num_matched
+
         seq.logical_blocks.last_shared_node = curr

     def allocate(self, seq: SchedulerSequence):
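To make the accounting concrete: each match() call adds the not-yet-matched prompt tokens (seq.num_all_ids - init_num_matched) to num_query_tokens and the newly matched tokens (num_matched - init_num_matched) to num_hit_tokens, so hit_rate() aggregates across requests. A toy walkthrough with invented request sizes:

```python
from dataclasses import dataclass


@dataclass
class PrefixCacheStats:
    """Copied from the diff above."""
    num_query_tokens: int = 0
    num_hit_tokens: int = 0

    def hit_rate(self):
        return 0.0 if self.num_query_tokens <= 0 else float(self.num_hit_tokens) / self.num_query_tokens


stats = PrefixCacheStats()

# Request 1: 128 prompt tokens, the trie matches the first 64 (invented numbers).
stats.num_query_tokens += 128 - 0  # seq.num_all_ids - init_num_matched
stats.num_hit_tokens += 64 - 0     # num_matched - init_num_matched

# Request 2: 100 prompt tokens, no cached prefix.
stats.num_query_tokens += 100
stats.num_hit_tokens += 0

print(f'{stats.hit_rate():.3f}')  # 0.281 (64 / 228)
```

Note that within this diff, reset() is defined but never invoked, so the logged rate accumulates over the engine's lifetime rather than per logging interval.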
lmdeploy/pytorch/paging/scheduler.py (2 additions, 0 deletions)

@@ -235,6 +235,7 @@ def _reorder_waiting():

             # allocate session memory
             self.block_manager.allocate(seq, prealloc_size)
+            self.block_trie.allocate(seq)
             if self.is_ssm:
                 self.state_manager.allocate(seq)
             _to_running(seq)
@@ -451,4 +452,5 @@ def schedule_metrics(self):
             waiting_seqs=self.num_waiting() + self.num_running(),
             total_blocks=self.block_manager.num_gpu_blocks,
             free_blocks=self.block_manager.get_num_free_gpu_blocks(),
+            prefix_cache_hit_rate=self.block_trie.hit_rate(),
         )
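End to end, the scheduler polls the trie each time it reports metrics. A hypothetical wiring sketch; ToyBlockManager, ToyBlockTrie, and the dict return value are invented stand-ins, not lmdeploy's real classes:

```python
class ToyBlockManager:
    """Stand-in for the block manager (invented)."""
    num_gpu_blocks = 100

    def get_num_free_gpu_blocks(self):
        return 60


class ToyBlockTrie:
    """Stand-in for BlockTrie (invented)."""

    def hit_rate(self):
        return 0.75


class ToyScheduler:
    def __init__(self):
        self.block_manager = ToyBlockManager()
        self.block_trie = ToyBlockTrie()

    def schedule_metrics(self):
        # Mirrors the fields assembled in the real schedule_metrics().
        return dict(
            total_blocks=self.block_manager.num_gpu_blocks,
            free_blocks=self.block_manager.get_num_free_gpu_blocks(),
            prefix_cache_hit_rate=self.block_trie.hit_rate(),
        )


print(ToyScheduler().schedule_metrics())
# {'total_blocks': 100, 'free_blocks': 60, 'prefix_cache_hit_rate': 0.75}
```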