Commit 7643b70

config change
2 parents: f921026 + d2623f6

115 files changed, +5778 −1240 lines changed


CMakeLists.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -9,7 +9,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 option(BUILD_UCM_STORE "build ucm store module." ON)
 option(BUILD_UCM_SPARSE "build ucm sparse module." ON)
 option(BUILD_UNIT_TESTS "build all unit test suits." OFF)
-option(BUILD_NUMA "build numactl library" OFF)
+option(BUILD_NUMA "build numactl library." OFF)
+option(DOWNLOAD_DEPENDENCE "download dependence by cmake." ON)
 set(RUNTIME_ENVIRONMENT "simu" CACHE STRING "runtime: simu, ascend, musa or cuda.")

 execute_process(COMMAND git rev-parse HEAD OUTPUT_VARIABLE UCM_COMMIT_ID OUTPUT_STRIP_TRAILING_WHITESPACE)
```

docs/source/getting-started/quick_start.md

Lines changed: 2 additions & 3 deletions
```diff
@@ -77,10 +77,9 @@ vllm serve ${MODEL_PATH} \
         "kv_connector_module_path": "ucm.integration.vllm.uc_connector",
         "kv_role": "kv_both",
         "kv_connector_extra_config": {
-            "ucm_connector_name": "UcmDramStore",
+            "ucm_connector_name": "UcmNfsStore",
             "ucm_connector_config": {
-                "max_cache_size": 5368709120,
-                "kv_block_size": 262144
+                "storage_backends": "/home/test"
             }
         }
     }'
```
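For reference, a minimal sketch of the configuration object that the updated quick start passes to `vllm serve` via `--kv-transfer-config`. Only the fields visible in this hunk are used; `/home/test` is the example NFS path from the doc, not a required value, and any surrounding flags are assumptions.

```python
import json

# Sketch only: the extra-config fragment shown in the updated quick start.
# "/home/test" is the documentation's example storage path, not a requirement.
kv_transfer_config = {
    "kv_connector_module_path": "ucm.integration.vllm.uc_connector",
    "kv_role": "kv_both",
    "kv_connector_extra_config": {
        "ucm_connector_name": "UcmNfsStore",
        "ucm_connector_config": {
            "storage_backends": "/home/test",
        },
    },
}

# The serialized string is what ends up after --kv-transfer-config on the CLI.
print(json.dumps(kv_transfer_config, indent=2))
```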

docs/source/user-guide/prefix-cache/dram_store.md

Lines changed: 0 additions & 133 deletions
This file was deleted.

docs/source/user-guide/prefix-cache/index.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -79,6 +79,5 @@ performance.

 :::{toctree}
 :maxdepth: 1
-dram_store
 nfs_store
 :::
```

setup.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -135,7 +135,7 @@ def _get_package_data_with_so():

 setup(
     name="uc-manager",
-    version="0.1.0rc2",
+    version="0.1.0rc4",
     description="Unified Cache Management",
     author="Unified Cache Team",
     packages=find_packages(),
@@ -144,5 +144,4 @@ def _get_package_data_with_so():
     cmdclass={"build_ext": CMakeBuild},
     package_data=_get_package_data_with_so(),
     zip_safe=False,
-    install_requires=["vllm==0.9.2"],
 )
```

test/common/capture_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,4 @@
+import functools
 from typing import Any, Dict, List

 from common.db_utils import write_to_db
@@ -44,6 +45,7 @@ def post_process(table_name: str, **kwargs) -> List[Dict[str, Any]]:

 # ---------------- decorator ----------------
 def export_vars(func):
+    @functools.wraps(func)
     def wrapper(*args, **kwargs):
         result = func(*args, **kwargs)
         # If the function returns a dict containing '_data' or 'data', post-process it
```
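A small sketch of what the added `functools.wraps` buys. This is a simplified stand-in for `export_vars` (the post-processing body is elided and `sample_case` is a hypothetical function), showing that without `wraps` the decorated function would report the wrapper's metadata instead of its own.

```python
import functools

def export_vars(func):
    @functools.wraps(func)  # copies __name__, __doc__, __module__, etc. from func
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper

@export_vars
def sample_case():
    """Hypothetical test helper."""
    return {"data": []}

# Prints "sample_case Hypothetical test helper."; without wraps it would be "wrapper None".
print(sample_case.__name__, sample_case.__doc__)
```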

test/common/llmperf/__init__.py

Whitespace-only changes.
Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
```python
import json
import os
import random
from pathlib import Path
from typing import Any, Dict, List

import yaml
from common.llmperf.utils.token_benchmark import run_token_benchmark
from common.llmperf.utils.utils import reset_prefill_cache


def run_test_cases(
    llm_api,
    model,
    timeout,
    max_num_completed_requests,
    concurrent_requests,
    mean_input_tokens,
    stddev_input,
    mean_output_tokens,
    stddev_output,
    additional_sampling_params,
    timestamp_dir,
    server_url,
    tokenizer_path,
    hit_rate,
):
    print(f"[INFO] Total {len(mean_input_tokens)} test cases to be executed")
    all_summaries = []
    failed_case = []

    # Clear proxy environment variables
    env = os.environ.copy()
    env.pop("http_proxy", None)
    env.pop("https_proxy", None)

    for i, (
        mean_input,
        mean_output,
        max_completed,
        concurrent,
        additional_sampling_params,
        hit_rate_val,
    ) in enumerate(
        zip(
            mean_input_tokens,
            mean_output_tokens,
            max_num_completed_requests,
            concurrent_requests,
            additional_sampling_params,
            hit_rate,
        ),
        start=1,
    ):
        # for i, case in enumerate(mean_input_tokens):
        print(f"\n>>> Executing test case {i} <<<")
        reset_prefill_cache(env, server_url)
        # Use a fixed random_seed for each test to control PC hit_rate
        random_seed = random.randint(1, 100000)

        try:
            # Determine if two runs are needed (PC hit_rate test)
            if hit_rate_val == 0:
                summary = run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=mean_output,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i, "phase": "normal"},
                )
            else:
                print(
                    f"[INFO] hit_rate > 0 detected, entering prefill mode, PC hit rate: {hit_rate_val} %"
                )
                # hit_rate > 0: first prefill mode
                prefill_mean_input = int(mean_input * hit_rate_val / 100)
                print(
                    f"[INFO] Prefill execution: mean_input_tokens={prefill_mean_input}"
                )
                run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=prefill_mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=2,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i, "phase": "prefill"},
                )
                reset_prefill_cache(env, server_url)
                # Then run normal mode
                print("[INFO] Prefill completed, switching to normal mode execution")
                summary = run_token_benchmark(
                    llm_api=llm_api,
                    model=model,
                    test_timeout_s=timeout,
                    max_num_completed_requests=max_completed,
                    concurrent_requests=concurrent,
                    mean_input_tokens=mean_input,
                    stddev_input_tokens=stddev_input,
                    mean_output_tokens=mean_output,
                    stddev_output_tokens=stddev_output,
                    additional_sampling_params=additional_sampling_params,
                    results_dir=str(timestamp_dir),
                    random_seed=random_seed,
                    openai_api_base=server_url + "/v1",
                    tokenizer_path=tokenizer_path,
                    user_metadata={"case_idx": i, "phase": "normal"},
                )
            all_summaries.append(summary)
        except Exception as e:
            print(f"[Warning] {e}")
            failed_case.append(i)

    return all_summaries, failed_case


def inference_results(
    mean_input_tokens,
    mean_output_tokens,
    max_num_completed_requests,
    concurrent_requests,
    additional_sampling_params,
    hit_rate,
):
    config_file = Path(__file__).parent.parent.parent / "config.yaml"
    print("[INFO] Initialization complete, starting main process")
    print(f"[INFO] Reading configuration file: {config_file}")
    with open(config_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    llm_api = config.get("llm_connection", {}).get("llm_api", "openai")
    model = config.get("llm_connection", {}).get("model", "")
    test_timeout_s = config.get("llm_connection", {}).get("test_timeout_s", 60000)
    stddev_input_tokens = config.get("llm_connection", {}).get(
        "stddev_input_tokens", 0
    )
    stddev_output_tokens = config.get("llm_connection", {}).get(
        "stddev_output_tokens", 0
    )
    timestamp_dir = Path("results")
    timestamp_dir.mkdir(parents=True, exist_ok=True)
    server_url = config.get("llm_connection", {}).get("server_url", "")
    tokenizer_path = config.get("llm_connection", {}).get("tokenizer_path", "")
    print(f"[INFO] Created results directory: {timestamp_dir}")

    all_summaries, failed_cases = run_test_cases(
        llm_api,
        model,
        test_timeout_s,
        max_num_completed_requests,
        concurrent_requests,
        mean_input_tokens,
        stddev_input_tokens,
        mean_output_tokens,
        stddev_output_tokens,
        additional_sampling_params,
        timestamp_dir,
        server_url,
        tokenizer_path,
        hit_rate,
    )
    total = len(mean_input_tokens)
    print(
        f"\n[INFO] All tests completed! Success: {total - len(failed_cases)}/{total}"
    )
    if failed_cases:
        print(f"[WARN] Failed case indices: {failed_cases}")
    return all_summaries
```
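A usage sketch for `inference_results`, with hypothetical values: each index across the parallel lists describes one test case, and a non-zero `hit_rate` entry triggers the two-pass flow (prefill run, cache reset, then the measured normal run). It assumes a reachable server and a valid `config.yaml`, as read by the function above.

```python
# Hypothetical example values; each list position is one benchmark case.
summaries = inference_results(
    mean_input_tokens=[1024, 4096],
    mean_output_tokens=[128, 256],
    max_num_completed_requests=[8, 8],
    concurrent_requests=[2, 4],
    additional_sampling_params=["{}", "{}"],  # per-case sampling overrides (assumed JSON strings)
    hit_rate=[0, 50],  # second case runs a prefill pass first (50 % PC hit rate)
)
print(f"Collected {len(summaries)} summaries")
```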

test/common/llmperf/utils/__init__.py

Whitespace-only changes.
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
```python
# TODO (Avnishn): compute metrics in class
INTER_TOKEN_LAT = "inter_token_latency_s"
TTFT = "ttft_s"
E2E_LAT = "end_to_end_latency_s"
NUM_INPUT_TOKENS = "number_input_tokens"
NUM_OUTPUT_TOKENS = "number_output_tokens"
NUM_TOTAL_TOKENS = "number_total_tokens"
REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s"
ERROR_MSG = "error_msg"
ERROR_CODE = "error_code"
ERROR_CODE_FREQ = "error_code_frequency"
NUM_ERRORS = "number_errors"
OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s"
NUM_COMPLETED_REQUESTS = "num_completed_requests"
COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min"
ERROR_RATE = "error_rate"
NUM_REQ_STARTED = "num_requests_started"
```
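A brief, hypothetical illustration of how these keys might be used to read a per-request metrics record; the constants are restated here so the snippet is self-contained, and the values are made up.

```python
# Restated from the module above so this runs standalone.
TTFT = "ttft_s"
E2E_LAT = "end_to_end_latency_s"
NUM_OUTPUT_TOKENS = "number_output_tokens"

# Made-up record shaped like a single-request metrics dict keyed by the constants.
record = {TTFT: 0.182, E2E_LAT: 3.417, NUM_OUTPUT_TOKENS: 256}
print(
    f"ttft={record[TTFT]:.3f}s  e2e={record[E2E_LAT]:.3f}s  out_tokens={record[NUM_OUTPUT_TOKENS]}"
)
```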
