From ab9fea6923df83c1ef1caaf13ac4ffbafb9e768c Mon Sep 17 00:00:00 2001 From: HaFred Date: Thu, 26 Feb 2026 15:55:43 +0800 Subject: [PATCH 1/2] fix: extract the gt sid/pid with the order preserved --- .../tasks/v1_0/recommendation/utils.py | 30 ++++++++----------- .../tasks/v1_0/recommendation/utils_by_pid.py | 20 ++++++++----- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py b/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py index eb12122..4763cc9 100644 --- a/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py +++ b/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py @@ -7,27 +7,23 @@ from typing import Set, Dict, List, Any -def extract_ids_from_answer(answer: str) -> Set[str]: - """ - Extract all SIDs from answer field - - Args: - answer: String containing multiple <|sid_begin|>...<|sid_end|> patterns - - Returns: - Set of extracted SIDs - - Examples: - >>> extract_ids_from_answer("<|sid_begin|>123<|sid_end|><|sid_begin|>456<|sid_end|>") - {'123', '456'} +def extract_ids_from_answer(answer: str) -> list[str]: + """Extract all SIDs from answer field, preserving original order. + + Returns a deduplicated list that keeps the first occurrence order. + + >>> extract_ids_from_answer_ordered("<|sid_begin|>123<|sid_end|><|sid_begin|>456<|sid_end|>") + ['123', '456'] """ - correct_answers = set() + seen: set[str] = set() + ordered: list[str] = [] for part in answer.split('<|sid_begin|>'): if '<|sid_end|>' in part: sid = part.split('<|sid_end|>')[0].strip() - if sid: - correct_answers.add(sid) - return correct_answers + if sid and sid not in seen: + ordered.append(sid) + seen.add(sid) + return ordered def extract_first_id_from_answer(answer: str) -> str: diff --git a/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py b/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py index f3c56f1..161386f 100644 --- a/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py +++ b/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py @@ -129,15 +129,21 @@ def apply_sid_to_pid_strategy(pid_info_list: List[Dict[str, int]], strategy: str raise ValueError(f"Unknown strategy: {strategy}. Must be 'most_popular_originally', 'most_popular_after_downsampling', or 'random'") -def extract_ids_from_answer(answer: List[int]) -> Set[int]: - """ - Extract all PIDs from answer field (metadata["answer_pid"]) or (metadata["answer_iid"]) +def extract_ids_from_answer(answer: list[int]) -> list[int]: + """Extract all PIDs from answer field, preserving original order. - Examples: - >>> extract_ids_from_answer([123, 456, 789]) - {123, 456, 789} + Returns a deduplicated list that keeps the first occurrence order. + + >>> extract_ids_from_answer_ordered([123, 456, 123, 789]) + [123, 456, 789] """ - return set([pid for pid in answer if pid != 0]) + seen: set[int] = set() + ordered: list[int] = [] + for pid in answer: + if pid != 0 and pid not in seen: + ordered.append(pid) + seen.add(pid) + return ordered def extract_first_id_from_answer(answer: List[int]) -> int: From 0fd1b39d57f038e476ad5e659ea8d957c9405f89 Mon Sep 17 00:00:00 2001 From: HaFred Date: Thu, 26 Feb 2026 15:57:52 +0800 Subject: [PATCH 2/2] fix: extract the gt sid/pid with the order preserved --- benchmarks/benchmark/tasks/v1_0/recommendation/utils.py | 8 ++++---- .../benchmark/tasks/v1_0/recommendation/utils_by_pid.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py b/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py index 4763cc9..42cf2a3 100644 --- a/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py +++ b/benchmarks/benchmark/tasks/v1_0/recommendation/utils.py @@ -12,18 +12,18 @@ def extract_ids_from_answer(answer: str) -> list[str]: Returns a deduplicated list that keeps the first occurrence order. - >>> extract_ids_from_answer_ordered("<|sid_begin|>123<|sid_end|><|sid_begin|>456<|sid_end|>") + >>> extract_ids_from_answer("<|sid_begin|>123<|sid_end|><|sid_begin|>456<|sid_end|>") ['123', '456'] """ seen: set[str] = set() - ordered: list[str] = [] + correct_answers: list[str] = [] for part in answer.split('<|sid_begin|>'): if '<|sid_end|>' in part: sid = part.split('<|sid_end|>')[0].strip() if sid and sid not in seen: - ordered.append(sid) + correct_answers.append(sid) seen.add(sid) - return ordered + return correct_answers def extract_first_id_from_answer(answer: str) -> str: diff --git a/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py b/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py index 161386f..bc06671 100644 --- a/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py +++ b/benchmarks/benchmark/tasks/v1_0/recommendation/utils_by_pid.py @@ -134,16 +134,16 @@ def extract_ids_from_answer(answer: list[int]) -> list[int]: Returns a deduplicated list that keeps the first occurrence order. - >>> extract_ids_from_answer_ordered([123, 456, 123, 789]) + >>> extract_ids_from_answer([123, 456, 123, 789]) [123, 456, 789] """ seen: set[int] = set() - ordered: list[int] = [] + correct_answers: list[int] = [] for pid in answer: if pid != 0 and pid not in seen: - ordered.append(pid) + correct_answers.append(pid) seen.add(pid) - return ordered + return correct_answers def extract_first_id_from_answer(answer: List[int]) -> int: