From cbed950c4fcad6ce0d0faa2e6c5ec2b5779f80c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Sun, 7 Dec 2025 15:17:58 +0800 Subject: [PATCH 01/17] fix: doc fine mode bug --- src/memos/mem_reader/multi_modal_struct.py | 38 +++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 0cb4e154..9a7a3054 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -377,21 +377,37 @@ def _process_string_fine( except Exception as e: logger.error(f"[MultiModalFine] Error calling LLM: {e}") continue - for m in resp.get("memory list", []): + if resp.get("memory list", []): + for m in resp.get("memory list", []): + try: + # Normalize memory_type (same as simple_struct) + memory_type = ( + m.get("memory_type", "LongTermMemory") + .replace("长期记忆", "LongTermMemory") + .replace("用户记忆", "UserMemory") + ) + # Create fine mode memory item (same as simple_struct) + node = self._make_memory_item( + value=m.get("value", ""), + info=info, + memory_type=memory_type, + tags=m.get("tags", []), + key=m.get("key", ""), + sources=sources, # Preserve sources from fast item + background=resp.get("summary", ""), + ) + fine_memory_items.append(node) + except Exception as e: + logger.error(f"[MultiModalFine] parse error: {e}") + elif isinstance(resp, dict): try: - # Normalize memory_type (same as simple_struct) - memory_type = ( - m.get("memory_type", "LongTermMemory") - .replace("长期记忆", "LongTermMemory") - .replace("用户记忆", "UserMemory") - ) # Create fine mode memory item (same as simple_struct) node = self._make_memory_item( - value=m.get("value", ""), + value=resp.get("value", "").strip(), info=info, - memory_type=memory_type, - tags=m.get("tags", []), - key=m.get("key", ""), + memory_type="LongTermMemory", + tags=resp.get("tags", []), + key=resp.get("key", None), sources=sources, # Preserve sources from fast item background=resp.get("summary", ""), ) From 20e08396b113f169b7e34f405098f1059a93b7f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Sun, 7 Dec 2025 15:21:06 +0800 Subject: [PATCH 02/17] fix: doc fine mode bug --- src/memos/mem_reader/multi_modal_struct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 9a7a3054..3a9aa014 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -399,7 +399,7 @@ def _process_string_fine( fine_memory_items.append(node) except Exception as e: logger.error(f"[MultiModalFine] parse error: {e}") - elif isinstance(resp, dict): + elif resp.get("value") and resp.get("key"): try: # Create fine mode memory item (same as simple_struct) node = self._make_memory_item( From fff0fb290627eb9cc6fbeb242222ec398858bd71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Sun, 7 Dec 2025 16:30:19 +0800 Subject: [PATCH 03/17] feat: init longbench_v2 --- evaluation/scripts/longbench/__init__.py | 1 + .../scripts/longbench/longbench_ingestion.py | 306 +++++++++++++++++ .../scripts/longbench/longbench_metric.py | 235 +++++++++++++ .../scripts/longbench/longbench_responses.py | 196 +++++++++++ .../scripts/longbench/longbench_search.py | 309 ++++++++++++++++++ .../scripts/longbench_v2/prepare_data.py | 0 6 files changed, 1047 insertions(+) create mode 100644 evaluation/scripts/longbench/__init__.py 
create mode 100644 evaluation/scripts/longbench/longbench_ingestion.py create mode 100644 evaluation/scripts/longbench/longbench_metric.py create mode 100644 evaluation/scripts/longbench/longbench_responses.py create mode 100644 evaluation/scripts/longbench/longbench_search.py create mode 100644 evaluation/scripts/longbench_v2/prepare_data.py diff --git a/evaluation/scripts/longbench/__init__.py b/evaluation/scripts/longbench/__init__.py new file mode 100644 index 00000000..38cc006e --- /dev/null +++ b/evaluation/scripts/longbench/__init__.py @@ -0,0 +1 @@ +# LongBench evaluation scripts diff --git a/evaluation/scripts/longbench/longbench_ingestion.py b/evaluation/scripts/longbench/longbench_ingestion.py new file mode 100644 index 00000000..e2d2a8e7 --- /dev/null +++ b/evaluation/scripts/longbench/longbench_ingestion.py @@ -0,0 +1,306 @@ +import argparse +import json +import os +import sys + +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone + +from dotenv import load_dotenv +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +# All LongBench datasets +LONGBENCH_DATASETS = [ + "narrativeqa", + "qasper", + "multifieldqa_en", + "multifieldqa_zh", + "hotpotqa", + "2wikimqa", + "musique", + "dureader", + "gov_report", + "qmsum", + "multi_news", + "vcsum", + "trec", + "triviaqa", + "samsum", + "lsht", + "passage_count", + "passage_retrieval_en", + "passage_retrieval_zh", + "lcc", + "repobench-p", +] + + +def ingest_sample(client, sample, dataset_name, sample_idx, frame, version): + """Ingest a single LongBench sample as memories.""" + user_id = f"longbench_{dataset_name}_{sample_idx}_{version}" + conv_id = f"longbench_{dataset_name}_{sample_idx}_{version}" + + # Get context and convert to messages + context = sample.get("context", "") + # not used now: input_text = sample.get("input", "") + + # For memos, we ingest the context as document content + # Split context into chunks if it's too long (optional, memos handles this internally) + # For now, we'll ingest the full context as a single message + messages = [ + { + "role": "assistant", + "content": context, + "chat_time": datetime.now(timezone.utc).isoformat(), + } + ] + + if "memos-api" in frame: + try: + client.add(messages=messages, user_id=user_id, conv_id=conv_id, batch_size=1) + print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") + return False + elif "mem0" in frame: + timestamp = int(datetime.now(timezone.utc).timestamp()) + try: + client.add(messages=messages, user_id=user_id, timestamp=timestamp, batch_size=1) + print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") + return False + elif frame == "memobase": + for m in messages: + m["created_at"] = messages[0]["chat_time"] + try: + client.add(messages=messages, user_id=user_id, batch_size=1) + print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") + return False + elif frame == "memu": + 
try: + client.add(messages=messages, user_id=user_id, iso_date=messages[0]["chat_time"]) + print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") + return False + elif frame == "supermemory": + try: + client.add(messages=messages, user_id=user_id) + print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") + return False + + return False + + +def load_dataset_from_local(dataset_name, use_e=False): + """Load LongBench dataset from local JSONL file.""" + # Determine data directory + data_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "data", + "long_bench_v2", + ) + + # Determine filename + filename = f"{dataset_name}_e.jsonl" if use_e else f"{dataset_name}.jsonl" + + filepath = os.path.join(data_dir, filename) + + if not os.path.exists(filepath): + raise FileNotFoundError(f"Dataset file not found: {filepath}") + + # Load JSONL file + samples = [] + with open(filepath, encoding="utf-8") as f: + for line in f: + if line.strip(): + samples.append(json.loads(line)) + + return samples + + +def ingest_dataset(dataset_name, frame, version, num_workers=10, max_samples=None, use_e=False): + """Ingest a single LongBench dataset.""" + print(f"\n{'=' * 80}") + print(f"🔄 [INGESTING DATASET: {dataset_name.upper()}]".center(80)) + print(f"{'=' * 80}\n") + + # Load dataset from local files + try: + dataset = load_dataset_from_local(dataset_name, use_e) + print(f"Loaded {len(dataset)} samples from {dataset_name}") + except FileNotFoundError as e: + print(f"❌ Error loading dataset {dataset_name}: {e}") + return + except Exception as e: + print(f"❌ Error loading dataset {dataset_name}: {e}") + return + + # Limit samples if specified + if max_samples: + dataset = dataset[:max_samples] + print(f"Limited to {len(dataset)} samples") + + # Initialize client + client = None + if frame == "mem0" or frame == "mem0_graph": + from utils.client import Mem0Client + + client = Mem0Client(enable_graph="graph" in frame) + elif frame == "memos-api": + from utils.client import MemosApiClient + + client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() + elif frame == "memobase": + from utils.client import MemobaseClient + + client = MemobaseClient() + elif frame == "memu": + from utils.client import MemuClient + + client = MemuClient() + elif frame == "supermemory": + from utils.client import SupermemoryClient + + client = SupermemoryClient() + else: + print(f"❌ Unsupported frame: {frame}") + return + + # Ingest samples + success_count = 0 + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for idx, sample in enumerate(dataset): + future = executor.submit( + ingest_sample, client, sample, dataset_name, idx, frame, version + ) + futures.append(future) + + for future in tqdm( + as_completed(futures), + total=len(futures), + desc=f"Ingesting {dataset_name}", + ): + try: + if future.result(): + success_count += 1 + except Exception as e: + print(f"Error processing sample: {e}") + + print(f"\n✅ Completed ingesting {dataset_name}: {success_count}/{len(dataset)} samples") + return success_count + + +def main(frame, version="default", num_workers=10, datasets=None, max_samples=None, 
use_e=False): + """Main ingestion function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH INGESTION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Determine which datasets to process + dataset_list = [d.strip() for d in datasets.split(",")] if datasets else LONGBENCH_DATASETS + + # Filter valid datasets + valid_datasets = [d for d in dataset_list if d in LONGBENCH_DATASETS] + if not valid_datasets: + print("❌ No valid datasets specified") + return + + print(f"Processing {len(valid_datasets)} datasets: {valid_datasets}\n") + + # Ingest each dataset + total_success = 0 + total_samples = 0 + for dataset_name in valid_datasets: + success = ingest_dataset(dataset_name, frame, version, num_workers, max_samples, use_e) + if success is not None: + total_success += success + total_samples += max_samples if max_samples else 200 # Approximate + + print(f"\n{'=' * 80}") + print(f"✅ INGESTION COMPLETE: {total_success} samples ingested".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for saving results", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers", + ) + parser.add_argument( + "--datasets", + type=str, + default=None, + help="Comma-separated list of datasets to process (default: all)", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples per dataset (default: all)", + ) + parser.add_argument( + "--e", + action="store_true", + help="Use LongBench-E variant (uniform length distribution)", + ) + args = parser.parse_args() + + main( + args.lib, + args.version, + args.workers, + args.datasets, + args.max_samples, + args.e, + ) diff --git a/evaluation/scripts/longbench/longbench_metric.py b/evaluation/scripts/longbench/longbench_metric.py new file mode 100644 index 00000000..495a793a --- /dev/null +++ b/evaluation/scripts/longbench/longbench_metric.py @@ -0,0 +1,235 @@ +import argparse +import json +import os +import sys + +import numpy as np + + +# Import LongBench metrics +# Try to import from the LongBench directory +LONGBENCH_METRICS_DIR = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "longbench_v2", + "LongBench-main", + "LongBench", +) + +if os.path.exists(LONGBENCH_METRICS_DIR): + sys.path.insert(0, LONGBENCH_METRICS_DIR) + try: + from metrics import ( + classification_score, + code_sim_score, + count_score, + qa_f1_score, + qa_f1_zh_score, + retrieval_score, + retrieval_zh_score, + rouge_score, + rouge_zh_score, + ) + except ImportError: + print(f"Warning: Could not import metrics from {LONGBENCH_METRICS_DIR}") + print("Please ensure LongBench metrics.py is available") + raise +else: + print(f"Error: LongBench metrics directory not found at {LONGBENCH_METRICS_DIR}") + raise FileNotFoundError("LongBench metrics directory not found") + +# Dataset to metric mapping (from LongBench eval.py) +dataset2metric = { + "narrativeqa": qa_f1_score, + "qasper": qa_f1_score, + "multifieldqa_en": qa_f1_score, + "multifieldqa_zh": qa_f1_zh_score, + "hotpotqa": qa_f1_score, + "2wikimqa": qa_f1_score, + "musique": qa_f1_score, + "dureader": rouge_zh_score, + "gov_report": rouge_score, 
+ "qmsum": rouge_score, + "multi_news": rouge_score, + "vcsum": rouge_zh_score, + "trec": classification_score, + "triviaqa": qa_f1_score, + "samsum": rouge_score, + "lsht": classification_score, + "passage_retrieval_en": retrieval_score, + "passage_count": count_score, + "passage_retrieval_zh": retrieval_zh_score, + "lcc": code_sim_score, + "repobench-p": code_sim_score, +} + + +def scorer(dataset, predictions, answers, all_classes): + """Calculate score for a dataset.""" + total_score = 0.0 + for prediction, ground_truths in zip(predictions, answers, strict=False): + score = 0.0 + # For some tasks, only take the first line + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + + # Calculate max score across all ground truth answers + for ground_truth in ground_truths: + metric_func = dataset2metric.get(dataset) + if metric_func: + if dataset in ["trec", "lsht"]: + # Classification tasks need all_classes + score = max( + score, + metric_func(prediction, ground_truth, all_classes=all_classes), + ) + else: + score = max(score, metric_func(prediction, ground_truth)) + else: + print(f"Warning: No metric function for dataset {dataset}") + + total_score += score + + return round(100 * total_score / len(predictions), 2) if len(predictions) > 0 else 0.0 + + +def scorer_e(dataset, predictions, answers, lengths, all_classes): + """Calculate score for LongBench-E (with length-based analysis).""" + scores = {"0-4k": [], "4-8k": [], "8k+": []} + + for prediction, ground_truths, length in zip(predictions, answers, lengths, strict=False): + score = 0.0 + # For some tasks, only take the first line + if dataset in ["trec", "triviaqa", "samsum", "lsht"]: + prediction = prediction.lstrip("\n").split("\n")[0] + + # Calculate max score across all ground truth answers + metric_func = dataset2metric.get(dataset) + if metric_func: + for ground_truth in ground_truths: + if dataset in ["trec", "lsht"]: + score = max( + score, + metric_func(prediction, ground_truth, all_classes=all_classes), + ) + else: + score = max(score, metric_func(prediction, ground_truth)) + + # Categorize by length + if length < 4000: + scores["0-4k"].append(score) + elif length < 8000: + scores["4-8k"].append(score) + else: + scores["8k+"].append(score) + + # Calculate average scores per length category + for key in scores: + if len(scores[key]) > 0: + scores[key] = round(100 * np.mean(scores[key]), 2) + else: + scores[key] = 0.0 + + return scores + + +def main(frame, version="default", use_e=False): + """Main metric calculation function.""" + print("\n" + "=" * 80) + print(f"📊 LONGBENCH METRICS CALCULATION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load responses + responses_path = f"results/longbench/{frame}-{version}/{frame}_longbench_responses.json" + if not os.path.exists(responses_path): + print(f"❌ Responses not found: {responses_path}") + print("Please run longbench_responses.py first") + return + + with open(responses_path, encoding="utf-8") as f: + responses = json.load(f) + + # Calculate metrics for each dataset + all_scores = {} + overall_scores = [] + + for dataset_name, samples in responses.items(): + print(f"Calculating metrics for {dataset_name}...") + + predictions = [s.get("answer", "") for s in samples] + answers = [s.get("golden_answer", []) for s in samples] + all_classes = samples[0].get("all_classes") if samples else None + + if use_e: + lengths = [s.get("length", 0) for s in samples] + score = scorer_e(dataset_name, predictions, 
answers, lengths, all_classes) + else: + score = scorer(dataset_name, predictions, answers, all_classes) + + all_scores[dataset_name] = score + print(f" {dataset_name}: {score}") + + # For overall average, use single score (not length-based) + if use_e: + # Average across length categories + if isinstance(score, dict): + overall_scores.append(np.mean(list(score.values()))) + else: + overall_scores.append(score) + + # Calculate overall average + if overall_scores: + all_scores["average"] = round(np.mean(overall_scores), 2) + print(f"\nOverall Average: {all_scores['average']}") + + # Save metrics + output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_metrics.json" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(all_scores, f, ensure_ascii=False, indent=4) + + print(f"\n{'=' * 80}") + print(f"✅ METRICS CALCULATION COMPLETE: Results saved to {output_path}".center(80)) + print(f"{'=' * 80}\n") + + # Print summary table + print("\n📊 Summary of Results:") + print("-" * 80) + for dataset, score in sorted(all_scores.items()): + if isinstance(score, dict): + print(f"{dataset:30s}: {score}") + else: + print(f"{dataset:30s}: {score:.2f}%") + print("-" * 80) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for loading results", + ) + parser.add_argument( + "--e", + action="store_true", + help="Use LongBench-E variant (uniform length distribution)", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.e) diff --git a/evaluation/scripts/longbench/longbench_responses.py b/evaluation/scripts/longbench/longbench_responses.py new file mode 100644 index 00000000..2d160160 --- /dev/null +++ b/evaluation/scripts/longbench/longbench_responses.py @@ -0,0 +1,196 @@ +import argparse +import json +import os +import sys + +from concurrent.futures import ThreadPoolExecutor, as_completed +from time import time + +from dotenv import load_dotenv +from openai import OpenAI +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +# Dataset to prompt mapping (from LongBench config) +DATASET_PROMPTS = { + "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", + "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". 
Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", + "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", + "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:", + "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:", + "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:", + "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", + "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:", + "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}", + "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}", + "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}", + "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}", + "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ", + "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + "passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + "lcc": "Please complete the code given below. \n{context}Next line of code:\n", + "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n", +} + + +def generate_response(llm_client, dataset_name, context, input_text): + """Generate response using LLM.""" + # Get prompt template for dataset + prompt_template = DATASET_PROMPTS.get(dataset_name, "{context}\n\nQuestion: {input}\nAnswer:") + + # Format prompt + if "{input}" in prompt_template: + prompt = prompt_template.format(context=context, input=input_text) + else: + # Some prompts don't have {input} placeholder (like gov_report, vcsum) + prompt = prompt_template.format(context=context) + + try: + response = llm_client.chat.completions.create( + model=os.getenv("CHAT_MODEL"), + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, + ], + temperature=0, + ) + result = response.choices[0].message.content or "" + return result + except Exception as e: + print(f"Error generating response: {e}") + return "" + + +def process_sample(search_result, llm_client): + """Process a single sample: generate answer.""" + start = time() + + dataset_name = search_result.get("dataset") + context = search_result.get("context", "") + input_text = search_result.get("input", "") + + # Generate answer + answer = generate_response(llm_client, dataset_name, context, input_text) + + response_duration_ms = (time() - start) * 1000 + + return { + "dataset": dataset_name, + "sample_idx": search_result.get("sample_idx"), + "input": input_text, + "answer": answer, + "golden_answer": search_result.get("answers", []), + "all_classes": search_result.get("all_classes"), + "length": search_result.get("length", 0), + "search_context": context, + "response_duration_ms": response_duration_ms, + "search_duration_ms": search_result.get("search_duration_ms", 0), + } + + +def main(frame, version="default", num_workers=10): + """Main response generation function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH RESPONSE GENERATION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load search results + search_path = f"results/longbench/{frame}-{version}/{frame}_longbench_search_results.json" + if not os.path.exists(search_path): + print(f"❌ Search results not found: {search_path}") + print("Please run longbench_search.py first") + return + + with open(search_path, encoding="utf-8") as f: + search_results = json.load(f) + + # Initialize LLM client + llm_client = OpenAI( + api_key=os.getenv("CHAT_MODEL_API_KEY"), + base_url=os.getenv("CHAT_MODEL_BASE_URL"), + ) + print(f"🔌 Using OpenAI client with model: {os.getenv('CHAT_MODEL')}") + + # Process all samples + all_responses = [] + for dataset_name, samples in search_results.items(): + print(f"\nProcessing {len(samples)} samples from {dataset_name}...") + + with 
ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(process_sample, sample, llm_client) for sample in samples] + + for future in tqdm( + as_completed(futures), + total=len(futures), + desc=f"Generating responses for {dataset_name}", + ): + result = future.result() + if result: + all_responses.append(result) + + # Save responses + output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_responses.json" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Group by dataset + responses_by_dataset = {} + for response in all_responses: + dataset = response["dataset"] + if dataset not in responses_by_dataset: + responses_by_dataset[dataset] = [] + responses_by_dataset[dataset].append(response) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(responses_by_dataset, f, ensure_ascii=False, indent=2) + + print(f"\n{'=' * 80}") + print(f"✅ RESPONSE GENERATION COMPLETE: Results saved to {output_path}".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=[ + "mem0", + "mem0_graph", + "memos-api", + "memos-api-online", + "memobase", + "memu", + "supermemory", + ], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for loading results", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.workers) diff --git a/evaluation/scripts/longbench/longbench_search.py b/evaluation/scripts/longbench/longbench_search.py new file mode 100644 index 00000000..aaf7300e --- /dev/null +++ b/evaluation/scripts/longbench/longbench_search.py @@ -0,0 +1,309 @@ +import argparse +import json +import os +import sys + +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from time import time + +from dotenv import load_dotenv +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +# All LongBench datasets +LONGBENCH_DATASETS = [ + "narrativeqa", + "qasper", + "multifieldqa_en", + "multifieldqa_zh", + "hotpotqa", + "2wikimqa", + "musique", + "dureader", + "gov_report", + "qmsum", + "multi_news", + "vcsum", + "trec", + "triviaqa", + "samsum", + "lsht", + "passage_count", + "passage_retrieval_en", + "passage_retrieval_zh", + "lcc", + "repobench-p", +] + + +def memos_api_search(client, query, user_id, top_k, frame): + """Search using memos API.""" + start = time() + search_results = client.search(query=query, user_id=user_id, top_k=top_k) + + # Format context from search results based on frame type + context = "" + if frame == "memos-api" or frame == "memos-api-online": + if isinstance(search_results, dict) and "text_mem" in search_results: + context = "\n".join([i["memory"] for i in search_results["text_mem"][0]["memories"]]) + if "pref_string" in search_results: + context += f"\n{search_results.get('pref_string', '')}" + elif frame == "mem0" or frame == "mem0_graph": + if isinstance(search_results, dict) and "results" in search_results: + context = "\n".join( + [ + f"{m.get('created_at', '')}: {m.get('memory', '')}" + for m in search_results["results"] + ] + ) + elif frame == "memobase": 
+ context = search_results if isinstance(search_results, str) else "" + elif frame == "memu": + context = "\n".join(search_results) if isinstance(search_results, list) else "" + elif frame == "supermemory": + context = search_results if isinstance(search_results, str) else "" + + duration_ms = (time() - start) * 1000 + return context, duration_ms + + +def process_sample(client, sample, dataset_name, sample_idx, frame, version, top_k): + """Process a single sample: search for relevant memories.""" + user_id = f"longbench_{dataset_name}_{sample_idx}_{version}" + query = sample.get("input", "") + + if not query: + return None + + context, duration_ms = memos_api_search(client, query, user_id, top_k, frame) + + return { + "dataset": dataset_name, + "sample_idx": sample_idx, + "input": query, + "context": context, + "search_duration_ms": duration_ms, + "answers": sample.get("answers", []), + "all_classes": sample.get("all_classes"), + "length": sample.get("length", 0), + } + + +def load_dataset_from_local(dataset_name, use_e=False): + """Load LongBench dataset from local JSONL file.""" + # Determine data directory + data_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "data", + "long_bench_v2", + ) + + # Determine filename + filename = f"{dataset_name}_e.jsonl" if use_e else f"{dataset_name}.jsonl" + + filepath = os.path.join(data_dir, filename) + + if not os.path.exists(filepath): + raise FileNotFoundError(f"Dataset file not found: {filepath}") + + # Load JSONL file + samples = [] + with open(filepath, encoding="utf-8") as f: + for line in f: + if line.strip(): + samples.append(json.loads(line)) + + return samples + + +def process_dataset( + dataset_name, frame, version, top_k=20, num_workers=10, max_samples=None, use_e=False +): + """Process a single dataset: search for all samples.""" + print(f"\n{'=' * 80}") + print(f"🔍 [SEARCHING DATASET: {dataset_name.upper()}]".center(80)) + print(f"{'=' * 80}\n") + + # Load dataset from local files + try: + dataset = load_dataset_from_local(dataset_name, use_e) + print(f"Loaded {len(dataset)} samples from {dataset_name}") + except FileNotFoundError as e: + print(f"❌ Error loading dataset {dataset_name}: {e}") + return [] + except Exception as e: + print(f"❌ Error loading dataset {dataset_name}: {e}") + return [] + + # Limit samples if specified + if max_samples: + dataset = dataset[:max_samples] + print(f"Limited to {len(dataset)} samples") + + # Initialize client + client = None + if frame == "mem0" or frame == "mem0_graph": + from utils.client import Mem0Client + + client = Mem0Client(enable_graph="graph" in frame) + elif frame == "memos-api": + from utils.client import MemosApiClient + + client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() + elif frame == "memobase": + from utils.client import MemobaseClient + + client = MemobaseClient() + elif frame == "memu": + from utils.client import MemuClient + + client = MemuClient() + elif frame == "supermemory": + from utils.client import SupermemoryClient + + client = SupermemoryClient() + else: + print(f"❌ Unsupported frame: {frame}") + return [] + + # Process samples + search_results = [] + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for idx, sample in enumerate(dataset): + future = executor.submit( + process_sample, client, sample, dataset_name, idx, frame, version, top_k + ) + futures.append(future) + + for future in tqdm( + 
as_completed(futures),
+            total=len(futures),
+            desc=f"Searching {dataset_name}",
+        ):
+            result = future.result()
+            if result:
+                search_results.append(result)
+
+    print(f"\n✅ Completed searching {dataset_name}: {len(search_results)} samples")
+    return search_results
+
+
+def main(
+    frame, version="default", num_workers=10, top_k=20, datasets=None, max_samples=None, use_e=False
+):
+    """Main search function."""
+    load_dotenv()
+
+    print("\n" + "=" * 80)
+    print(f"🚀 LONGBENCH SEARCH - {frame.upper()} v{version}".center(80))
+    print("=" * 80 + "\n")
+
+    # Determine which datasets to process
+    dataset_list = [d.strip() for d in datasets.split(",")] if datasets else LONGBENCH_DATASETS
+
+    # Filter valid datasets
+    valid_datasets = [d for d in dataset_list if d in LONGBENCH_DATASETS]
+    if not valid_datasets:
+        print("❌ No valid datasets specified")
+        return
+
+    print(f"Processing {len(valid_datasets)} datasets: {valid_datasets}\n")
+
+    # Create output directory
+    os.makedirs(f"results/longbench/{frame}-{version}/", exist_ok=True)
+
+    # Process each dataset
+    all_results = defaultdict(list)
+    for dataset_name in valid_datasets:
+        results = process_dataset(
+            dataset_name, frame, version, top_k, num_workers, max_samples, use_e
+        )
+        all_results[dataset_name] = results
+
+    # Save results
+    output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_search_results.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(dict(all_results), f, ensure_ascii=False, indent=2)
+
+    print(f"\n{'=' * 80}")
+    print(f"✅ SEARCH COMPLETE: Results saved to {output_path}".center(80))
+    print(f"{'=' * 80}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lib",
+        type=str,
+        choices=[
+            "mem0",
+            "mem0_graph",
+            "memos-api",
+            "memos-api-online",
+            "memobase",
+            "memu",
+            "supermemory",
+        ],
+        default="memos-api",
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        default="default",
+        help="Version identifier for saving results",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=10,
+        help="Number of parallel workers",
+    )
+    parser.add_argument(
+        "--top_k",
+        type=int,
+        default=20,
+        help="Number of results to retrieve in search queries",
+    )
+    parser.add_argument(
+        "--datasets",
+        type=str,
+        default=None,
+        help="Comma-separated list of datasets to process (default: all)",
+    )
+    parser.add_argument(
+        "--max_samples",
+        type=int,
+        default=None,
+        help="Maximum number of samples per dataset (default: all)",
+    )
+    parser.add_argument(
+        "--e",
+        action="store_true",
+        help="Use LongBench-E variant (uniform length distribution)",
+    )
+    args = parser.parse_args()
+
+    main(
+        args.lib,
+        args.version,
+        args.workers,
+        args.top_k,
+        args.datasets,
+        args.max_samples,
+        args.e,
+    )
diff --git a/evaluation/scripts/longbench_v2/prepare_data.py b/evaluation/scripts/longbench_v2/prepare_data.py
new file mode 100644
index 00000000..e69de29b

From 9beabbac3fae9ff0bc8c4335aa663e424840e101 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Sun, 7 Dec 2025 18:03:45 +0800
Subject: [PATCH 04/17] feat: more strict embedder truncation

---
 src/memos/embedders/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/memos/embedders/base.py b/src/memos/embedders/base.py
index d573521f..22ef0d30 100644
--- a/src/memos/embedders/base.py
+++ b/src/memos/embedders/base.py
@@ -79,7 +79,7 @@ def __init__(self, config: BaseEmbedderConfig):
         """Initialize the embedding model with the given 
configuration.""" self.config = config - def _truncate_texts(self, texts: list[str], approx_char_per_token=1.1) -> (list)[str]: + def _truncate_texts(self, texts: list[str], approx_char_per_token=1.0) -> (list)[str]: """ Truncate texts to fit within max_tokens limit if configured. @@ -98,7 +98,7 @@ def _truncate_texts(self, texts: list[str], approx_char_per_token=1.1) -> (list) if len(t) < max_tokens * approx_char_per_token: truncated.append(t) else: - truncated.append(_truncate_text_to_tokens(t, max_tokens)) + truncated.append(t[:max_tokens]) return truncated @abstractmethod From 8f368bb7b347132d7f93f4365f5180628310106c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Sun, 7 Dec 2025 18:41:51 +0800 Subject: [PATCH 05/17] feat: parallel processing fine mode in multi-modal-fine --- src/memos/mem_reader/multi_modal_struct.py | 30 +++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 3a9aa014..4d4faff3 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -358,13 +358,15 @@ def _process_string_fine( if not fast_memory_items: return [] - fine_memory_items = [] + def _process_one_item(fast_item: TextualMemoryItem) -> list[TextualMemoryItem]: + """Process a single fast memory item and return a list of fine items.""" + fine_items: list[TextualMemoryItem] = [] - for fast_item in fast_memory_items: # Extract memory text (string content) mem_str = fast_item.memory or "" if not mem_str.strip(): - continue + return fine_items + sources = fast_item.metadata.sources or [] if not isinstance(sources, list): sources = [sources] @@ -376,7 +378,8 @@ def _process_string_fine( resp = self._get_llm_response(mem_str, custom_tags, sources, prompt_type) except Exception as e: logger.error(f"[MultiModalFine] Error calling LLM: {e}") - continue + return fine_items + if resp.get("memory list", []): for m in resp.get("memory list", []): try: @@ -396,7 +399,7 @@ def _process_string_fine( sources=sources, # Preserve sources from fast item background=resp.get("summary", ""), ) - fine_memory_items.append(node) + fine_items.append(node) except Exception as e: logger.error(f"[MultiModalFine] parse error: {e}") elif resp.get("value") and resp.get("key"): @@ -411,10 +414,25 @@ def _process_string_fine( sources=sources, # Preserve sources from fast item background=resp.get("summary", ""), ) - fine_memory_items.append(node) + fine_items.append(node) except Exception as e: logger.error(f"[MultiModalFine] parse error: {e}") + return fine_items + + fine_memory_items: list[TextualMemoryItem] = [] + + with ContextThreadPoolExecutor(max_workers=8) as executor: + futures = [executor.submit(_process_one_item, item) for item in fast_memory_items] + + for future in concurrent.futures.as_completed(futures): + try: + result = future.result() + if result: + fine_memory_items.extend(result) + except Exception as e: + logger.error(f"[MultiModalFine] worker error: {e}") + return fine_memory_items def _get_llm_tool_trajectory_response(self, mem_str: str) -> dict: From be293bcd059be12cf4226870b4b7c470d3503de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Mon, 8 Dec 2025 11:54:28 +0800 Subject: [PATCH 06/17] feat: update parsers; add chunk info into source; remove origin_part --- src/memos/mem_reader/multi_modal_struct.py | 2 +- .../read_multi_modal/file_content_parser.py | 96 ++++++++++++++----- 
.../read_multi_modal/image_parser.py | 5 - .../read_multi_modal/text_content_parser.py | 1 - .../read_multi_modal/tool_parser.py | 3 - .../read_multi_modal/user_parser.py | 5 - 6 files changed, 74 insertions(+), 38 deletions(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 4d4faff3..ed139f95 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -422,7 +422,7 @@ def _process_one_item(fast_item: TextualMemoryItem) -> list[TextualMemoryItem]: fine_memory_items: list[TextualMemoryItem] = [] - with ContextThreadPoolExecutor(max_workers=8) as executor: + with ContextThreadPoolExecutor(max_workers=30) as executor: futures = [executor.submit(_process_one_item, item) for item in fast_memory_items] for future in concurrent.futures.as_completed(futures): diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 67de3020..fe1b4427 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -167,27 +167,40 @@ def create_source( self, message: File, info: dict[str, Any], + chunk_index: int | None = None, + chunk_total: int | None = None, + chunk_content: str | None = None, ) -> SourceMessage: """Create SourceMessage from file content part.""" if isinstance(message, dict): file_info = message.get("file", {}) - return SourceMessage( - type="file", - doc_path=file_info.get("filename") or file_info.get("file_id", ""), - content=file_info.get("file_data", ""), - original_part=message, - ) - return SourceMessage(type="file", doc_path=str(message)) + source_dict = { + "type": "file", + "doc_path": file_info.get("filename") or file_info.get("file_id", ""), + "content": chunk_content + if chunk_content is not None + else file_info.get("file_data", ""), + } + # Add chunk ordering information if provided + if chunk_index is not None: + source_dict["chunk_index"] = chunk_index + if chunk_total is not None: + source_dict["chunk_total"] = chunk_total + return SourceMessage(**source_dict) + source_dict = {"type": "file", "doc_path": str(message)} + if chunk_index is not None: + source_dict["chunk_index"] = chunk_index + if chunk_total is not None: + source_dict["chunk_total"] = chunk_total + if chunk_content is not None: + source_dict["content"] = chunk_content + return SourceMessage(**source_dict) def rebuild_from_source( self, source: SourceMessage, ) -> File: """Rebuild file content part from SourceMessage.""" - # Use original_part if available - if hasattr(source, "original_part") and source.original_part: - return source.original_part - # Rebuild from source fields return { "type": "file", @@ -311,9 +324,6 @@ def parse_fast( # Split content into chunks content_chunks = self._split_text(content) - # Create source - source = self.create_source(message, info) - # Extract info fields info_ = info.copy() if file_id: @@ -325,12 +335,23 @@ def parse_fast( # (since we don't have role information at this level) memory_type = "LongTermMemory" file_ids = [file_id] if file_id else [] + total_chunks = len(content_chunks) + # Create memory items for each chunk memory_items = [] for chunk_idx, chunk_text in enumerate(content_chunks): if not chunk_text.strip(): continue + # Create source for this specific chunk with its index and content + source = self.create_source( + message, + info, + chunk_index=chunk_idx, + chunk_total=total_chunks, + 
chunk_content=chunk_text, + ) + memory_item = TextualMemoryItem( memory=chunk_text, metadata=TreeNodeTextualMemoryMetadata( @@ -341,7 +362,7 @@ def parse_fast( tags=[ "mode:fast", "multimodal:file", - f"chunk:{chunk_idx + 1}/{len(content_chunks)}", + f"chunk:{chunk_idx + 1}/{total_chunks}", ], key=_derive_key(chunk_text), embedding=self.embedder.embed([chunk_text])[0], @@ -358,6 +379,14 @@ def parse_fast( # If no chunks were created, create a placeholder if not memory_items: + # Create source for placeholder (no chunk index since there are no chunks) + placeholder_source = self.create_source( + message, + info, + chunk_index=None, + chunk_total=0, + chunk_content=content, + ) memory_item = TextualMemoryItem( memory=content, metadata=TreeNodeTextualMemoryMetadata( @@ -369,7 +398,7 @@ def parse_fast( key=_derive_key(content), embedding=self.embedder.embed([content])[0], usage=[], - sources=[source], + sources=[placeholder_source], background="", confidence=0.99, type="fact", @@ -462,7 +491,9 @@ def parse_fine( parsed_text = self._handle_base64(file_data) else: - parsed_text = file_data + # TODO: discuss the proper place for processing + # string file-data + return [] # Priority 2: If file_id is provided but no file_data, try to use file_id as path elif file_id: logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}") @@ -490,9 +521,6 @@ def parse_fine( f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}" ) - # Create source - source = self.create_source(message, info) - # Extract info fields if not info: info = {} @@ -520,8 +548,25 @@ def _make_memory_item( mem_type: str = memory_type, tags: list[str] | None = None, key: str | None = None, + chunk_idx: int | None = None, ) -> TextualMemoryItem: - """Construct memory item with common fields.""" + """Construct memory item with common fields. 
+ + Args: + value: Memory content (chunk text) + mem_type: Memory type + tags: Tags for the memory item + key: Key for the memory item + chunk_idx: Index of the chunk in the document (0-based) + """ + # Create source for this specific chunk with its index and content + chunk_source = self.create_source( + message, + info, + chunk_index=chunk_idx, + chunk_total=total_chunks, + chunk_content=value, + ) return TextualMemoryItem( memory=value, metadata=TreeNodeTextualMemoryMetadata( @@ -533,7 +578,7 @@ def _make_memory_item( key=key if key is not None else _derive_key(value), embedding=self.embedder.embed([value])[0], usage=[], - sources=[source], + sources=[chunk_source], background="", confidence=0.99, type="fact", @@ -555,6 +600,7 @@ def _make_fallback( f"fallback:{reason}", f"chunk:{chunk_idx + 1}/{total_chunks}", ], + chunk_idx=chunk_idx, ) # Handle empty chunks case @@ -563,6 +609,7 @@ def _make_fallback( _make_memory_item( value=parsed_text or "[File: empty content]", tags=["mode:fine", "multimodal:file"], + chunk_idx=None, ) ] @@ -591,6 +638,7 @@ def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem: mem_type=llm_mem_type, tags=tags, key=response_json.get("key"), + chunk_idx=chunk_idx, ) except Exception as e: logger.error(f"[FileContentParser] LLM error for chunk {chunk_idx}: {e}") @@ -637,6 +685,8 @@ def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem: return memory_items or [ _make_memory_item( - value=parsed_text or "[File: empty content]", tags=["mode:fine", "multimodal:file"] + value=parsed_text or "[File: empty content]", + tags=["mode:fine", "multimodal:file"], + chunk_idx=None, ) ] diff --git a/src/memos/mem_reader/read_multi_modal/image_parser.py b/src/memos/mem_reader/read_multi_modal/image_parser.py index 88991fbe..5a19393a 100644 --- a/src/memos/mem_reader/read_multi_modal/image_parser.py +++ b/src/memos/mem_reader/read_multi_modal/image_parser.py @@ -53,7 +53,6 @@ def create_source( return SourceMessage( type="image", content=url, - original_part=message, url=url, detail=detail, ) @@ -64,10 +63,6 @@ def rebuild_from_source( source: SourceMessage, ) -> ChatCompletionContentPartImageParam: """Rebuild image_url content part from SourceMessage.""" - # Use original_part if available - if hasattr(source, "original_part") and source.original_part: - return source.original_part - # Rebuild from source fields url = getattr(source, "url", "") or (source.content or "").replace("[image_url]: ", "") detail = getattr(source, "detail", "auto") diff --git a/src/memos/mem_reader/read_multi_modal/text_content_parser.py b/src/memos/mem_reader/read_multi_modal/text_content_parser.py index 5ff0a76f..febc166e 100644 --- a/src/memos/mem_reader/read_multi_modal/text_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/text_content_parser.py @@ -51,7 +51,6 @@ def create_source( return SourceMessage( type="text", content=text, - original_part=message, ) return SourceMessage(type="text", content=str(message)) diff --git a/src/memos/mem_reader/read_multi_modal/tool_parser.py b/src/memos/mem_reader/read_multi_modal/tool_parser.py index 09bd9e9d..e13b684a 100644 --- a/src/memos/mem_reader/read_multi_modal/tool_parser.py +++ b/src/memos/mem_reader/read_multi_modal/tool_parser.py @@ -79,7 +79,6 @@ def create_source( filename=file_info.get("filename", ""), file_id=file_info.get("file_id", ""), tool_call_id=tool_call_id, - original_part=part, ) ) elif part_type == "image_url": @@ -93,7 +92,6 @@ def create_source( content=file_info.get("url", ""), 
detail=file_info.get("detail", "auto"), tool_call_id=tool_call_id, - original_part=part, ) ) elif part_type == "input_audio": @@ -107,7 +105,6 @@ def create_source( content=file_info.get("data", ""), format=file_info.get("format", "wav"), tool_call_id=tool_call_id, - original_part=part, ) ) else: diff --git a/src/memos/mem_reader/read_multi_modal/user_parser.py b/src/memos/mem_reader/read_multi_modal/user_parser.py index c7b8ad4e..359506e1 100644 --- a/src/memos/mem_reader/read_multi_modal/user_parser.py +++ b/src/memos/mem_reader/read_multi_modal/user_parser.py @@ -68,8 +68,6 @@ def create_source( chat_time=chat_time, message_id=message_id, content=part.get("text", ""), - # Save original part for reconstruction - original_part=part, ) ) elif part_type == "file": @@ -82,7 +80,6 @@ def create_source( message_id=message_id, doc_path=file_info.get("filename") or file_info.get("file_id", ""), content=file_info.get("file_data", ""), - original_part=part, ) ) elif part_type == "image_url": @@ -94,7 +91,6 @@ def create_source( chat_time=chat_time, message_id=message_id, image_path=image_info.get("url"), - original_part=part, ) ) else: @@ -106,7 +102,6 @@ def create_source( chat_time=chat_time, message_id=message_id, content=f"[{part_type}]", - original_part=part, ) ) else: From 2edd0a3082e56d172abcc010fa381cd342002950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Mon, 8 Dec 2025 12:09:19 +0800 Subject: [PATCH 07/17] feat: modify chunk_content in file-fine-parser --- src/memos/mem_reader/read_multi_modal/file_content_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 75b627af..cce99e76 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -547,6 +547,7 @@ def _make_memory_item( tags: list[str] | None = None, key: str | None = None, chunk_idx: int | None = None, + chunk_content: str | None = None, ) -> TextualMemoryItem: """Construct memory item with common fields. 
@@ -563,7 +564,7 @@ def _make_memory_item( info, chunk_index=chunk_idx, chunk_total=total_chunks, - chunk_content=value, + chunk_content=chunk_content, ) return TextualMemoryItem( memory=value, @@ -599,6 +600,7 @@ def _make_fallback( f"chunk:{chunk_idx + 1}/{total_chunks}", ], chunk_idx=chunk_idx, + chunk_content=chunk_text, ) # Handle empty chunks case @@ -637,6 +639,7 @@ def _process_chunk(chunk_idx: int, chunk_text: str) -> TextualMemoryItem: tags=tags, key=response_json.get("key"), chunk_idx=chunk_idx, + chunk_content=chunk_text, ) except Exception as e: logger.error(f"[FileContentParser] LLM error for chunk {chunk_idx}: {e}") From f80896e74801832135ab1039a7df25b4c8af6a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Mon, 8 Dec 2025 16:08:32 +0800 Subject: [PATCH 08/17] fix: token counter bug --- evaluation/scripts/longbench/__init__.py | 1 - .../scripts/longbench/longbench_ingestion.py | 306 ----------------- .../scripts/longbench/longbench_metric.py | 235 ------------- .../scripts/longbench/longbench_responses.py | 196 ----------- .../scripts/longbench/longbench_search.py | 309 ------------------ .../scripts/longbench_v2/prepare_data.py | 0 src/memos/embedders/base.py | 2 +- src/memos/mem_reader/simple_struct.py | 2 +- .../tree_text_memory/organize/manager.py | 6 +- 9 files changed, 5 insertions(+), 1052 deletions(-) delete mode 100644 evaluation/scripts/longbench/__init__.py delete mode 100644 evaluation/scripts/longbench/longbench_ingestion.py delete mode 100644 evaluation/scripts/longbench/longbench_metric.py delete mode 100644 evaluation/scripts/longbench/longbench_responses.py delete mode 100644 evaluation/scripts/longbench/longbench_search.py delete mode 100644 evaluation/scripts/longbench_v2/prepare_data.py diff --git a/evaluation/scripts/longbench/__init__.py b/evaluation/scripts/longbench/__init__.py deleted file mode 100644 index 38cc006e..00000000 --- a/evaluation/scripts/longbench/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# LongBench evaluation scripts diff --git a/evaluation/scripts/longbench/longbench_ingestion.py b/evaluation/scripts/longbench/longbench_ingestion.py deleted file mode 100644 index e2d2a8e7..00000000 --- a/evaluation/scripts/longbench/longbench_ingestion.py +++ /dev/null @@ -1,306 +0,0 @@ -import argparse -import json -import os -import sys - -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime, timezone - -from dotenv import load_dotenv -from tqdm import tqdm - - -ROOT_DIR = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") - -sys.path.insert(0, ROOT_DIR) -sys.path.insert(0, EVAL_SCRIPTS_DIR) - - -# All LongBench datasets -LONGBENCH_DATASETS = [ - "narrativeqa", - "qasper", - "multifieldqa_en", - "multifieldqa_zh", - "hotpotqa", - "2wikimqa", - "musique", - "dureader", - "gov_report", - "qmsum", - "multi_news", - "vcsum", - "trec", - "triviaqa", - "samsum", - "lsht", - "passage_count", - "passage_retrieval_en", - "passage_retrieval_zh", - "lcc", - "repobench-p", -] - - -def ingest_sample(client, sample, dataset_name, sample_idx, frame, version): - """Ingest a single LongBench sample as memories.""" - user_id = f"longbench_{dataset_name}_{sample_idx}_{version}" - conv_id = f"longbench_{dataset_name}_{sample_idx}_{version}" - - # Get context and convert to messages - context = sample.get("context", "") - # not used now: input_text = sample.get("input", "") - - # For 
memos, we ingest the context as document content - # Split context into chunks if it's too long (optional, memos handles this internally) - # For now, we'll ingest the full context as a single message - messages = [ - { - "role": "assistant", - "content": context, - "chat_time": datetime.now(timezone.utc).isoformat(), - } - ] - - if "memos-api" in frame: - try: - client.add(messages=messages, user_id=user_id, conv_id=conv_id, batch_size=1) - print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") - return False - elif "mem0" in frame: - timestamp = int(datetime.now(timezone.utc).timestamp()) - try: - client.add(messages=messages, user_id=user_id, timestamp=timestamp, batch_size=1) - print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") - return False - elif frame == "memobase": - for m in messages: - m["created_at"] = messages[0]["chat_time"] - try: - client.add(messages=messages, user_id=user_id, batch_size=1) - print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") - return False - elif frame == "memu": - try: - client.add(messages=messages, user_id=user_id, iso_date=messages[0]["chat_time"]) - print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") - return False - elif frame == "supermemory": - try: - client.add(messages=messages, user_id=user_id) - print(f"✅ [{frame}] Ingested sample {sample_idx} from {dataset_name}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx} from {dataset_name}: {e}") - return False - - return False - - -def load_dataset_from_local(dataset_name, use_e=False): - """Load LongBench dataset from local JSONL file.""" - # Determine data directory - data_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - "data", - "long_bench_v2", - ) - - # Determine filename - filename = f"{dataset_name}_e.jsonl" if use_e else f"{dataset_name}.jsonl" - - filepath = os.path.join(data_dir, filename) - - if not os.path.exists(filepath): - raise FileNotFoundError(f"Dataset file not found: {filepath}") - - # Load JSONL file - samples = [] - with open(filepath, encoding="utf-8") as f: - for line in f: - if line.strip(): - samples.append(json.loads(line)) - - return samples - - -def ingest_dataset(dataset_name, frame, version, num_workers=10, max_samples=None, use_e=False): - """Ingest a single LongBench dataset.""" - print(f"\n{'=' * 80}") - print(f"🔄 [INGESTING DATASET: {dataset_name.upper()}]".center(80)) - print(f"{'=' * 80}\n") - - # Load dataset from local files - try: - dataset = load_dataset_from_local(dataset_name, use_e) - print(f"Loaded {len(dataset)} samples from {dataset_name}") - except FileNotFoundError as e: - print(f"❌ Error loading dataset {dataset_name}: {e}") - return - except Exception as e: - print(f"❌ Error loading dataset {dataset_name}: {e}") - return - - # Limit samples if specified - if max_samples: - dataset = dataset[:max_samples] - print(f"Limited to {len(dataset)} samples") - - # Initialize client - client = None 
- if frame == "mem0" or frame == "mem0_graph": - from utils.client import Mem0Client - - client = Mem0Client(enable_graph="graph" in frame) - elif frame == "memos-api": - from utils.client import MemosApiClient - - client = MemosApiClient() - elif frame == "memos-api-online": - from utils.client import MemosApiOnlineClient - - client = MemosApiOnlineClient() - elif frame == "memobase": - from utils.client import MemobaseClient - - client = MemobaseClient() - elif frame == "memu": - from utils.client import MemuClient - - client = MemuClient() - elif frame == "supermemory": - from utils.client import SupermemoryClient - - client = SupermemoryClient() - else: - print(f"❌ Unsupported frame: {frame}") - return - - # Ingest samples - success_count = 0 - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [] - for idx, sample in enumerate(dataset): - future = executor.submit( - ingest_sample, client, sample, dataset_name, idx, frame, version - ) - futures.append(future) - - for future in tqdm( - as_completed(futures), - total=len(futures), - desc=f"Ingesting {dataset_name}", - ): - try: - if future.result(): - success_count += 1 - except Exception as e: - print(f"Error processing sample: {e}") - - print(f"\n✅ Completed ingesting {dataset_name}: {success_count}/{len(dataset)} samples") - return success_count - - -def main(frame, version="default", num_workers=10, datasets=None, max_samples=None, use_e=False): - """Main ingestion function.""" - load_dotenv() - - print("\n" + "=" * 80) - print(f"🚀 LONGBENCH INGESTION - {frame.upper()} v{version}".center(80)) - print("=" * 80 + "\n") - - # Determine which datasets to process - dataset_list = [d.strip() for d in datasets.split(",")] if datasets else LONGBENCH_DATASETS - - # Filter valid datasets - valid_datasets = [d for d in dataset_list if d in LONGBENCH_DATASETS] - if not valid_datasets: - print("❌ No valid datasets specified") - return - - print(f"Processing {len(valid_datasets)} datasets: {valid_datasets}\n") - - # Ingest each dataset - total_success = 0 - total_samples = 0 - for dataset_name in valid_datasets: - success = ingest_dataset(dataset_name, frame, version, num_workers, max_samples, use_e) - if success is not None: - total_success += success - total_samples += max_samples if max_samples else 200 # Approximate - - print(f"\n{'=' * 80}") - print(f"✅ INGESTION COMPLETE: {total_success} samples ingested".center(80)) - print(f"{'=' * 80}\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--lib", - type=str, - choices=[ - "mem0", - "mem0_graph", - "memos-api", - "memos-api-online", - "memobase", - "memu", - "supermemory", - ], - default="memos-api", - ) - parser.add_argument( - "--version", - type=str, - default="default", - help="Version identifier for saving results", - ) - parser.add_argument( - "--workers", - type=int, - default=10, - help="Number of parallel workers", - ) - parser.add_argument( - "--datasets", - type=str, - default=None, - help="Comma-separated list of datasets to process (default: all)", - ) - parser.add_argument( - "--max_samples", - type=int, - default=None, - help="Maximum number of samples per dataset (default: all)", - ) - parser.add_argument( - "--e", - action="store_true", - help="Use LongBench-E variant (uniform length distribution)", - ) - args = parser.parse_args() - - main( - args.lib, - args.version, - args.workers, - args.datasets, - args.max_samples, - args.e, - ) diff --git a/evaluation/scripts/longbench/longbench_metric.py 
b/evaluation/scripts/longbench/longbench_metric.py deleted file mode 100644 index 495a793a..00000000 --- a/evaluation/scripts/longbench/longbench_metric.py +++ /dev/null @@ -1,235 +0,0 @@ -import argparse -import json -import os -import sys - -import numpy as np - - -# Import LongBench metrics -# Try to import from the LongBench directory -LONGBENCH_METRICS_DIR = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - "longbench_v2", - "LongBench-main", - "LongBench", -) - -if os.path.exists(LONGBENCH_METRICS_DIR): - sys.path.insert(0, LONGBENCH_METRICS_DIR) - try: - from metrics import ( - classification_score, - code_sim_score, - count_score, - qa_f1_score, - qa_f1_zh_score, - retrieval_score, - retrieval_zh_score, - rouge_score, - rouge_zh_score, - ) - except ImportError: - print(f"Warning: Could not import metrics from {LONGBENCH_METRICS_DIR}") - print("Please ensure LongBench metrics.py is available") - raise -else: - print(f"Error: LongBench metrics directory not found at {LONGBENCH_METRICS_DIR}") - raise FileNotFoundError("LongBench metrics directory not found") - -# Dataset to metric mapping (from LongBench eval.py) -dataset2metric = { - "narrativeqa": qa_f1_score, - "qasper": qa_f1_score, - "multifieldqa_en": qa_f1_score, - "multifieldqa_zh": qa_f1_zh_score, - "hotpotqa": qa_f1_score, - "2wikimqa": qa_f1_score, - "musique": qa_f1_score, - "dureader": rouge_zh_score, - "gov_report": rouge_score, - "qmsum": rouge_score, - "multi_news": rouge_score, - "vcsum": rouge_zh_score, - "trec": classification_score, - "triviaqa": qa_f1_score, - "samsum": rouge_score, - "lsht": classification_score, - "passage_retrieval_en": retrieval_score, - "passage_count": count_score, - "passage_retrieval_zh": retrieval_zh_score, - "lcc": code_sim_score, - "repobench-p": code_sim_score, -} - - -def scorer(dataset, predictions, answers, all_classes): - """Calculate score for a dataset.""" - total_score = 0.0 - for prediction, ground_truths in zip(predictions, answers, strict=False): - score = 0.0 - # For some tasks, only take the first line - if dataset in ["trec", "triviaqa", "samsum", "lsht"]: - prediction = prediction.lstrip("\n").split("\n")[0] - - # Calculate max score across all ground truth answers - for ground_truth in ground_truths: - metric_func = dataset2metric.get(dataset) - if metric_func: - if dataset in ["trec", "lsht"]: - # Classification tasks need all_classes - score = max( - score, - metric_func(prediction, ground_truth, all_classes=all_classes), - ) - else: - score = max(score, metric_func(prediction, ground_truth)) - else: - print(f"Warning: No metric function for dataset {dataset}") - - total_score += score - - return round(100 * total_score / len(predictions), 2) if len(predictions) > 0 else 0.0 - - -def scorer_e(dataset, predictions, answers, lengths, all_classes): - """Calculate score for LongBench-E (with length-based analysis).""" - scores = {"0-4k": [], "4-8k": [], "8k+": []} - - for prediction, ground_truths, length in zip(predictions, answers, lengths, strict=False): - score = 0.0 - # For some tasks, only take the first line - if dataset in ["trec", "triviaqa", "samsum", "lsht"]: - prediction = prediction.lstrip("\n").split("\n")[0] - - # Calculate max score across all ground truth answers - metric_func = dataset2metric.get(dataset) - if metric_func: - for ground_truth in ground_truths: - if dataset in ["trec", "lsht"]: - score = max( - score, - metric_func(prediction, ground_truth, all_classes=all_classes), - ) - else: - score = max(score, 
metric_func(prediction, ground_truth)) - - # Categorize by length - if length < 4000: - scores["0-4k"].append(score) - elif length < 8000: - scores["4-8k"].append(score) - else: - scores["8k+"].append(score) - - # Calculate average scores per length category - for key in scores: - if len(scores[key]) > 0: - scores[key] = round(100 * np.mean(scores[key]), 2) - else: - scores[key] = 0.0 - - return scores - - -def main(frame, version="default", use_e=False): - """Main metric calculation function.""" - print("\n" + "=" * 80) - print(f"📊 LONGBENCH METRICS CALCULATION - {frame.upper()} v{version}".center(80)) - print("=" * 80 + "\n") - - # Load responses - responses_path = f"results/longbench/{frame}-{version}/{frame}_longbench_responses.json" - if not os.path.exists(responses_path): - print(f"❌ Responses not found: {responses_path}") - print("Please run longbench_responses.py first") - return - - with open(responses_path, encoding="utf-8") as f: - responses = json.load(f) - - # Calculate metrics for each dataset - all_scores = {} - overall_scores = [] - - for dataset_name, samples in responses.items(): - print(f"Calculating metrics for {dataset_name}...") - - predictions = [s.get("answer", "") for s in samples] - answers = [s.get("golden_answer", []) for s in samples] - all_classes = samples[0].get("all_classes") if samples else None - - if use_e: - lengths = [s.get("length", 0) for s in samples] - score = scorer_e(dataset_name, predictions, answers, lengths, all_classes) - else: - score = scorer(dataset_name, predictions, answers, all_classes) - - all_scores[dataset_name] = score - print(f" {dataset_name}: {score}") - - # For overall average, use single score (not length-based) - if use_e: - # Average across length categories - if isinstance(score, dict): - overall_scores.append(np.mean(list(score.values()))) - else: - overall_scores.append(score) - - # Calculate overall average - if overall_scores: - all_scores["average"] = round(np.mean(overall_scores), 2) - print(f"\nOverall Average: {all_scores['average']}") - - # Save metrics - output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_metrics.json" - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(all_scores, f, ensure_ascii=False, indent=4) - - print(f"\n{'=' * 80}") - print(f"✅ METRICS CALCULATION COMPLETE: Results saved to {output_path}".center(80)) - print(f"{'=' * 80}\n") - - # Print summary table - print("\n📊 Summary of Results:") - print("-" * 80) - for dataset, score in sorted(all_scores.items()): - if isinstance(score, dict): - print(f"{dataset:30s}: {score}") - else: - print(f"{dataset:30s}: {score:.2f}%") - print("-" * 80) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--lib", - type=str, - choices=[ - "mem0", - "mem0_graph", - "memos-api", - "memos-api-online", - "memobase", - "memu", - "supermemory", - ], - default="memos-api", - ) - parser.add_argument( - "--version", - type=str, - default="default", - help="Version identifier for loading results", - ) - parser.add_argument( - "--e", - action="store_true", - help="Use LongBench-E variant (uniform length distribution)", - ) - args = parser.parse_args() - - main(args.lib, args.version, args.e) diff --git a/evaluation/scripts/longbench/longbench_responses.py b/evaluation/scripts/longbench/longbench_responses.py deleted file mode 100644 index 2d160160..00000000 --- a/evaluation/scripts/longbench/longbench_responses.py +++ /dev/null @@ -1,196 
+0,0 @@ -import argparse -import json -import os -import sys - -from concurrent.futures import ThreadPoolExecutor, as_completed -from time import time - -from dotenv import load_dotenv -from openai import OpenAI -from tqdm import tqdm - - -ROOT_DIR = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") - -sys.path.insert(0, ROOT_DIR) -sys.path.insert(0, EVAL_SCRIPTS_DIR) - - -# Dataset to prompt mapping (from LongBench config) -DATASET_PROMPTS = { - "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", - "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', - "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", - "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", - "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", - "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", - "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", - "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:", - "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:", - "qmsum": "You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:", - "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", - "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:", - "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}", - "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}", - "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}", - "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}", - "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ", - "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', - "passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', - "lcc": "Please complete the code given below. \n{context}Next line of code:\n", - "repobench-p": "Please complete the code given below. 
\n{context}{input}Next line of code:\n", -} - - -def generate_response(llm_client, dataset_name, context, input_text): - """Generate response using LLM.""" - # Get prompt template for dataset - prompt_template = DATASET_PROMPTS.get(dataset_name, "{context}\n\nQuestion: {input}\nAnswer:") - - # Format prompt - if "{input}" in prompt_template: - prompt = prompt_template.format(context=context, input=input_text) - else: - # Some prompts don't have {input} placeholder (like gov_report, vcsum) - prompt = prompt_template.format(context=context) - - try: - response = llm_client.chat.completions.create( - model=os.getenv("CHAT_MODEL"), - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ], - temperature=0, - ) - result = response.choices[0].message.content or "" - return result - except Exception as e: - print(f"Error generating response: {e}") - return "" - - -def process_sample(search_result, llm_client): - """Process a single sample: generate answer.""" - start = time() - - dataset_name = search_result.get("dataset") - context = search_result.get("context", "") - input_text = search_result.get("input", "") - - # Generate answer - answer = generate_response(llm_client, dataset_name, context, input_text) - - response_duration_ms = (time() - start) * 1000 - - return { - "dataset": dataset_name, - "sample_idx": search_result.get("sample_idx"), - "input": input_text, - "answer": answer, - "golden_answer": search_result.get("answers", []), - "all_classes": search_result.get("all_classes"), - "length": search_result.get("length", 0), - "search_context": context, - "response_duration_ms": response_duration_ms, - "search_duration_ms": search_result.get("search_duration_ms", 0), - } - - -def main(frame, version="default", num_workers=10): - """Main response generation function.""" - load_dotenv() - - print("\n" + "=" * 80) - print(f"🚀 LONGBENCH RESPONSE GENERATION - {frame.upper()} v{version}".center(80)) - print("=" * 80 + "\n") - - # Load search results - search_path = f"results/longbench/{frame}-{version}/{frame}_longbench_search_results.json" - if not os.path.exists(search_path): - print(f"❌ Search results not found: {search_path}") - print("Please run longbench_search.py first") - return - - with open(search_path, encoding="utf-8") as f: - search_results = json.load(f) - - # Initialize LLM client - llm_client = OpenAI( - api_key=os.getenv("CHAT_MODEL_API_KEY"), - base_url=os.getenv("CHAT_MODEL_BASE_URL"), - ) - print(f"🔌 Using OpenAI client with model: {os.getenv('CHAT_MODEL')}") - - # Process all samples - all_responses = [] - for dataset_name, samples in search_results.items(): - print(f"\nProcessing {len(samples)} samples from {dataset_name}...") - - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [executor.submit(process_sample, sample, llm_client) for sample in samples] - - for future in tqdm( - as_completed(futures), - total=len(futures), - desc=f"Generating responses for {dataset_name}", - ): - result = future.result() - if result: - all_responses.append(result) - - # Save responses - output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_responses.json" - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - # Group by dataset - responses_by_dataset = {} - for response in all_responses: - dataset = response["dataset"] - if dataset not in responses_by_dataset: - responses_by_dataset[dataset] = [] - responses_by_dataset[dataset].append(response) - - with open(output_path, "w", 
encoding="utf-8") as f: - json.dump(responses_by_dataset, f, ensure_ascii=False, indent=2) - - print(f"\n{'=' * 80}") - print(f"✅ RESPONSE GENERATION COMPLETE: Results saved to {output_path}".center(80)) - print(f"{'=' * 80}\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--lib", - type=str, - choices=[ - "mem0", - "mem0_graph", - "memos-api", - "memos-api-online", - "memobase", - "memu", - "supermemory", - ], - default="memos-api", - ) - parser.add_argument( - "--version", - type=str, - default="default", - help="Version identifier for loading results", - ) - parser.add_argument( - "--workers", - type=int, - default=10, - help="Number of parallel workers", - ) - args = parser.parse_args() - - main(args.lib, args.version, args.workers) diff --git a/evaluation/scripts/longbench/longbench_search.py b/evaluation/scripts/longbench/longbench_search.py deleted file mode 100644 index aaf7300e..00000000 --- a/evaluation/scripts/longbench/longbench_search.py +++ /dev/null @@ -1,309 +0,0 @@ -import argparse -import json -import os -import sys - -from collections import defaultdict -from concurrent.futures import ThreadPoolExecutor, as_completed -from time import time - -from dotenv import load_dotenv -from tqdm import tqdm - - -ROOT_DIR = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") - -sys.path.insert(0, ROOT_DIR) -sys.path.insert(0, EVAL_SCRIPTS_DIR) - - -# All LongBench datasets -LONGBENCH_DATASETS = [ - "narrativeqa", - "qasper", - "multifieldqa_en", - "multifieldqa_zh", - "hotpotqa", - "2wikimqa", - "musique", - "dureader", - "gov_report", - "qmsum", - "multi_news", - "vcsum", - "trec", - "triviaqa", - "samsum", - "lsht", - "passage_count", - "passage_retrieval_en", - "passage_retrieval_zh", - "lcc", - "repobench-p", -] - - -def memos_api_search(client, query, user_id, top_k, frame): - """Search using memos API.""" - start = time() - search_results = client.search(query=query, user_id=user_id, top_k=top_k) - - # Format context from search results based on frame type - context = "" - if frame == "memos-api" or frame == "memos-api-online": - if isinstance(search_results, dict) and "text_mem" in search_results: - context = "\n".join([i["memory"] for i in search_results["text_mem"][0]["memories"]]) - if "pref_string" in search_results: - context += f"\n{search_results.get('pref_string', '')}" - elif frame == "mem0" or frame == "mem0_graph": - if isinstance(search_results, dict) and "results" in search_results: - context = "\n".join( - [ - f"{m.get('created_at', '')}: {m.get('memory', '')}" - for m in search_results["results"] - ] - ) - elif frame == "memobase": - context = search_results if isinstance(search_results, str) else "" - elif frame == "memu": - context = "\n".join(search_results) if isinstance(search_results, list) else "" - elif frame == "supermemory": - context = search_results if isinstance(search_results, str) else "" - - duration_ms = (time() - start) * 1000 - return context, duration_ms - - -def process_sample(client, sample, dataset_name, sample_idx, frame, version, top_k): - """Process a single sample: search for relevant memories.""" - user_id = f"longbench_{dataset_name}_{sample_idx}_{version}" - query = sample.get("input", "") - - if not query: - return None - - context, duration_ms = memos_api_search(client, query, user_id, top_k, frame) - - return { - "dataset": dataset_name, - "sample_idx": sample_idx, - 
"input": query, - "context": context, - "search_duration_ms": duration_ms, - "answers": sample.get("answers", []), - "all_classes": sample.get("all_classes"), - "length": sample.get("length", 0), - } - - -def load_dataset_from_local(dataset_name, use_e=False): - """Load LongBench dataset from local JSONL file.""" - # Determine data directory - data_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - "data", - "long_bench_v2", - ) - - # Determine filename - filename = f"{dataset_name}_e.jsonl" if use_e else f"{dataset_name}.jsonl" - - filepath = os.path.join(data_dir, filename) - - if not os.path.exists(filepath): - raise FileNotFoundError(f"Dataset file not found: {filepath}") - - # Load JSONL file - samples = [] - with open(filepath, encoding="utf-8") as f: - for line in f: - if line.strip(): - samples.append(json.loads(line)) - - return samples - - -def process_dataset( - dataset_name, frame, version, top_k=20, num_workers=10, max_samples=None, use_e=False -): - """Process a single dataset: search for all samples.""" - print(f"\n{'=' * 80}") - print(f"🔍 [SEARCHING DATASET: {dataset_name.upper()}]".center(80)) - print(f"{'=' * 80}\n") - - # Load dataset from local files - try: - dataset = load_dataset_from_local(dataset_name, use_e) - print(f"Loaded {len(dataset)} samples from {dataset_name}") - except FileNotFoundError as e: - print(f"❌ Error loading dataset {dataset_name}: {e}") - return [] - except Exception as e: - print(f"❌ Error loading dataset {dataset_name}: {e}") - return [] - - # Limit samples if specified - if max_samples: - dataset = dataset[:max_samples] - print(f"Limited to {len(dataset)} samples") - - # Initialize client - client = None - if frame == "mem0" or frame == "mem0_graph": - from utils.client import Mem0Client - - client = Mem0Client(enable_graph="graph" in frame) - elif frame == "memos-api": - from utils.client import MemosApiClient - - client = MemosApiClient() - elif frame == "memos-api-online": - from utils.client import MemosApiOnlineClient - - client = MemosApiOnlineClient() - elif frame == "memobase": - from utils.client import MemobaseClient - - client = MemobaseClient() - elif frame == "memu": - from utils.client import MemuClient - - client = MemuClient() - elif frame == "supermemory": - from utils.client import SupermemoryClient - - client = SupermemoryClient() - else: - print(f"❌ Unsupported frame: {frame}") - return [] - - # Process samples - search_results = [] - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [] - for idx, sample in enumerate(dataset): - future = executor.submit( - process_sample, client, sample, dataset_name, idx, frame, version, top_k - ) - futures.append(future) - - for future in tqdm( - as_completed(futures), - total=len(futures), - desc=f"Searching {dataset_name}", - ): - result = future.result() - if result: - search_results.append(result) - - print(f"\n✅ Completed searching {dataset_name}: {len(search_results)} samples") - return search_results - - -def main( - frame, version="default", num_workers=10, top_k=20, datasets=None, max_samples=None, use_e=False -): - """Main search function.""" - load_dotenv() - - print("\n" + "=" * 80) - print(f"🚀 LONGBENCH SEARCH - {frame.upper()} v{version}".center(80)) - print("=" * 80 + "\n") - - # Determine which datasets to process - dataset_list = [d.strip() for d in datasets.split(",")] if datasets else LONGBENCH_DATASETS - - # Filter valid datasets - valid_datasets = [d for d in dataset_list if d in 
LONGBENCH_DATASETS] - if not valid_datasets: - print("❌ No valid datasets specified") - return - - print(f"Processing {len(valid_datasets)} datasets: {valid_datasets}\n") - - # Create output directory - os.makedirs(f"results/longbench/{frame}-{version}/", exist_ok=True) - - # Process each dataset - all_results = defaultdict(list) - for dataset_name in valid_datasets: - results = process_dataset( - dataset_name, frame, version, top_k, num_workers, max_samples, use_e - ) - all_results[dataset_name] = results - - # Save results - output_path = f"results/longbench/{frame}-{version}/{frame}_longbench_search_results.json" - with open(output_path, "w", encoding="utf-8") as f: - json.dump(dict(all_results), f, ensure_ascii=False, indent=2) - - print(f"\n{'=' * 80}") - print(f"✅ SEARCH COMPLETE: Results saved to {output_path}".center(80)) - print(f"{'=' * 80}\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--lib", - type=str, - choices=[ - "mem0", - "mem0_graph", - "memos-api", - "memos-api-online", - "memobase", - "memu", - "supermemory", - ], - default="memos-api", - ) - parser.add_argument( - "--version", - type=str, - default="default", - help="Version identifier for saving results", - ) - parser.add_argument( - "--workers", - type=int, - default=10, - help="Number of parallel workers", - ) - parser.add_argument( - "--top_k", - type=int, - default=20, - help="Number of results to retrieve in search queries", - ) - parser.add_argument( - "--datasets", - type=str, - default=None, - help="Comma-separated list of datasets to process (default: all)", - ) - parser.add_argument( - "--max_samples", - type=int, - default=None, - help="Maximum number of samples per dataset (default: all)", - ) - parser.add_argument( - "--e", - action="store_true", - help="Use LongBench-E variant (uniform length distribution)", - ) - args = parser.parse_args() - - main( - args.lib, - args.version, - args.workers, - args.top_k, - args.datasets, - args.max_samples, - args.e, - ) diff --git a/evaluation/scripts/longbench_v2/prepare_data.py b/evaluation/scripts/longbench_v2/prepare_data.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/memos/embedders/base.py b/src/memos/embedders/base.py index 22ef0d30..e46611d1 100644 --- a/src/memos/embedders/base.py +++ b/src/memos/embedders/base.py @@ -23,7 +23,7 @@ def _count_tokens_for_embedding(text: str) -> int: enc = tiktoken.encoding_for_model("gpt-4o-mini") except Exception: enc = tiktoken.get_encoding("cl100k_base") - return len(enc.encode(text or "")) + return len(enc.encode(text or "", disallowed_special=())) except Exception: # Heuristic fallback: zh chars ~1 token, others ~1 token per ~4 chars if not text: diff --git a/src/memos/mem_reader/simple_struct.py b/src/memos/mem_reader/simple_struct.py index f43ad01b..2dcf7584 100644 --- a/src/memos/mem_reader/simple_struct.py +++ b/src/memos/mem_reader/simple_struct.py @@ -89,7 +89,7 @@ def from_config(_config): _ENC = tiktoken.get_encoding("cl100k_base") def _count_tokens_text(s: str) -> int: - return len(_ENC.encode(s or "")) + return len(_ENC.encode(s or "", disallowed_special=())) except Exception: # Heuristic fallback: zh chars ~1 token, others ~1 token per ~4 chars def _count_tokens_text(s: str) -> int: diff --git a/src/memos/memories/textual/tree_text_memory/organize/manager.py b/src/memos/memories/textual/tree_text_memory/organize/manager.py index 3226f7ca..2a3bae94 100644 --- a/src/memos/memories/textual/tree_text_memory/organize/manager.py +++ 
b/src/memos/memories/textual/tree_text_memory/organize/manager.py @@ -92,9 +92,9 @@ def add( """ added_ids: list[str] = [] - with ContextThreadPoolExecutor(max_workers=200) as executor: + with ContextThreadPoolExecutor(max_workers=50) as executor: futures = {executor.submit(self._process_memory, m, user_name): m for m in memories} - for future in as_completed(futures, timeout=60): + for future in as_completed(futures, timeout=500): try: ids = future.result() added_ids.extend(ids) @@ -102,7 +102,7 @@ def add( logger.exception("Memory processing error: ", exc_info=e) if mode == "sync": - for mem_type in ["WorkingMemory", "LongTermMemory", "UserMemory"]: + for mem_type in ["WorkingMemory"]: try: self.graph_store.remove_oldest_memory( memory_type="WorkingMemory", From b375d51ff09bbe92bc89de990c614246020b1837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Mon, 8 Dec 2025 16:17:48 +0800 Subject: [PATCH 09/17] feat: enlarge polardb --- evaluation/scripts/long_bench-v2/__init__.py | 1 + .../long_bench-v2/longbench_v2_ingestion.py | 199 +++++++++++++++++ .../longbench_v2_ingestion_async.py | 158 ++++++++++++++ .../long_bench-v2/longbench_v2_metric.py | 142 ++++++++++++ .../long_bench-v2/longbench_v2_responses.py | 206 ++++++++++++++++++ .../long_bench-v2/longbench_v2_search.py | 192 ++++++++++++++++ src/memos/graph_dbs/polardb.py | 2 +- 7 files changed, 899 insertions(+), 1 deletion(-) create mode 100644 evaluation/scripts/long_bench-v2/__init__.py create mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py create mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py create mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_metric.py create mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_responses.py create mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_search.py diff --git a/evaluation/scripts/long_bench-v2/__init__.py b/evaluation/scripts/long_bench-v2/__init__.py new file mode 100644 index 00000000..786c0ce0 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/__init__.py @@ -0,0 +1 @@ +# LongBench v2 evaluation scripts diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py new file mode 100644 index 00000000..d84a63d9 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py @@ -0,0 +1,199 @@ +import argparse +import json +import os +import sys +import threading + +from concurrent.futures import ThreadPoolExecutor, as_completed + +from dotenv import load_dotenv +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +def ingest_sample( + client, sample, sample_idx, frame, version, success_records, record_file, file_lock +): + """Ingest a single LongBench v2 sample as memories.""" + # Skip if already processed + if str(sample_idx) in success_records: + return True + + user_id = f"longbench_v2_{sample_idx}_{version}" + conv_id = f"longbench_v2_{sample_idx}_{version}" + + # Get context and convert to messages + context = sample.get("context", "") + + # For memos, we ingest the context as document content + messages = [ + { + "type": "file", + "file": { + "file_data": context, + "file_id": str(sample_idx), + }, + } + ] + + if "memos-api" in frame: + try: + 
client.add(messages=messages, user_id=user_id, conv_id=conv_id, batch_size=1) + print(f"✅ [{frame}] Ingested sample {sample_idx}") + # Record successful ingestion (thread-safe) + with file_lock, open(record_file, "a") as f: + f.write(f"{sample_idx}\n") + f.flush() + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx}: {e}") + return False + + return False + + +def load_dataset_from_local(): + """Load LongBench v2 dataset from local JSON file.""" + data_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "data", + "long_bench_v2", + ) + + filepath = os.path.join(data_dir, "data.json") + + if not os.path.exists(filepath): + raise FileNotFoundError(f"Dataset file not found: {filepath}") + + # Load JSON file + with open(filepath, encoding="utf-8") as f: + samples = json.load(f) + + return samples + + +def main(frame, version="default", num_workers=10, max_samples=None): + """Main ingestion function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH V2 INGESTION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load dataset from local file + try: + dataset = load_dataset_from_local() + print(f"Loaded {len(dataset)} samples from LongBench v2") + except FileNotFoundError as e: + print(f"❌ Error loading dataset: {e}") + return + except Exception as e: + print(f"❌ Error loading dataset: {e}") + return + + # Limit samples if specified + if max_samples: + dataset = dataset[:max_samples] + print(f"Limited to {len(dataset)} samples") + + # Initialize checkpoint file for resume functionality + checkpoint_dir = os.path.join( + ROOT_DIR, "evaluation", "results", "longbench_v2", f"{frame}-{version}" + ) + os.makedirs(checkpoint_dir, exist_ok=True) + record_file = os.path.join(checkpoint_dir, "success_records.txt") + + # Load existing success records for resume + success_records = set() + if os.path.exists(record_file): + with open(record_file) as f: + for line in f: + line = line.strip() + if line: + success_records.add(line) + print(f"📋 Found {len(success_records)} already processed samples (resume mode)") + else: + print("📋 Starting fresh ingestion (no checkpoint found)") + + # Initialize client + client = None + if frame == "memos-api": + from utils.client import MemosApiClient + + client = MemosApiClient() + else: + print(f"❌ Unsupported frame: {frame}") + return + + # Ingest samples + success_count = 0 # Already-ingested samples return True from ingest_sample and are counted as their futures complete + file_lock = threading.Lock() # Lock for thread-safe file writing + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for idx, sample in enumerate(dataset): + future = executor.submit( + ingest_sample, + client, + sample, + idx, + frame, + version, + success_records, + record_file, + file_lock, + ) + futures.append(future) + + for future in tqdm( + as_completed(futures), + total=len(futures), + desc="Ingesting LongBench v2", + ): + try: + if future.result(): + success_count += 1 + except Exception as e: + print(f"Error processing sample: {e}") + + print(f"\n{'=' * 80}") + print(f"✅ INGESTION COMPLETE: {success_count}/{len(dataset)} samples ingested".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=["memos-api", "memos-api-online"], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="long-bench-v2-1208-1556", + help="Version identifier for 
saving results", + ) + parser.add_argument( + "--workers", + type=int, + default=20, + help="Number of parallel workers", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples to process (default: all)", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.workers, args.max_samples) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py new file mode 100644 index 00000000..c23d7885 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py @@ -0,0 +1,158 @@ +import argparse +import json +import os +import sys + +from concurrent.futures import ThreadPoolExecutor, as_completed + +from dotenv import load_dotenv +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +def ingest_sample(client, sample, sample_idx, frame, version): + """Ingest a single LongBench v2 sample as memories.""" + user_id = f"longbench_v2_{sample_idx}_{version}" + conv_id = f"longbench_v2_{sample_idx}_{version}" + + # Get context and convert to messages + context = sample.get("context", "") + + # For memos, we ingest the context as document content + messages = [ + { + "type": "file", + "file": { + "file_data": context, + "file_id": str(sample_idx), + }, + } + ] + + if "memos-api" in frame: + try: + client.add(messages=messages, user_id=user_id, conv_id=conv_id, batch_size=1) + print(f"✅ [{frame}] Ingested sample {sample_idx}") + return True + except Exception as e: + print(f"❌ [{frame}] Error ingesting sample {sample_idx}: {e}") + return False + + return False + + +def load_dataset_from_local(): + """Load LongBench v2 dataset from local JSON file.""" + data_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "data", + "long_bench_v2", + ) + + filepath = os.path.join(data_dir, "data.json") + + if not os.path.exists(filepath): + raise FileNotFoundError(f"Dataset file not found: {filepath}") + + # Load JSON file + with open(filepath, encoding="utf-8") as f: + samples = json.load(f) + + return samples + + +def main(frame, version="default", num_workers=10, max_samples=None): + """Main ingestion function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH V2 INGESTION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load dataset from local file + try: + dataset = load_dataset_from_local() + print(f"Loaded {len(dataset)} samples from LongBench v2") + except FileNotFoundError as e: + print(f"❌ Error loading dataset: {e}") + return + except Exception as e: + print(f"❌ Error loading dataset: {e}") + return + + # Limit samples if specified + if max_samples: + dataset = dataset[:max_samples] + print(f"Limited to {len(dataset)} samples") + + # Initialize client + client = None + if frame == "memos-api": + from utils.client import MemosApiClient + + client = MemosApiClient() + else: + print(f"❌ Unsupported frame: {frame}") + return + + # Ingest samples + success_count = 0 + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for idx, sample in enumerate(dataset): + future = executor.submit(ingest_sample, client, sample, idx, frame, version) + futures.append(future) + + for future in tqdm( + 
as_completed(futures), + total=len(futures), + desc="Ingesting LongBench v2", + ): + try: + if future.result(): + success_count += 1 + except Exception as e: + print(f"Error processing sample: {e}") + + print(f"\n{'=' * 80}") + print(f"✅ INGESTION COMPLETE: {success_count}/{len(dataset)} samples ingested".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=["memos-api", "memos-api-online"], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="long-bench-v2-1208-1556-async", + help="Version identifier for saving results", + ) + parser.add_argument( + "--workers", + type=int, + default=20, + help="Number of parallel workers", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples to process (default: all)", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.workers, args.max_samples) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_metric.py b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py new file mode 100644 index 00000000..5fee9a3d --- /dev/null +++ b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py @@ -0,0 +1,142 @@ +import argparse +import json +import os + + +def calculate_accuracy(responses): + """Calculate accuracy metrics for LongBench v2.""" + total = len(responses) + if total == 0: + return {} + + # Overall accuracy + correct = sum(1 for r in responses if r.get("judge", False)) + overall_acc = round(100 * correct / total, 1) + + # By difficulty + easy_items = [r for r in responses if r.get("difficulty") == "easy"] + hard_items = [r for r in responses if r.get("difficulty") == "hard"] + easy_acc = ( + round(100 * sum(1 for r in easy_items if r.get("judge", False)) / len(easy_items), 1) + if easy_items + else 0.0 + ) + hard_acc = ( + round(100 * sum(1 for r in hard_items if r.get("judge", False)) / len(hard_items), 1) + if hard_items + else 0.0 + ) + + # By length + short_items = [r for r in responses if r.get("length") == "short"] + medium_items = [r for r in responses if r.get("length") == "medium"] + long_items = [r for r in responses if r.get("length") == "long"] + + short_acc = ( + round(100 * sum(1 for r in short_items if r.get("judge", False)) / len(short_items), 1) + if short_items + else 0.0 + ) + medium_acc = ( + round(100 * sum(1 for r in medium_items if r.get("judge", False)) / len(medium_items), 1) + if medium_items + else 0.0 + ) + long_acc = ( + round(100 * sum(1 for r in long_items if r.get("judge", False)) / len(long_items), 1) + if long_items + else 0.0 + ) + + # By domain + domain_stats = {} + for response in responses: + domain = response.get("domain", "Unknown") + if domain not in domain_stats: + domain_stats[domain] = {"total": 0, "correct": 0} + domain_stats[domain]["total"] += 1 + if response.get("judge", False): + domain_stats[domain]["correct"] += 1 + + domain_acc = { + domain: round(100 * stats["correct"] / stats["total"], 1) + for domain, stats in domain_stats.items() + } + + return { + "overall": overall_acc, + "easy": easy_acc, + "hard": hard_acc, + "short": short_acc, + "medium": medium_acc, + "long": long_acc, + "by_domain": domain_acc, + "total_samples": total, + "correct_samples": correct, + } + + +def main(frame, version="default"): + """Main metric calculation function.""" + print("\n" + "=" * 80) + print(f"📊 LONGBENCH V2 METRICS CALCULATION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") 
+ + # Load responses + responses_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json" + if not os.path.exists(responses_path): + print(f"❌ Responses not found: {responses_path}") + print("Please run longbench_v2_responses.py first") + return + + with open(responses_path, encoding="utf-8") as f: + responses = json.load(f) + + # Calculate metrics + metrics = calculate_accuracy(responses) + + # Save metrics + output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_metrics.json" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(metrics, f, ensure_ascii=False, indent=4) + + print(f"\n{'=' * 80}") + print(f"✅ METRICS CALCULATION COMPLETE: Results saved to {output_path}".center(80)) + print(f"{'=' * 80}\n") + + # Print summary table + print("\n📊 Summary of Results:") + print("-" * 80) + print(f"{'Overall Accuracy':<30s}: {metrics['overall']:.1f}%") + print(f"{'Easy':<30s}: {metrics['easy']:.1f}%") + print(f"{'Hard':<30s}: {metrics['hard']:.1f}%") + print(f"{'Short':<30s}: {metrics['short']:.1f}%") + print(f"{'Medium':<30s}: {metrics['medium']:.1f}%") + print(f"{'Long':<30s}: {metrics['long']:.1f}%") + print("\nBy Domain:") + for domain, acc in metrics["by_domain"].items(): + print(f" {domain:<28s}: {acc:.1f}%") + print(f"\nTotal Samples: {metrics['total_samples']}") + print(f"Correct: {metrics['correct_samples']}") + print("-" * 80) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=["memos-api", "memos-api-online"], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for loading results", + ) + args = parser.parse_args() + + main(args.lib, args.version) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_responses.py b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py new file mode 100644 index 00000000..3e19dc95 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py @@ -0,0 +1,206 @@ +import argparse +import json +import os +import re +import sys + +from concurrent.futures import ThreadPoolExecutor, as_completed +from time import time + +from dotenv import load_dotenv +from openai import OpenAI +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +# Prompt template from LongBench v2 +LONGBENCH_V2_PROMPT = """Please read the following text and answer the question below. 
+ + +{context} + + +What is the correct answer to this question: {question} +Choices: +(A) {choice_A} +(B) {choice_B} +(C) {choice_C} +(D) {choice_D} + +Format your response as follows: "The correct answer is (insert answer here)".""" + + +def extract_answer(response): + """Extract answer from response (A, B, C, or D).""" + response = response.replace("*", "") + # Try to find "The correct answer is (X)" pattern + match = re.search(r"The correct answer is \(([A-D])\)", response, re.IGNORECASE) + if match: + return match.group(1).upper() + else: + match = re.search(r"The correct answer is ([A-D])", response, re.IGNORECASE) + if match: + return match.group(1).upper() + else: + # Try to find standalone A, B, C, or D + match = re.search(r"\b([A-D])\b", response) + if match: + return match.group(1).upper() + return None + + +def generate_response(llm_client, context, question, choice_a, choice_b, choice_c, choice_d): + """Generate response using LLM.""" + prompt = LONGBENCH_V2_PROMPT.format( + context=context, + question=question, + choice_A=choice_a, + choice_B=choice_b, + choice_C=choice_c, + choice_D=choice_d, + ) + + try: + response = llm_client.chat.completions.create( + model=os.getenv("CHAT_MODEL"), + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt}, + ], + temperature=0.1, + max_tokens=128, + ) + result = response.choices[0].message.content or "" + return result + except Exception as e: + print(f"Error generating response: {e}") + return "" + + +def process_sample(search_result, llm_client): + """Process a single sample: generate answer.""" + start = time() + + context = search_result.get("context", "") + question = search_result.get("question", "") + choice_a = search_result.get("choice_A", "") + choice_b = search_result.get("choice_B", "") + choice_c = search_result.get("choice_C", "") + choice_d = search_result.get("choice_D", "") + + # Generate answer + response = generate_response( + llm_client, context, question, choice_a, choice_b, choice_c, choice_d + ) + + # Extract answer (A, B, C, or D) + pred = extract_answer(response) + + response_duration_ms = (time() - start) * 1000 + + return { + "sample_idx": search_result.get("sample_idx"), + "_id": search_result.get("_id"), + "domain": search_result.get("domain"), + "sub_domain": search_result.get("sub_domain"), + "difficulty": search_result.get("difficulty"), + "length": search_result.get("length"), + "question": question, + "choice_A": choice_a, + "choice_B": choice_b, + "choice_C": choice_c, + "choice_D": choice_d, + "answer": search_result.get("answer"), + "pred": pred, + "response": response, + "judge": pred == search_result.get("answer") if pred else False, + "search_context": context, + "response_duration_ms": response_duration_ms, + "search_duration_ms": search_result.get("search_duration_ms", 0), + } + + +def main(frame, version="default", num_workers=10): + """Main response generation function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH V2 RESPONSE GENERATION - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load search results + search_path = ( + f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_search_results.json" + ) + if not os.path.exists(search_path): + print(f"❌ Search results not found: {search_path}") + print("Please run longbench_v2_search.py first") + return + + with open(search_path, encoding="utf-8") as f: + search_results = json.load(f) + + # Initialize LLM client + llm_client = OpenAI( 
+ api_key=os.getenv("CHAT_MODEL_API_KEY"), + base_url=os.getenv("CHAT_MODEL_BASE_URL"), + ) + print(f"🔌 Using OpenAI client with model: {os.getenv('CHAT_MODEL')}") + + # Process all samples + all_responses = [] + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(process_sample, sample, llm_client) for sample in search_results] + + for future in tqdm( + as_completed(futures), + total=len(futures), + desc="Generating responses", + ): + result = future.result() + if result: + all_responses.append(result) + + # Save responses + output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json" + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(all_responses, f, ensure_ascii=False, indent=2) + + print(f"\n{'=' * 80}") + print(f"✅ RESPONSE GENERATION COMPLETE: Results saved to {output_path}".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=["memos-api", "memos-api-online"], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for loading results", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.workers) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_search.py b/evaluation/scripts/long_bench-v2/longbench_v2_search.py new file mode 100644 index 00000000..f4692849 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/longbench_v2_search.py @@ -0,0 +1,192 @@ +import argparse +import json +import os +import sys + +from concurrent.futures import ThreadPoolExecutor, as_completed +from time import time + +from dotenv import load_dotenv +from tqdm import tqdm + + +ROOT_DIR = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) +EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") + +sys.path.insert(0, ROOT_DIR) +sys.path.insert(0, EVAL_SCRIPTS_DIR) + + +def memos_api_search(client, query, user_id, top_k, frame): + """Search using memos API.""" + start = time() + search_results = client.search(query=query, user_id=user_id, top_k=top_k) + + # Format context from search results based on frame type + context = "" + if ( + (frame == "memos-api" or frame == "memos-api-online") + and isinstance(search_results, dict) + and "text_mem" in search_results + ): + context = "\n".join([i["memory"] for i in search_results["text_mem"][0]["memories"]]) + if "pref_string" in search_results: + context += f"\n{search_results.get('pref_string', '')}" + + duration_ms = (time() - start) * 1000 + return context, duration_ms + + +def process_sample(client, sample, sample_idx, frame, version, top_k): + """Process a single sample: search for relevant memories.""" + user_id = f"longbench_v2_{sample_idx}_{version}" + query = sample.get("question", "") + + if not query: + return None + + context, duration_ms = memos_api_search(client, query, user_id, top_k, frame) + + return { + "sample_idx": sample_idx, + "_id": sample.get("_id"), + "domain": sample.get("domain"), + "sub_domain": sample.get("sub_domain"), + "difficulty": sample.get("difficulty"), + "length": sample.get("length"), + "question": query, + "choice_A": sample.get("choice_A"), + "choice_B": sample.get("choice_B"), + "choice_C": sample.get("choice_C"), + 
"choice_D": sample.get("choice_D"), + "answer": sample.get("answer"), + "context": context, + "search_duration_ms": duration_ms, + } + + +def load_dataset_from_local(): + """Load LongBench v2 dataset from local JSON file.""" + data_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "data", + "long_bench_v2", + ) + + filepath = os.path.join(data_dir, "data.json") + + if not os.path.exists(filepath): + raise FileNotFoundError(f"Dataset file not found: {filepath}") + + # Load JSON file + with open(filepath, encoding="utf-8") as f: + samples = json.load(f) + + return samples + + +def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): + """Main search function.""" + load_dotenv() + + print("\n" + "=" * 80) + print(f"🚀 LONGBENCH V2 SEARCH - {frame.upper()} v{version}".center(80)) + print("=" * 80 + "\n") + + # Load dataset from local file + try: + dataset = load_dataset_from_local() + print(f"Loaded {len(dataset)} samples from LongBench v2") + except FileNotFoundError as e: + print(f"❌ Error loading dataset: {e}") + return + except Exception as e: + print(f"❌ Error loading dataset: {e}") + return + + # Limit samples if specified + if max_samples: + dataset = dataset[:max_samples] + print(f"Limited to {len(dataset)} samples") + + # Initialize client + client = None + if frame == "memos-api": + from utils.client import MemosApiClient + + client = MemosApiClient() + elif frame == "memos-api-online": + from utils.client import MemosApiOnlineClient + + client = MemosApiOnlineClient() + else: + print(f"❌ Unsupported frame: {frame}") + return + + # Process samples + search_results = [] + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [] + for idx, sample in enumerate(dataset): + future = executor.submit(process_sample, client, sample, idx, frame, version, top_k) + futures.append(future) + + for future in tqdm( + as_completed(futures), + total=len(futures), + desc="Searching LongBench v2", + ): + result = future.result() + if result: + search_results.append(result) + + # Save results + os.makedirs(f"results/long_bench-v2/{frame}-{version}/", exist_ok=True) + output_path = ( + f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_search_results.json" + ) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(search_results, f, ensure_ascii=False, indent=2) + + print(f"\n{'=' * 80}") + print(f"✅ SEARCH COMPLETE: Results saved to {output_path}".center(80)) + print(f"{'=' * 80}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--lib", + type=str, + choices=["memos-api", "memos-api-online"], + default="memos-api", + ) + parser.add_argument( + "--version", + type=str, + default="default", + help="Version identifier for saving results", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers", + ) + parser.add_argument( + "--top_k", + type=int, + default=20, + help="Number of results to retrieve in search queries", + ) + parser.add_argument( + "--max_samples", + type=int, + default=None, + help="Maximum number of samples to process (default: all)", + ) + args = parser.parse_args() + + main(args.lib, args.version, args.workers, args.top_k, args.max_samples) diff --git a/src/memos/graph_dbs/polardb.py b/src/memos/graph_dbs/polardb.py index ddcbfe28..85e5d14f 100644 --- a/src/memos/graph_dbs/polardb.py +++ b/src/memos/graph_dbs/polardb.py @@ -151,7 +151,7 @@ def __init__(self, config: 
PolarDBGraphDBConfig):
         # Create connection pool
         self.connection_pool = psycopg2.pool.ThreadedConnectionPool(
             minconn=5,
-            maxconn=100,
+            maxconn=2000,
             host=host,
             port=port,
             user=user,

From 69dd3a8bc9695cf0d1deda329f100cdba1f1a718 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Mon, 8 Dec 2025 20:20:40 +0800
Subject: [PATCH 10/17] feat: decrease parallelism

---
 src/memos/memories/textual/tree_text_memory/organize/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/memos/memories/textual/tree_text_memory/organize/manager.py b/src/memos/memories/textual/tree_text_memory/organize/manager.py
index 2a3bae94..470d2c48 100644
--- a/src/memos/memories/textual/tree_text_memory/organize/manager.py
+++ b/src/memos/memories/textual/tree_text_memory/organize/manager.py
@@ -92,7 +92,7 @@ def add(
         """
         added_ids: list[str] = []

-        with ContextThreadPoolExecutor(max_workers=50) as executor:
+        with ContextThreadPoolExecutor(max_workers=10) as executor:
             futures = {executor.submit(self._process_memory, m, user_name): m for m in memories}
             for future in as_completed(futures, timeout=500):
                 try:

From ac38046ff22e96c6d84265236434fc9befdc4244 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Mon, 8 Dec 2025 20:28:04 +0800
Subject: [PATCH 11/17] feat: add image parsing in file parser

---
 .../read_multi_modal/file_content_parser.py   | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py
index cce99e76..8edcbfe5 100644
--- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py
+++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py
@@ -2,6 +2,7 @@

 import concurrent.futures
 import os
+import re
 import tempfile

 from typing import Any
@@ -13,6 +14,7 @@
 from memos.llms.base import BaseLLM
 from memos.log import get_logger
 from memos.mem_reader.read_multi_modal.base import BaseMessageParser, _derive_key
+from memos.mem_reader.read_multi_modal.image_parser import ImageParser
 from memos.mem_reader.read_multi_modal.utils import (
     detect_lang,
     get_parser,
@@ -129,6 +131,91 @@ def _handle_local(self, data: str) -> str:
         logger.info("[FileContentParser] Local file paths are not supported in fine mode.")
         return ""

+    def _extract_and_process_images(self, text: str, info: dict[str, Any], **kwargs) -> str:
+        """
+        Extract all images from markdown text and process them using ImageParser.
+        Replaces image references with extracted text content.
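+
+        Illustrative sketch (the URL and info dict below are made up):
+            text = "Intro ![diagram](https://example.com/d.png) outro"
+            text = self._extract_and_process_images(text, {"user_id": "u1"})
+            # -> the reference is replaced by an "[Image Content from ...]" block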
+ + Args: + text: Markdown text containing image references + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters for ImageParser + + Returns: + Text with image references replaced by extracted content + """ + if not text or not self.image_parser: + return text + + # Pattern to match markdown images: ![](url) or ![alt](url) + image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)" + + # Find all image matches first + image_matches = list(re.finditer(image_pattern, text)) + if not image_matches: + return text + + logger.info(f"[FileContentParser] Found {len(image_matches)} images to process") + + # Process images and build replacement map + replacements = {} + for idx, match in enumerate(image_matches, 1): + image_url = match.group(2) + + try: + # Construct image message format for ImageParser + image_message = { + "type": "image_url", + "image_url": { + "url": image_url, + "detail": "auto", + }, + } + + # Process image using ImageParser + logger.info( + f"[FileContentParser] Processing image {idx}/{len(image_matches)}: {image_url}" + ) + memory_items = self.image_parser.parse_fine(image_message, info, **kwargs) + + # Extract text content from memory items (only strings as requested) + extracted_texts = [] + for item in memory_items: + if hasattr(item, "memory") and item.memory: + extracted_texts.append(str(item.memory)) + + if extracted_texts: + # Combine all extracted texts + extracted_content = "\n".join(extracted_texts) + # Replace image with extracted content + replacements[match.group(0)] = ( + f"\n[Image Content from {image_url}]:\n{extracted_content}\n" + ) + else: + # If no content extracted, keep original with a note + logger.warning( + f"[FileContentParser] No content extracted from image: {image_url}" + ) + replacements[match.group(0)] = ( + f"\n[Image: {image_url} - No content extracted]\n" + ) + + except Exception as e: + logger.error(f"[FileContentParser] Error processing image {image_url}: {e}") + # On error, keep original image reference + replacements[match.group(0)] = match.group(0) + + # Replace all images in the text + processed_text = text + for original, replacement in replacements.items(): + processed_text = processed_text.replace(original, replacement, 1) + + logger.info( + f"[FileContentParser] Processed {len(image_matches)} images, " + f"extracted content for {sum(1 for r in replacements.values() if 'Image Content' in r)} images" + ) + return processed_text + def __init__( self, embedder: BaseEmbedder, @@ -149,6 +236,8 @@ def __init__( """ super().__init__(embedder, llm) self.parser = parser + # Initialize ImageParser for processing images in markdown + self.image_parser = ImageParser(embedder, llm) if llm else None # Get inner markdown hostnames from config or environment if direct_markdown_hostnames is not None: @@ -519,6 +608,10 @@ def parse_fine( f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}" ) + # Extract and process images from parsed_text + if is_markdown and parsed_text and self.image_parser: + parsed_text = self._extract_and_process_images(parsed_text, info, **kwargs) + # Extract info fields if not info: info = {} From 37bcc904e4b79932f63bc60e5d665acc5e062ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Mon, 8 Dec 2025 20:42:20 +0800 Subject: [PATCH 12/17] feat: update file_content_parser --- .../read_multi_modal/file_content_parser.py | 142 ++++++++++++------ 1 file changed, 94 insertions(+), 48 deletions(-) diff --git 
a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 972758c4..408736d2 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -131,9 +131,65 @@ def _handle_local(self, data: str) -> str: logger.info("[FileContentParser] Local file paths are not supported in fine mode.") return "" + def _process_single_image( + self, image_url: str, original_ref: str, info: dict[str, Any], **kwargs + ) -> tuple[str, str]: + """ + Process a single image and return (original_ref, replacement_text). + + Args: + image_url: URL of the image to process + original_ref: Original markdown image reference to replace + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters for ImageParser + + Returns: + Tuple of (original_ref, replacement_text) + """ + try: + # Construct image message format for ImageParser + image_message = { + "type": "image_url", + "image_url": { + "url": image_url, + "detail": "auto", + }, + } + + # Process image using ImageParser + logger.debug(f"[FileContentParser] Processing image: {image_url}") + memory_items = self.image_parser.parse_fine(image_message, info, **kwargs) + + # Extract text content from memory items (only strings as requested) + extracted_texts = [] + for item in memory_items: + if hasattr(item, "memory") and item.memory: + extracted_texts.append(str(item.memory)) + + if extracted_texts: + # Combine all extracted texts + extracted_content = "\n".join(extracted_texts) + # Replace image with extracted content + return ( + original_ref, + f"\n[Image Content from {image_url}]:\n{extracted_content}\n", + ) + else: + # If no content extracted, keep original with a note + logger.warning(f"[FileContentParser] No content extracted from image: {image_url}") + return ( + original_ref, + f"\n[Image: {image_url} - No content extracted]\n", + ) + + except Exception as e: + logger.error(f"[FileContentParser] Error processing image {image_url}: {e}") + # On error, keep original image reference + return (original_ref, original_ref) + def _extract_and_process_images(self, text: str, info: dict[str, Any], **kwargs) -> str: """ - Extract all images from markdown text and process them using ImageParser. + Extract all images from markdown text and process them using ImageParser in parallel. Replaces image references with extracted text content. 
Args: @@ -155,64 +211,54 @@ def _extract_and_process_images(self, text: str, info: dict[str, Any], **kwargs) if not image_matches: return text - logger.info(f"[FileContentParser] Found {len(image_matches)} images to process") + logger.info(f"[FileContentParser] Found {len(image_matches)} images to process in parallel") - # Process images and build replacement map - replacements = {} - for idx, match in enumerate(image_matches, 1): + # Prepare tasks for parallel processing + tasks = [] + for match in image_matches: image_url = match.group(2) + original_ref = match.group(0) + tasks.append((image_url, original_ref)) - try: - # Construct image message format for ImageParser - image_message = { - "type": "image_url", - "image_url": { - "url": image_url, - "detail": "auto", - }, - } - - # Process image using ImageParser - logger.info( - f"[FileContentParser] Processing image {idx}/{len(image_matches)}: {image_url}" - ) - memory_items = self.image_parser.parse_fine(image_message, info, **kwargs) - - # Extract text content from memory items (only strings as requested) - extracted_texts = [] - for item in memory_items: - if hasattr(item, "memory") and item.memory: - extracted_texts.append(str(item.memory)) - - if extracted_texts: - # Combine all extracted texts - extracted_content = "\n".join(extracted_texts) - # Replace image with extracted content - replacements[match.group(0)] = ( - f"\n[Image Content from {image_url}]:\n{extracted_content}\n" - ) - else: - # If no content extracted, keep original with a note - logger.warning( - f"[FileContentParser] No content extracted from image: {image_url}" - ) - replacements[match.group(0)] = ( - f"\n[Image: {image_url} - No content extracted]\n" - ) + # Process images in parallel + replacements = {} + max_workers = min(len(tasks), 10) # Limit concurrent image processing - except Exception as e: - logger.error(f"[FileContentParser] Error processing image {image_url}: {e}") - # On error, keep original image reference - replacements[match.group(0)] = match.group(0) + with ContextThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + self._process_single_image, image_url, original_ref, info, **kwargs + ): (image_url, original_ref) + for image_url, original_ref in tasks + } + + # Collect results with progress tracking + for future in tqdm( + concurrent.futures.as_completed(futures), + total=len(futures), + desc="[FileContentParser] Processing images", + ): + try: + original_ref, replacement = future.result() + replacements[original_ref] = replacement + except Exception as e: + image_url, original_ref = futures[future] + logger.error(f"[FileContentParser] Future failed for image {image_url}: {e}") + # On error, keep original image reference + replacements[original_ref] = original_ref # Replace all images in the text processed_text = text for original, replacement in replacements.items(): processed_text = processed_text.replace(original, replacement, 1) + # Count successfully extracted images + success_count = sum( + 1 for replacement in replacements.values() if "Image Content from" in replacement + ) logger.info( - f"[FileContentParser] Processed {len(image_matches)} images, " - f"extracted content for {sum(1 for r in replacements.values() if 'Image Content' in r)} images" + f"[FileContentParser] Processed {len(image_matches)} images in parallel, " + f"extracted content for {success_count} images" ) return processed_text From 20af5d0204af0feb1dc88e2d814bcc7e75541f44 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 9 Dec 2025 12:34:09 +0800 Subject: [PATCH 13/17] feat: modify long_bench_v2 --- .../long_bench-v2/longbench_v2_ingestion.py | 4 +- .../long_bench-v2/longbench_v2_metric.py | 5 +- .../long_bench-v2/longbench_v2_responses.py | 85 ++++++++++++-- .../long_bench-v2/longbench_v2_search.py | 92 ++++++++++++--- .../scripts/long_bench-v2/wait_scheduler.py | 67 +++++++++++ evaluation/scripts/run_longbench_v2_eval.sh | 110 ++++++++++++++++++ 6 files changed, 334 insertions(+), 29 deletions(-) create mode 100644 evaluation/scripts/long_bench-v2/wait_scheduler.py create mode 100755 evaluation/scripts/run_longbench_v2_eval.sh diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py index d84a63d9..72a02397 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py @@ -179,13 +179,13 @@ def main(frame, version="default", num_workers=10, max_samples=None): parser.add_argument( "--version", type=str, - default="long-bench-v2-1208-1556", + default="default", help="Version identifier for saving results", ) parser.add_argument( "--workers", type=int, - default=20, + default=3, help="Number of parallel workers", ) parser.add_argument( diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_metric.py b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py index 5fee9a3d..6489dc40 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_metric.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py @@ -92,8 +92,11 @@ def main(frame, version="default"): with open(responses_path, encoding="utf-8") as f: responses = json.load(f) + # Only keep entries with non-empty context (search_context) to align with response generation + filtered = [r for r in responses if str(r.get("search_context", "")).strip() != ""] + # Calculate metrics - metrics = calculate_accuracy(responses) + metrics = calculate_accuracy(filtered) # Save metrics output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_metrics.json" diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_responses.py b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py index 3e19dc95..8c443943 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_responses.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py @@ -3,6 +3,7 @@ import os import re import sys +import threading from concurrent.futures import ThreadPoolExecutor, as_completed from time import time @@ -85,8 +86,13 @@ def generate_response(llm_client, context, question, choice_a, choice_b, choice_ return "" -def process_sample(search_result, llm_client): +def process_sample(search_result, llm_client, success_records, record_file, file_lock): """Process a single sample: generate answer.""" + sample_idx = search_result.get("sample_idx") + # Skip if already processed + if sample_idx is not None and str(sample_idx) in success_records: + return None + start = time() context = search_result.get("context", "") @@ -96,6 +102,10 @@ def process_sample(search_result, llm_client): choice_c = search_result.get("choice_C", "") choice_d = search_result.get("choice_D", "") + # Skip empty/placeholder contexts (e.g., "\n" or whitespace-only) + if not context or context.strip() == "": + return None + # Generate answer response = generate_response( llm_client, context, question, choice_a, choice_b, choice_c, choice_d @@ -106,7 +116,7 @@ def 
process_sample(search_result, llm_client): response_duration_ms = (time() - start) * 1000 - return { + result = { "sample_idx": search_result.get("sample_idx"), "_id": search_result.get("_id"), "domain": search_result.get("domain"), @@ -123,10 +133,20 @@ def process_sample(search_result, llm_client): "response": response, "judge": pred == search_result.get("answer") if pred else False, "search_context": context, + # Preserve full search results payload (e.g., list of memories) + "search_results": search_result.get("search_results"), "response_duration_ms": response_duration_ms, "search_duration_ms": search_result.get("search_duration_ms", 0), } + # Record successful processing (thread-safe) + if sample_idx is not None: + with file_lock, open(record_file, "a") as f: + f.write(f"{sample_idx}\n") + f.flush() + + return result + def main(frame, version="default", num_workers=10): """Main response generation function.""" @@ -136,10 +156,16 @@ def main(frame, version="default", num_workers=10): print(f"🚀 LONGBENCH V2 RESPONSE GENERATION - {frame.upper()} v{version}".center(80)) print("=" * 80 + "\n") - # Load search results - search_path = ( - f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_search_results.json" + # Initialize checkpoint file for resume functionality + checkpoint_dir = os.path.join( + ROOT_DIR, "evaluation", "results", "long_bench-v2", f"{frame}-{version}" ) + os.makedirs(checkpoint_dir, exist_ok=True) + record_file = os.path.join(checkpoint_dir, "response_success_records.txt") + search_path = os.path.join(checkpoint_dir, f"{frame}_longbench_v2_search_results.json") + output_path = os.path.join(checkpoint_dir, f"{frame}_longbench_v2_responses.json") + + # Load search results if not os.path.exists(search_path): print(f"❌ Search results not found: {search_path}") print("Please run longbench_v2_search.py first") @@ -148,6 +174,30 @@ def main(frame, version="default", num_workers=10): with open(search_path, encoding="utf-8") as f: search_results = json.load(f) + # Load existing results and success records for resume + existing_results = {} + success_records = set() + if os.path.exists(output_path): + with open(output_path, encoding="utf-8") as f: + existing_results_list = json.load(f) + for result in existing_results_list: + sample_idx = result.get("sample_idx") + if sample_idx is not None: + existing_results[sample_idx] = result + success_records.add(str(sample_idx)) + print(f"📋 Found {len(existing_results)} existing responses (resume mode)") + else: + print("📋 Starting fresh response generation (no checkpoint found)") + + # Load additional success records from checkpoint file + if os.path.exists(record_file): + with open(record_file) as f: + for line in f: + line = line.strip() + if line and line not in success_records: + success_records.add(line) + print(f"📋 Total {len(success_records)} samples already processed") + # Initialize LLM client llm_client = OpenAI( api_key=os.getenv("CHAT_MODEL_API_KEY"), @@ -156,9 +206,15 @@ def main(frame, version="default", num_workers=10): print(f"🔌 Using OpenAI client with model: {os.getenv('CHAT_MODEL')}") # Process all samples - all_responses = [] + new_results = [] + file_lock = threading.Lock() # Lock for thread-safe file writing with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [executor.submit(process_sample, sample, llm_client) for sample in search_results] + futures = [ + executor.submit( + process_sample, sample, llm_client, success_records, record_file, file_lock + ) + for sample in search_results + 
] for future in tqdm( as_completed(futures), @@ -167,11 +223,16 @@ def main(frame, version="default", num_workers=10): ): result = future.result() if result: - all_responses.append(result) - - # Save responses - output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json" - os.makedirs(os.path.dirname(output_path), exist_ok=True) + new_results.append(result) + # Update existing results with new result + sample_idx = result.get("sample_idx") + if sample_idx is not None: + existing_results[sample_idx] = result + + # Merge and save all results + all_responses = list(existing_results.values()) + # Sort by sample_idx to maintain order + all_responses.sort(key=lambda x: x.get("sample_idx", 0)) with open(output_path, "w", encoding="utf-8") as f: json.dump(all_responses, f, ensure_ascii=False, indent=2) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_search.py b/evaluation/scripts/long_bench-v2/longbench_v2_search.py index f4692849..686ff0ba 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_search.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_search.py @@ -2,6 +2,7 @@ import json import os import sys +import threading from concurrent.futures import ThreadPoolExecutor, as_completed from time import time @@ -24,7 +25,7 @@ def memos_api_search(client, query, user_id, top_k, frame): start = time() search_results = client.search(query=query, user_id=user_id, top_k=top_k) - # Format context from search results based on frame type + # Format context from search results based on frame type for backward compatibility context = "" if ( (frame == "memos-api" or frame == "memos-api-online") @@ -36,20 +37,26 @@ def memos_api_search(client, query, user_id, top_k, frame): context += f"\n{search_results.get('pref_string', '')}" duration_ms = (time() - start) * 1000 - return context, duration_ms + return context, duration_ms, search_results -def process_sample(client, sample, sample_idx, frame, version, top_k): +def process_sample( + client, sample, sample_idx, frame, version, top_k, success_records, record_file, file_lock +): """Process a single sample: search for relevant memories.""" + # Skip if already processed + if str(sample_idx) in success_records: + return None + user_id = f"longbench_v2_{sample_idx}_{version}" query = sample.get("question", "") if not query: return None - context, duration_ms = memos_api_search(client, query, user_id, top_k, frame) + context, duration_ms, search_results = memos_api_search(client, query, user_id, top_k, frame) - return { + result = { "sample_idx": sample_idx, "_id": sample.get("_id"), "domain": sample.get("domain"), @@ -63,9 +70,18 @@ def process_sample(client, sample, sample_idx, frame, version, top_k): "choice_D": sample.get("choice_D"), "answer": sample.get("answer"), "context": context, + # Preserve full search results instead of only the concatenated context + "search_results": search_results, "search_duration_ms": duration_ms, } + # Record successful processing (thread-safe) + with file_lock, open(record_file, "a") as f: + f.write(f"{sample_idx}\n") + f.flush() + + return result + def load_dataset_from_local(): """Load LongBench v2 dataset from local JSON file.""" @@ -111,6 +127,38 @@ def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): dataset = dataset[:max_samples] print(f"Limited to {len(dataset)} samples") + # Initialize checkpoint file for resume functionality + checkpoint_dir = os.path.join( + ROOT_DIR, "evaluation", "results", "long_bench-v2", f"{frame}-{version}" + ) + 
os.makedirs(checkpoint_dir, exist_ok=True) + record_file = os.path.join(checkpoint_dir, "search_success_records.txt") + output_path = os.path.join(checkpoint_dir, f"{frame}_longbench_v2_search_results.json") + + # Load existing results and success records for resume + existing_results = {} + success_records = set() + if os.path.exists(output_path): + with open(output_path, encoding="utf-8") as f: + existing_results_list = json.load(f) + for result in existing_results_list: + sample_idx = result.get("sample_idx") + if sample_idx is not None: + existing_results[sample_idx] = result + success_records.add(str(sample_idx)) + print(f"📋 Found {len(existing_results)} existing search results (resume mode)") + else: + print("📋 Starting fresh search (no checkpoint found)") + + # Load additional success records from checkpoint file + if os.path.exists(record_file): + with open(record_file) as f: + for line in f: + line = line.strip() + if line and line not in success_records: + success_records.add(line) + print(f"📋 Total {len(success_records)} samples already processed") + # Initialize client client = None if frame == "memos-api": @@ -126,11 +174,23 @@ def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): return # Process samples - search_results = [] + new_results = [] + file_lock = threading.Lock() # Lock for thread-safe file writing with ThreadPoolExecutor(max_workers=num_workers) as executor: futures = [] for idx, sample in enumerate(dataset): - future = executor.submit(process_sample, client, sample, idx, frame, version, top_k) + future = executor.submit( + process_sample, + client, + sample, + idx, + frame, + version, + top_k, + success_records, + record_file, + file_lock, + ) futures.append(future) for future in tqdm( @@ -140,13 +200,17 @@ def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): ): result = future.result() if result: - search_results.append(result) + new_results.append(result) + # Update existing results with new result + sample_idx = result.get("sample_idx") + if sample_idx is not None: + existing_results[sample_idx] = result + + # Merge and save all results + search_results = list(existing_results.values()) + # Sort by sample_idx to maintain order + search_results.sort(key=lambda x: x.get("sample_idx", 0)) - # Save results - os.makedirs(f"results/long_bench-v2/{frame}-{version}/", exist_ok=True) - output_path = ( - f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_search_results.json" - ) with open(output_path, "w", encoding="utf-8") as f: json.dump(search_results, f, ensure_ascii=False, indent=2) @@ -172,7 +236,7 @@ def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): parser.add_argument( "--workers", type=int, - default=10, + default=1, help="Number of parallel workers", ) parser.add_argument( diff --git a/evaluation/scripts/long_bench-v2/wait_scheduler.py b/evaluation/scripts/long_bench-v2/wait_scheduler.py new file mode 100644 index 00000000..716869a1 --- /dev/null +++ b/evaluation/scripts/long_bench-v2/wait_scheduler.py @@ -0,0 +1,67 @@ +import os +import time + +import requests + +from dotenv import load_dotenv + + +def wait_until_completed(params: dict, interval: float = 2.0, timeout: float = 600.0): + """ + Keep polling /product/scheduler/status until status == 'completed' (or terminal). + + params: dict passed as query params, e.g. 
{"user_id": "xxx"} or {"user_id": "xxx", "task_id": "..."} + interval: seconds between polls + timeout: max seconds to wait before raising TimeoutError + """ + load_dotenv() + base_url = os.getenv("MEMOS_URL") + if not base_url: + raise RuntimeError("MEMOS_URL not set in environment") + + url = f"{base_url}/product/scheduler/status" + start = time.time() + active_states = {"waiting", "pending", "in_progress"} + + while True: + resp = requests.get(url, params=params, timeout=10) + resp.raise_for_status() + data = resp.json() + + items = data.get("data", []) if isinstance(data, dict) else [] + statuses = [item.get("status") for item in items if isinstance(item, dict)] + status_set = set(statuses) + + # Print current status snapshot + print(f"Current status: {status_set or 'empty'}") + + # Completed if no active states remain + if not status_set or status_set.isdisjoint(active_states): + print("Task completed!") + return data + + if (time.time() - start) > timeout: + raise TimeoutError(f"Timeout after {timeout}s; last statuses={status_set or 'empty'}") + + time.sleep(interval) + + +if __name__ == "__main__": + import argparse + import json + + parser = argparse.ArgumentParser() + parser.add_argument( + "--user_id", default="longbench_v2_0_long-bench-v2-1208-2119-async", help="User ID to query" + ) + parser.add_argument("--task_id", help="Optional task_id to query") + parser.add_argument("--interval", type=float, default=2.0, help="Poll interval seconds") + parser.add_argument("--timeout", type=float, default=600.0, help="Timeout seconds") + args = parser.parse_args() + + params = {"user_id": args.user_id} + if args.task_id: + params["task_id"] = args.task_id + + result = wait_until_completed(params, interval=args.interval, timeout=args.timeout) + print(json.dumps(result, indent=2, ensure_ascii=False)) diff --git a/evaluation/scripts/run_longbench_v2_eval.sh b/evaluation/scripts/run_longbench_v2_eval.sh new file mode 100755 index 00000000..917c57bf --- /dev/null +++ b/evaluation/scripts/run_longbench_v2_eval.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Common parameters for all scripts +LIB="memos-api" +VERSION="long-bench-v2-1208-1556-async" +WORKERS=10 +TOPK=20 +MAX_SAMPLES="" # Empty means all samples +WAIT_INTERVAL=2 # seconds between polls +WAIT_TIMEOUT=900 # seconds per user + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --lib) + LIB="$2" + shift 2 + ;; + --version) + VERSION="$2" + shift 2 + ;; + --workers) + WORKERS="$2" + shift 2 + ;; + --top_k) + TOPK="$2" + shift 2 + ;; + --max_samples) + MAX_SAMPLES="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Build max_samples argument +MAX_SAMPLES_ARG="" +if [ -n "$MAX_SAMPLES" ]; then + MAX_SAMPLES_ARG="--max_samples $MAX_SAMPLES" +fi + +echo "Running LongBench v2 evaluation with:" +echo " LIB: $LIB" +echo " VERSION: $VERSION" +echo " WORKERS: $WORKERS" +echo " TOPK: $TOPK" +echo " MAX_SAMPLES: ${MAX_SAMPLES:-all}" +echo "" + +# Step 2: Search +echo "" +echo "==========================================" +echo "Step 2: Running longbench_v2_search.py..." +echo "==========================================" +python scripts/long_bench-v2/longbench_v2_search.py \ + --lib $LIB \ + --version $VERSION \ + --top_k $TOPK \ + --workers $WORKERS \ + $MAX_SAMPLES_ARG + +if [ $? -ne 0 ]; then + echo "Error running longbench_v2_search.py" + exit 1 +fi + +# Step 3: Response Generation +echo "" +echo "==========================================" +echo "Step 3: Running longbench_v2_responses.py..." 
+echo "==========================================" +python scripts/long_bench-v2/longbench_v2_responses.py \ + --lib $LIB \ + --version $VERSION \ + --workers $WORKERS + +if [ $? -ne 0 ]; then + echo "Error running longbench_v2_responses.py" + exit 1 +fi + +# Step 4: Metrics Calculation +echo "" +echo "==========================================" +echo "Step 4: Running longbench_v2_metric.py..." +echo "==========================================" +python scripts/long_bench-v2/longbench_v2_metric.py \ + --lib $LIB \ + --version $VERSION + +if [ $? -ne 0 ]; then + echo "Error running longbench_v2_metric.py" + exit 1 +fi + +echo "" +echo "==========================================" +echo "All steps completed successfully!" +echo "==========================================" +echo "" +echo "Results are saved in: results/long_bench-v2/$LIB-$VERSION/" +echo " - Search results: ${LIB}_longbench_v2_search_results.json" +echo " - Responses: ${LIB}_longbench_v2_responses.json" +echo " - Metrics: ${LIB}_longbench_v2_metrics.json" From 0ef1bb54c5173632733852dbc9eef4ff0d348004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 9 Dec 2025 15:06:40 +0800 Subject: [PATCH 14/17] feat: modify long_bench_v2 --- .../long_bench-v2/longbench_v2_ingestion.py | 2 +- .../longbench_v2_ingestion_async.py | 158 ------------------ .../long_bench-v2/longbench_v2_metric.py | 4 +- .../long_bench-v2/longbench_v2_responses.py | 2 +- .../long_bench-v2/longbench_v2_search.py | 48 +++++- 5 files changed, 50 insertions(+), 164 deletions(-) delete mode 100644 evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py index 72a02397..fc65e497 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion.py @@ -106,7 +106,7 @@ def main(frame, version="default", num_workers=10, max_samples=None): # Initialize checkpoint file for resume functionality checkpoint_dir = os.path.join( - ROOT_DIR, "evaluation", "results", "longbench_v2", f"{frame}-{version}" + ROOT_DIR, "evaluation", "results", "long_bench_v2", f"{frame}-{version}" ) os.makedirs(checkpoint_dir, exist_ok=True) record_file = os.path.join(checkpoint_dir, "success_records.txt") diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py b/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py deleted file mode 100644 index c23d7885..00000000 --- a/evaluation/scripts/long_bench-v2/longbench_v2_ingestion_async.py +++ /dev/null @@ -1,158 +0,0 @@ -import argparse -import json -import os -import sys - -from concurrent.futures import ThreadPoolExecutor, as_completed - -from dotenv import load_dotenv -from tqdm import tqdm - - -ROOT_DIR = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -) -EVAL_SCRIPTS_DIR = os.path.join(ROOT_DIR, "evaluation", "scripts") - -sys.path.insert(0, ROOT_DIR) -sys.path.insert(0, EVAL_SCRIPTS_DIR) - - -def ingest_sample(client, sample, sample_idx, frame, version): - """Ingest a single LongBench v2 sample as memories.""" - user_id = f"longbench_v2_{sample_idx}_{version}" - conv_id = f"longbench_v2_{sample_idx}_{version}" - - # Get context and convert to messages - context = sample.get("context", "") - - # For memos, we ingest the context as document content - messages = [ - { - "type": "file", - "file": { - "file_data": context, - "file_id": 
str(sample_idx), - }, - } - ] - - if "memos-api" in frame: - try: - client.add(messages=messages, user_id=user_id, conv_id=conv_id, batch_size=1) - print(f"✅ [{frame}] Ingested sample {sample_idx}") - return True - except Exception as e: - print(f"❌ [{frame}] Error ingesting sample {sample_idx}: {e}") - return False - - return False - - -def load_dataset_from_local(): - """Load LongBench v2 dataset from local JSON file.""" - data_dir = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - "data", - "long_bench_v2", - ) - - filepath = os.path.join(data_dir, "data.json") - - if not os.path.exists(filepath): - raise FileNotFoundError(f"Dataset file not found: {filepath}") - - # Load JSON file - with open(filepath, encoding="utf-8") as f: - samples = json.load(f) - - return samples - - -def main(frame, version="default", num_workers=10, max_samples=None): - """Main ingestion function.""" - load_dotenv() - - print("\n" + "=" * 80) - print(f"🚀 LONGBENCH V2 INGESTION - {frame.upper()} v{version}".center(80)) - print("=" * 80 + "\n") - - # Load dataset from local file - try: - dataset = load_dataset_from_local() - print(f"Loaded {len(dataset)} samples from LongBench v2") - except FileNotFoundError as e: - print(f"❌ Error loading dataset: {e}") - return - except Exception as e: - print(f"❌ Error loading dataset: {e}") - return - - # Limit samples if specified - if max_samples: - dataset = dataset[:max_samples] - print(f"Limited to {len(dataset)} samples") - - # Initialize client - client = None - if frame == "memos-api": - from utils.client import MemosApiClient - - client = MemosApiClient() - else: - print(f"❌ Unsupported frame: {frame}") - return - - # Ingest samples - success_count = 0 - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [] - for idx, sample in enumerate(dataset): - future = executor.submit(ingest_sample, client, sample, idx, frame, version) - futures.append(future) - - for future in tqdm( - as_completed(futures), - total=len(futures), - desc="Ingesting LongBench v2", - ): - try: - if future.result(): - success_count += 1 - except Exception as e: - print(f"Error processing sample: {e}") - - print(f"\n{'=' * 80}") - print(f"✅ INGESTION COMPLETE: {success_count}/{len(dataset)} samples ingested".center(80)) - print(f"{'=' * 80}\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--lib", - type=str, - choices=["memos-api", "memos-api-online"], - default="memos-api", - ) - parser.add_argument( - "--version", - type=str, - default="long-bench-v2-1208-1556-async", - help="Version identifier for saving results", - ) - parser.add_argument( - "--workers", - type=int, - default=20, - help="Number of parallel workers", - ) - parser.add_argument( - "--max_samples", - type=int, - default=None, - help="Maximum number of samples to process (default: all)", - ) - args = parser.parse_args() - - main(args.lib, args.version, args.workers, args.max_samples) diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_metric.py b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py index 6489dc40..6a4fc2b7 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_metric.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_metric.py @@ -83,7 +83,7 @@ def main(frame, version="default"): print("=" * 80 + "\n") # Load responses - responses_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_responses.json" + responses_path = 
f"results/long_bench_v2/{frame}-{version}/{frame}_longbench_v2_responses.json" if not os.path.exists(responses_path): print(f"❌ Responses not found: {responses_path}") print("Please run longbench_v2_responses.py first") @@ -99,7 +99,7 @@ def main(frame, version="default"): metrics = calculate_accuracy(filtered) # Save metrics - output_path = f"results/long_bench-v2/{frame}-{version}/{frame}_longbench_v2_metrics.json" + output_path = f"results/long_bench_v2/{frame}-{version}/{frame}_longbench_v2_metrics.json" os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_responses.py b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py index 8c443943..cc158611 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_responses.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_responses.py @@ -158,7 +158,7 @@ def main(frame, version="default", num_workers=10): # Initialize checkpoint file for resume functionality checkpoint_dir = os.path.join( - ROOT_DIR, "evaluation", "results", "long_bench-v2", f"{frame}-{version}" + ROOT_DIR, "evaluation", "results", "long_bench_v2", f"{frame}-{version}" ) os.makedirs(checkpoint_dir, exist_ok=True) record_file = os.path.join(checkpoint_dir, "response_success_records.txt") diff --git a/evaluation/scripts/long_bench-v2/longbench_v2_search.py b/evaluation/scripts/long_bench-v2/longbench_v2_search.py index 686ff0ba..9730e937 100644 --- a/evaluation/scripts/long_bench-v2/longbench_v2_search.py +++ b/evaluation/scripts/long_bench-v2/longbench_v2_search.py @@ -25,6 +25,46 @@ def memos_api_search(client, query, user_id, top_k, frame): start = time() search_results = client.search(query=query, user_id=user_id, top_k=top_k) + def _reorder_memories_by_sources(sr: dict) -> list: + """ + Reorder text_mem[0].memories using sources' chunk_index (ascending). + Falls back to original order if no chunk_index is found. 
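+
+        Illustrative example: memories whose sources carry chunk_index values
+        2, 0 and None come back ordered 0, 2, None (None sorts last, original
+        position breaks ties).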
+ """ + if not isinstance(sr, dict): + return [] + text_mem = sr.get("text_mem") or [] + if not text_mem or not text_mem[0].get("memories"): + return [] + memories = list(text_mem[0]["memories"]) + + def _first_source(mem: dict): + if not isinstance(mem, dict): + return None + # Prefer top-level sources, else metadata.sources + return (mem.get("sources") or mem.get("metadata", {}).get("sources") or []) or None + + def _chunk_index(mem: dict): + srcs = _first_source(mem) + if not srcs or not isinstance(srcs, list): + return None + for s in srcs: + if isinstance(s, dict) and s.get("chunk_index") is not None: + return s.get("chunk_index") + return None + + # Collect keys + keyed = [] + for i, mem in enumerate(memories): + ci = _chunk_index(mem) + keyed.append((ci, i, mem)) # keep original order as tie-breaker + + # If no chunk_index present at all, return original + if all(ci is None for ci, _, _ in keyed): + return memories + + keyed.sort(key=lambda x: (float("inf") if x[0] is None else x[0], x[1])) + return [k[2] for k in keyed] + # Format context from search results based on frame type for backward compatibility context = "" if ( @@ -32,7 +72,11 @@ def memos_api_search(client, query, user_id, top_k, frame): and isinstance(search_results, dict) and "text_mem" in search_results ): - context = "\n".join([i["memory"] for i in search_results["text_mem"][0]["memories"]]) + ordered_memories = _reorder_memories_by_sources(search_results) + if not ordered_memories and search_results["text_mem"][0].get("memories"): + ordered_memories = search_results["text_mem"][0]["memories"] + + context = "\n".join([i.get("memory", "") for i in ordered_memories]) if "pref_string" in search_results: context += f"\n{search_results.get('pref_string', '')}" @@ -129,7 +173,7 @@ def main(frame, version="default", num_workers=10, top_k=20, max_samples=None): # Initialize checkpoint file for resume functionality checkpoint_dir = os.path.join( - ROOT_DIR, "evaluation", "results", "long_bench-v2", f"{frame}-{version}" + ROOT_DIR, "evaluation", "results", "long_bench_v2", f"{frame}-{version}" ) os.makedirs(checkpoint_dir, exist_ok=True) record_file = os.path.join(checkpoint_dir, "search_success_records.txt") From b58ee88db79efdb448a9783a69068070ce8d807c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 9 Dec 2025 15:46:34 +0800 Subject: [PATCH 15/17] fix: image bug --- .../mem_reader/read_multi_modal/file_content_parser.py | 3 ++- src/memos/mem_reader/read_multi_modal/image_parser.py | 6 +++++- src/memos/mem_reader/read_multi_modal/tool_parser.py | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 408736d2..20fc03ec 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -471,6 +471,7 @@ def parse_fast( total_chunks = len(content_chunks) # Create memory items for each chunk + content_chunk_embeddings = self.embedder.embed(content_chunks) memory_items = [] for chunk_idx, chunk_text in enumerate(content_chunks): if not chunk_text.strip(): @@ -499,7 +500,7 @@ def parse_fast( f"chunk:{chunk_idx + 1}/{total_chunks}", ], key=_derive_key(chunk_text), - embedding=self.embedder.embed([chunk_text])[0], + embedding=content_chunk_embeddings[chunk_idx], usage=[], sources=[source], background="", diff --git a/src/memos/mem_reader/read_multi_modal/image_parser.py 
b/src/memos/mem_reader/read_multi_modal/image_parser.py
index 5a19393a..74129508 100644
--- a/src/memos/mem_reader/read_multi_modal/image_parser.py
+++ b/src/memos/mem_reader/read_multi_modal/image_parser.py
@@ -64,7 +64,11 @@ def rebuild_from_source(
     ) -> ChatCompletionContentPartImageParam:
         """Rebuild image_url content part from SourceMessage."""
         # Rebuild from source fields
-        url = getattr(source, "url", "") or (source.content or "").replace("[image_url]: ", "")
+        url = (
+            getattr(source, "url", "")
+            or getattr(source, "image_path", "")
+            or (source.content or "").replace("[image_url]: ", "")
+        )
         detail = getattr(source, "detail", "auto")
         return {
             "type": "image_url",
diff --git a/src/memos/mem_reader/read_multi_modal/tool_parser.py b/src/memos/mem_reader/read_multi_modal/tool_parser.py
index e13b684a..70589648 100644
--- a/src/memos/mem_reader/read_multi_modal/tool_parser.py
+++ b/src/memos/mem_reader/read_multi_modal/tool_parser.py
@@ -79,6 +79,7 @@ def create_source(
                     filename=file_info.get("filename", ""),
                     file_id=file_info.get("file_id", ""),
                     tool_call_id=tool_call_id,
+                    file_info=file_info,
                 )
             )
         elif part_type == "image_url":

From f94b0012f813a23002da4600ec1853d7f0c9557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Tue, 9 Dec 2025 15:48:42 +0800
Subject: [PATCH 16/17] feat: increase playground depth

---
 src/memos/memories/textual/tree.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/memos/memories/textual/tree.py b/src/memos/memories/textual/tree.py
index 7f022b43..75eae30e 100644
--- a/src/memos/memories/textual/tree.py
+++ b/src/memos/memories/textual/tree.py
@@ -210,7 +210,7 @@ def search(
     def get_relevant_subgraph(
         self,
         query: str,
-        top_k: int = 5,
+        top_k: int = 20,
         depth: int = 2,
         center_status: str = "activated",
         user_name: str | None = None,

From eba9e96216975495591ccd62b730b88e09dd0449 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?=
Date: Tue, 9 Dec 2025 17:48:01 +0800
Subject: [PATCH 17/17] feat: set parsed_text to None in file parser

---
 .../read_multi_modal/file_content_parser.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py
index 20fc03ec..8fa0f245 100644
--- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py
+++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py
@@ -612,8 +612,6 @@ def parse_fine(
                         # Use parser from utils
                         if parser:
                             parsed_text = parser.parse(temp_file_path)
-                        else:
-                            parsed_text = "[File parsing error: Parser not available]"
                     except Exception as e:
                         logger.error(
                             f"[FileContentParser] Error parsing downloaded file: {e}"
                         )
@@ -633,18 +631,9 @@ def parse_fine(
             # Priority 2: If file_id is provided but no file_data, try to use file_id as path
             elif file_id:
                 logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}")
-                parsed_text = f"[File ID: {file_id}]: File data not provided"
-
-            # If no content could be parsed, create a placeholder
-            if not parsed_text:
-                if filename:
-                    parsed_text = f"[File: {filename}] File data not provided"
-                else:
-                    parsed_text = "[File: unknown] File data not provided"

         except Exception as e:
             logger.error(f"[FileContentParser] Error in parse_fine: {e}")
-            parsed_text = f"[File parsing error: {e!s}]"

         finally:
             # Clean up temporary file
             if temp_file_path and os.path.exists(temp_file_path):
                 try:
                     os.unlink(temp_file_path)
                 except Exception as e:
                     logger.warning(
                         f"[FileContentParser] Failed to delete temp file {temp_file_path}: 
{e}" ) - + if not parsed_text: + return [] # Extract and process images from parsed_text if is_markdown and parsed_text and self.image_parser: parsed_text = self._extract_and_process_images(parsed_text, info, **kwargs)
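
A quick way to sanity-check the evaluation pipeline above is to exercise the three-stage regex fallback that longbench_v2_responses.py uses to grade model output. The sketch below restates extract_answer from the patch so it runs standalone; the sample responses are invented solely to hit each branch:

import re


def extract_answer(response):
    """Extract answer (A, B, C, or D); mirrors the patched helper."""
    response = response.replace("*", "")
    # Strictest pattern first: "The correct answer is (X)"
    match = re.search(r"The correct answer is \(([A-D])\)", response, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    # Fallback: same phrase without parentheses
    match = re.search(r"The correct answer is ([A-D])", response, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    # Last resort: any standalone A-D token
    match = re.search(r"\b([A-D])\b", response)
    if match:
        return match.group(1).upper()
    return None


# Invented model outputs, one per branch:
assert extract_answer("The correct answer is (B).") == "B"      # strict form
assert extract_answer("the correct answer is c") == "C"         # no parentheses
assert extract_answer("I would go with D on balance.") == "D"   # bare token
assert extract_answer("None of the options fit.") is None       # no match

Ordering the patterns from strictest to loosest keeps the bare \b[A-D]\b fallback from firing on an incidental capital letter whenever the model did follow the requested "The correct answer is (X)" format.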