Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
## Accuracy testing of Sparse method

### Overview
We use two Chinese subsets of [LongBench](https://huggingface.co/datasets/zai-org/LongBench) to test the accuracy of single-document QA (multifieldqa_zh) and multi-document QA (dureader). The accuracy of the sparse methods is evaluated with the F1 score for multifieldqa_zh and with Rouge-L for dureader (see the `dataset2metric` table in `eval/eval.py`). For more information about LongBench, please refer to https://github.com/THUDM/LongBench.

### Quick Start

#### Environment Preparation
```shell
pip install jieba fuzzywuzzy rouge
```
#### Test Data Preparation
Download the LongBench dataset:

```shell
wget https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip && unzip data.zip

```

#### Configure Specific Sparse Method

Settings for different sparse methods are written in a JSON file, for example:
```json
{"ESA":
{
"init_window_sz": 1,
"local_window_sz": 2,
"min_blocks":4,
"sparse_ratio": 0.2,
"retrieval_stride": 10
}
}
```

Run accuracy testing with:
```shell
cd eval

# Run with default settings: Qwen2.5-14B-Instruct batch=20
bash eval_inference_F1.sh

# Run with custom parameters
# --strip_think: extract the text after </think> from model predictions
# --batch: number of requests processed per batch
bash eval_inference_F1.sh \
--model /home/models/QwQ-32B \
--config ./eval/ucm_sparse_config_esa.json \
--data ./eval/data \
--strip_think 1 \
--batch 1

```
The result files will be saved in the eval/ucm_sparse_predictions folder.

### Results
Test results of Full Attention (Qwen2.5-14B-Instruct):

| Dataset | F1-Score |
|-------|-----------:|
| multifieldqa_zh | 66.6 |
| dureader | 29.33 |

282 changes: 282 additions & 0 deletions eval/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import json
import argparse
import numpy as np
import re
import string

import jieba
from fuzzywuzzy import fuzz
from collections import Counter
from rouge import Rouge


def extract_pred_after_think(text):
    """Return the part of a model output that follows the first ``</think>``.

    Args:
        text: Raw model output, or ``None``.

    Returns:
        ``""`` when ``text`` is ``None``. Otherwise the whitespace-stripped
        text after the first ``</think>`` tag, or the whole stripped text
        when the tag does not occur.
    """
    if text is None:
        return ""
    stripped = text.strip()
    # partition() splits on the first occurrence; an empty separator slot
    # means the tag was not found.
    _reasoning, tag, answer = stripped.partition("</think>")
    return answer.strip() if tag else stripped

def has_think_tag(text):
    """Tell whether ``text`` contains a closing ``</think>`` tag.

    Args:
        text: Raw model output, or ``None``.

    Returns:
        ``True`` if the tag occurs in ``text``; ``False`` otherwise,
        including when ``text`` is ``None``.
    """
    return text is not None and "</think>" in text


def normalize_answer(s):
    """Normalize an English answer string for token-level comparison.

    The steps are applied in this order: lowercase, drop every character in
    ``string.punctuation``, replace the standalone words ``a``/``an``/``the``
    with a space, and collapse all whitespace runs to single spaces.

    Args:
        s: The answer text.

    Returns:
        The normalized text.
    """
    lowered = s.lower()
    # One C-level pass that deletes all ASCII punctuation characters.
    without_punctuation = lowered.translate(
        str.maketrans("", "", string.punctuation)
    )
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())

# Full-width / half-width CJK punctuation, written with escapes on purpose:
# when these characters are folded to their ASCII look-alikes (as happens
# under Unicode compatibility normalization of the file), the full-width
# quotation mark becomes '"' and terminates the literal early, silently
# shrinking the set to a handful of characters.
_FULLWIDTH_PUNCTUATION = (
    "\uff01\uff1f\uff61\u3002"
    "\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09"
    "\uff0a\uff0b\uff0c\uff0d\uff0f"
    "\uff1a\uff1b\uff1c\uff1d\uff1e\uff20"
    "\uff3b\uff3c\uff3d\uff3e\uff3f\uff40"
    "\uff5b\uff5c\uff5d\uff5e\uff5f\uff60"
    "\uff62\uff63\uff64"
)
# Remaining CJK and typographic punctuation (not affected by width folding).
_CJK_PUNCTUATION = "、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# Built once at import time instead of on every call.
_ZH_PUNCTUATION = frozenset(
    string.punctuation + _FULLWIDTH_PUNCTUATION + _CJK_PUNCTUATION
)


def normalize_zh_answer(s):
    """Normalize a Chinese answer string for token-level comparison.

    The steps are applied in this order: lowercase, drop ASCII and Chinese
    punctuation, and remove all whitespace.

    Args:
        s: The answer text (or a single segmented token).

    Returns:
        The normalized text; may be empty when ``s`` held only punctuation
        and/or whitespace.
    """
    lowered = s.lower()
    without_punctuation = "".join(
        ch for ch in lowered if ch not in _ZH_PUNCTUATION
    )
    # Unlike the English variant, whitespace is removed rather than collapsed.
    return "".join(without_punctuation.split())

def count_score(prediction, ground_truth, **kwargs):
    """Score a counting answer by the share of predicted numbers that match.

    Args:
        prediction: Model output text; every run of digits is a candidate.
        ground_truth: Expected count; compared via its ``str()`` form.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        ``matching_numbers / total_numbers`` as a float, or ``0.0`` when the
        prediction contains no digits.
    """
    found = re.findall(r"\d+", prediction)
    if not found:
        return 0.0
    target = str(ground_truth)
    hits = sum(1 for number in found if number == target)
    return float(hits / len(found))

def retrieval_score(prediction, ground_truth, **kwargs):
    """Score an English passage-retrieval answer.

    The target id is the number in the first ``Paragraph <n>`` occurrence of
    ``ground_truth``; the score is the share of numbers in ``prediction``
    equal to that id.

    Args:
        prediction: Model output text.
        ground_truth: Reference text containing ``Paragraph <n>``.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the prediction has no digits.

    Raises:
        IndexError: If ``ground_truth`` contains no ``Paragraph <n>``.
    """
    target_id = re.findall(r'Paragraph (\d+)', ground_truth)[0]
    candidates = re.findall(r"\d+", prediction)
    if not candidates:
        return 0.0
    return float(candidates.count(target_id) / len(candidates))

def retrieval_zh_score(prediction, ground_truth, **kwargs):
    """Score a Chinese passage-retrieval answer.

    Same rule as the English variant, except the target id is taken from the
    first ``段落<n>`` occurrence in ``ground_truth``.

    Args:
        prediction: Model output text.
        ground_truth: Reference text containing ``段落<n>``.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        The share of numbers in ``prediction`` equal to the target id, as a
        float; ``0.0`` when the prediction has no digits.

    Raises:
        IndexError: If ``ground_truth`` contains no ``段落<n>``.
    """
    wanted = re.findall(r'段落(\d+)', ground_truth)[0]
    mentioned = re.findall(r"\d+", prediction)
    total = len(mentioned)
    if total == 0:
        return 0.0
    correct = sum(number == wanted for number in mentioned)
    return float(correct / total)

def code_sim_score(prediction, ground_truth, **kwargs):
    """Score a code completion by fuzzy similarity of its first code line.

    The first line of ``prediction`` (after stripping leading newlines) that
    contains none of a backtick, ``#`` or ``//`` is compared against
    ``ground_truth`` with ``fuzz.ratio``. If no such line exists, the empty
    string is compared instead.

    Args:
        prediction: Model output text.
        ground_truth: Reference line of code.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        ``fuzz.ratio(...) / 100``.
    """
    skip_markers = ('`', '#', '//')
    candidate_lines = prediction.lstrip('\n').split('\n')
    first_code_line = next(
        (
            line
            for line in candidate_lines
            if not any(marker in line for marker in skip_markers)
        ),
        "",
    )
    return (fuzz.ratio(first_code_line, ground_truth) / 100)

def classification_score(prediction, ground_truth, **kwargs):
    """Score a classification answer by exact class-name mention.

    Every class name that occurs as a substring of ``prediction`` is a
    candidate. Candidates that are proper substrings of ``ground_truth`` are
    discarded, so that predicting the true label is not penalized for also
    "containing" a shorter class name. The score is ``1 / n`` when the true
    label is among the ``n`` remaining candidates, else ``0.0``.

    Args:
        prediction: Model output text.
        ground_truth: The correct class name.
        **kwargs: Must contain ``all_classes``, an iterable of class names.

    Returns:
        A float in ``[0, 1]``.

    Raises:
        KeyError: If ``all_classes`` is not supplied.
    """
    all_classes = kwargs["all_classes"]
    mentioned = [name for name in all_classes if name in prediction]
    # Build a new list instead of calling remove() while iterating: removing
    # during iteration skips the element after each removal, which left
    # substring classes in the list and lowered the score.
    candidates = [
        name
        for name in mentioned
        if not (name in ground_truth and name != ground_truth)
    ]
    if ground_truth in candidates:
        return 1.0 / len(candidates)
    return 0.0

def rouge_score(prediction, ground_truth, **kwargs):
    """Return the Rouge-L F-measure between a prediction and a reference.

    Args:
        prediction: Whitespace-tokenized hypothesis text.
        ground_truth: Whitespace-tokenized reference text.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        ``scores["rouge-l"]["f"]`` from ``Rouge.get_scores``, or ``0.0`` when
        the scorer raises for this pair.
    """
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except Exception:
        # Best-effort: a pair the scorer cannot handle counts as a miss.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long evaluation run.
        return 0.0
    return scores["rouge-l"]["f"]

def rouge_zh_score(prediction, ground_truth, **kwargs):
    """Return the Rouge-L F-measure for Chinese text.

    Both inputs are segmented with ``jieba.cut`` (accurate mode) and joined
    with single spaces before being passed to ``rouge_score``.

    Args:
        prediction: Hypothesis text.
        ground_truth: Reference text.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        The value returned by ``rouge_score`` for the segmented texts.
    """
    segmented_prediction = " ".join(jieba.cut(prediction, cut_all=False))
    segmented_reference = " ".join(jieba.cut(ground_truth, cut_all=False))
    return rouge_score(segmented_prediction, segmented_reference)

def f1_score(prediction, ground_truth, **kwargs):
    """Compute the bag-of-tokens F1 between two token sequences.

    Args:
        prediction: Sequence of predicted tokens.
        ground_truth: Sequence of reference tokens.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        The harmonic mean of precision and recall over the multiset overlap
        of the two sequences, or ``0`` when they share no token.
    """
    # Multiset intersection: each shared token counts min(pred, ref) times.
    overlap = sum((Counter(prediction) & Counter(ground_truth)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(prediction)
    recall = 1.0 * overlap / len(ground_truth)
    return (2 * precision * recall) / (precision + recall)

def qa_f1_score(prediction, ground_truth, **kwargs):
    """Compute token-level F1 for an English QA answer.

    Both texts go through ``normalize_answer`` and are split on whitespace
    before being handed to ``f1_score``.

    Args:
        prediction: Model output text.
        ground_truth: Reference answer text.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        The value returned by ``f1_score`` for the two token lists.
    """
    predicted_tokens, reference_tokens = (
        normalize_answer(text).split() for text in (prediction, ground_truth)
    )
    return f1_score(predicted_tokens, reference_tokens)

def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    """Compute token-level F1 for a Chinese QA answer.

    Each text is segmented with ``jieba.cut`` (accurate mode); every token is
    passed through ``normalize_zh_answer`` and tokens that become empty are
    dropped. The two token lists are then scored with ``f1_score``.

    Args:
        prediction: Model output text.
        ground_truth: Reference answer text.
        **kwargs: Ignored; accepted so all metrics share one call signature.

    Returns:
        The value returned by ``f1_score`` for the two token lists.
    """
    def tokenize(text):
        # Segment, normalize each piece, and keep only non-empty results.
        normalized = (
            normalize_zh_answer(piece)
            for piece in list(jieba.cut(text, cut_all=False))
        )
        return [token for token in normalized if len(token) > 0]

    predicted_tokens = tokenize(prediction)
    reference_tokens = tokenize(ground_truth)
    return f1_score(predicted_tokens, reference_tokens)


# Maps a dataset name (the value passed as --dataset) to the metric function
# that scorer() / scorer_e() call for every (prediction, ground_truth) pair.
# Every metric takes (prediction, ground_truth, **kwargs); only
# classification_score reads a keyword argument (all_classes).
dataset2metric = {
    # English QA: token-level F1 after normalize_answer.
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    # Chinese QA: token-level F1 over jieba-segmented, normalized tokens.
    "multifieldqa_zh": qa_f1_zh_score,
    "clongeval": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    # Rouge-L F-measure; the *_zh variant segments with jieba first.
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    # Class-name matching; requires all_classes to be supplied.
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    # Share of numbers in the prediction that equal the expected number.
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    # Fuzzy string similarity of the first predicted code line.
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}


def parse_args(args=None):
    """Parse the command-line options of the evaluation script.

    Options: ``--model``, ``--answer`` and ``--dataset`` (strings, default
    ``None``), plus the boolean flags ``--strip_think`` and ``--e``.

    Args:
        args: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()

    for string_option in ('--model', '--answer', '--dataset'):
        parser.add_argument(string_option, type=str, default=None)

    boolean_flags = (
        ('--strip_think', "Extract </think> after content"),
        ('--e', "Evaluate on LongBench-E"),
    )
    for flag, description in boolean_flags:
        parser.add_argument(flag, action='store_true', help=description)

    return parser.parse_args(args)

def scorer_e(dataset, predictions, answers, lengths, all_classes):
    """Score predictions bucketed by context length.

    Each prediction gets the best metric value over its ground truths
    (``0.`` if it has none) and is placed in the ``"0-4k"`` (length < 4000),
    ``"4-8k"`` (length < 8000) or ``"8k+"`` bucket.

    Args:
        dataset: Key into ``dataset2metric``.
        predictions: Model outputs.
        answers: For each prediction, an iterable of acceptable answers.
        lengths: For each prediction, the length used for bucketing.
        all_classes: Forwarded to the metric as ``all_classes``.

    Returns:
        A dict mapping each bucket name to ``round(100 * mean, 2)`` of its
        scores (NaN for an empty bucket, as produced by ``np.mean``).
    """
    # For these datasets only the first line of the output is scored.
    first_line_only = dataset in ["trec", "triviaqa", "samsum", "lsht"]
    buckets = {"0-4k": [], "4-8k": [], "8k+": []}

    for prediction, ground_truths, length in zip(predictions, answers, lengths):
        if first_line_only:
            prediction = prediction.lstrip('\n').split('\n')[0]

        best = 0.
        for ground_truth in ground_truths:
            best = max(best, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))

        if length < 4000:
            bucket = "0-4k"
        elif length < 8000:
            bucket = "4-8k"
        else:
            bucket = "8k+"
        buckets[bucket].append(best)

    return {
        name: round(100 * np.mean(values), 2)
        for name, values in buckets.items()
    }

def scorer(dataset, predictions, answers, all_classes):
    """Return the average best-match score over all predictions, in percent.

    Each prediction gets the best metric value over its ground truths
    (``0.`` if it has none); the mean is scaled to 0-100 and rounded to two
    decimals.

    Args:
        dataset: Key into ``dataset2metric``.
        predictions: Model outputs.
        answers: For each prediction, an iterable of acceptable answers.
        all_classes: Forwarded to the metric as ``all_classes``.

    Returns:
        The rounded percentage score; ``0.0`` when ``predictions`` is empty.
    """
    # Nothing to score (e.g. every record was filtered out upstream):
    # report 0.0 instead of dividing by zero.
    if not predictions:
        return 0.0

    # For these datasets only the first line of the output is scored.
    first_line_only = dataset in ["trec", "triviaqa", "samsum", "lsht"]
    total_score = 0.
    for prediction, ground_truths in zip(predictions, answers):
        if first_line_only:
            prediction = prediction.lstrip('\n').split('\n')[0]
        score = 0.
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)

def fix_json_format(line):
    """Apply a series of textual rewrites to one prediction-record line.

    The rewrites run in a fixed order and each one sees the output of the
    previous one, so the order matters. The function is not called anywhere
    in the visible part of this file.

    NOTE(review): the rewrites are purely textual and also touch quotes and
    the word "None" inside the prediction text itself — confirm the result
    is what the consumer expects before relying on it.

    Args:
        line: One raw line of text.

    Returns:
        The rewritten line.
    """
    # Flatten a doubly nested answers list: "answers": [[...]] -> [...].
    line = re.sub(r'"answers": \[\[(.*?)\]\]', r'"answers": [\1]', line)

    # Turn every single quote into a double quote and None into null.
    line = line.replace("'", '"')
    line = line.replace("None", "null")
    # Trim the ends and drop all newline, carriage-return and tab characters.
    line = line.strip().replace("\n", "").replace("\r", "").replace("\t", "")
    # Match the "pred" value up to the first double quote that is directly
    # followed by a comma. The pattern requires no space after the colon.
    pattern = re.compile(r'"pred":"(.*?)"(?=,)', re.DOTALL)
    def escape_quotes(match):
        # Backslash-escape double quotes inside the captured value.
        escaped_value = match.group(1).replace('"', '\\"')
        return f'"pred":"{escaped_value}"'

    line = pattern.sub(escape_quotes, line)

    # Match the text between the brackets of the "answers" list (up to the
    # first closing bracket).
    pattern = re.compile(r'"answers":\s*\[([^\]]+)\]', re.DOTALL)
    def escape_quotes_in_answers(match):
        internel_content = match.group(1)

        # Split on the '","' separator between items, escape the double
        # quotes left in each piece, then re-join and re-wrap in quotes.
        # NOTE(review): the first piece keeps its leading quote and the last
        # piece its trailing quote, so those get escaped as well and end up
        # inside the re-wrapped strings — confirm this is intended.
        items = internel_content.split('","')
        escaped_items = [item.replace('"', '\\"') for item in items]

        escaped_content = '","'.join(escaped_items)

        return f'"answers": ["{escaped_content}"]'
    line = pattern.sub(escape_quotes_in_answers, line)

    return line


if __name__ == '__main__':
    # Script entry point: read a JSON-lines prediction file (--answer), score
    # it with the metric registered for --dataset, and print the result.
    args = parse_args()

    predictions, answers, lengths = [], [], []
    all_classes = None
    with open(args.answer, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would reject.
            if not line.strip():
                continue
            data = json.loads(line)

            # The classification metrics iterate over all_classes; pick it up
            # from the records when present instead of always passing None.
            if data.get("all_classes") is not None:
                all_classes = data["all_classes"]

            pred_raw = data["pred"]
            if args.strip_think:
                # Records without a closing </think> tag are excluded from
                # scoring altogether.
                if not has_think_tag(pred_raw):
                    continue
                pred_clean = extract_pred_after_think(pred_raw)
                print(pred_clean)
            else:
                pred_clean = pred_raw

            predictions.append(pred_clean)
            answers.append(data["answers"])

            if "length" in data:
                lengths.append(data["length"])

    print("----"*10)
    print("有效条数:", len(predictions))
    print("----"*10)

    if args.e:
        # scorer_e zips predictions with lengths; a record without "length"
        # would shift every later length onto the wrong prediction.
        if len(lengths) != len(predictions):
            raise SystemExit(
                '--e requires a "length" field on every scored record '
                f"(got {len(lengths)} lengths for {len(predictions)} predictions)"
            )
        score = scorer_e(args.dataset, predictions, answers, lengths, all_classes)
        print("All score:", score)
    else:
        # Report the score on the first 50 records as well as on all of them.
        score50 = scorer(args.dataset, predictions[:50], answers[:50], all_classes)
        score_all = scorer(args.dataset, predictions, answers, all_classes)
        print("50 score:", score50)
        print("All score:", score_all)
Loading
Loading