21 changes: 16 additions & 5 deletions README.md
@@ -47,11 +47,18 @@ you can find more information about its installation [here](https://docs.astral.
Then run:

```bash
# Base installation (works on all platforms)
uv sync
source .venv/bin/activate

# With GPU extras (required for local evaluation)
uv sync --extra gpu
source .venv/bin/activate
uv pip install --no-build-isolation flash-attn # Needed for the FAISS searcher's encoder
```

> **Note:** The base installation supports all search agents and retrieval functionality. The `--extra gpu` option adds vllm and deepspeed, which are needed for running local GPU-accelerated evaluation with `scripts_evaluation/evaluate_run.py`.
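
A quick sanity check that the GPU extras are importable (assuming the virtual environment is active):

```bash
python -c "import vllm, deepspeed; print('GPU extras OK')"
```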

Additionally, this repo depends on Java 21. One way to install it is through conda:

```bash
@@ -114,7 +121,7 @@ python scripts_evaluation/evaluate_run.py --input_dir runs/my_model
```
where you may pass `--tensor_parallel_size {num_gpus}` to match the number of GPUs you have available.
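
For example, on a machine with 4 GPUs (`runs/my_model` is a placeholder for your run directory):

```bash
python scripts_evaluation/evaluate_run.py --input_dir runs/my_model --tensor_parallel_size 4
```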

> Note that the script above evaluates using Qwen3-32B as a judge. For more details on evaluation, please refer to [docs/llm_as_judge.md](docs/llm_as_judge.md).
> **Note:** The script above evaluates using Qwen3-32B as a judge and requires `uv sync --extra gpu` installation with GPU access. For alternative evaluation methods including legacy GPT-4.1 evaluation (no GPU required), please refer to [docs/llm_as_judge.md](docs/llm_as_judge.md).

### Submitting to the Leaderboard

@@ -194,14 +201,18 @@ where you can find the decrypted trajectory data in `data/decrypted_run_files/`.
## 📚 Docs

To reproduce results from BrowseComp-Plus, you can refer to the following docs for running the various search agents:

### API-Based Agents (Base Installation, No GPU Required)
- [OpenAI API](docs/openai.md)
- [Gemini API](docs/gemini.md)
- [Anthropic API](docs/anthropic.md)
- [GLM API](docs/glm.md)
- [OSS](docs/oss.md)
- [Qwen-3](docs/qwen.md)
- [Search-R1](docs/search-r1.md)
- [Tongyi-DeepResearch-30B-A3B](docs/tongyi.md)

### Self-Hosted Model Agents (Requires GPU)
- [OSS](docs/oss.md) - Requires GPU for vLLM server
- [Qwen-3](docs/qwen.md) - Requires GPU for model serving
- [Search-R1](docs/search-r1.md) - Requires GPU for model serving
- [Tongyi-DeepResearch-30B-A3B](docs/tongyi.md) - Requires GPU for model serving

As an alternative to deploying the retrieval server yourself locally, you can also use a [pre-deployed MCP server from NetMind](docs/netmind_mcp.md).

25 changes: 22 additions & 3 deletions pyproject.toml
@@ -1,3 +1,7 @@
[build-system]
requires = ["setuptools>=77.0.0"]
build-backend = "setuptools.build_meta"

[project]
name = "browsecomp-plus"
version = "0.1.0"
@@ -7,7 +11,6 @@ dependencies = [
"accelerate>=1.9.0",
"anthropic>=0.58.2",
"datasets>=4.0.0",
"deepspeed>=0.17.2",
"faiss-cpu>=1.11.0.post1",
"fastmcp==2.9.2",
"google-genai>=1.27.0",
@@ -22,9 +25,25 @@ dependencies = [
"tevatron",
"torchvision",
"tqdm>=4.67.1",
"transformers>=4.53.2",
"vllm>=0.9.0",
"transformers>=4.53.2,<5.0",
"tevatron @ git+https://github.com/texttron/tevatron.git@main",
]

[project.optional-dependencies]
gpu = [
"deepspeed>=0.17.2",
"vllm>=0.9.0",
]

[tool.uv.sources]
tevatron = { git = "https://github.com/texttron/tevatron.git", rev = "main" }

[tool.setuptools]
script-files = [
"scripts_build_index/download_indexes.sh",
"scripts_build_index/download_run_files.sh",
]

[tool.setuptools.packages.find]
include = ["searcher*", "search_agent*", "scripts_evaluation*", "scripts_build_index*"]
Empty file added scripts_build_index/__init__.py
Empty file.
24 changes: 1 addition & 23 deletions scripts_evaluation/evaluate_run.py
@@ -11,31 +11,9 @@
import numpy as np
from tqdm import tqdm
from vllm import LLM, SamplingParams

sys.path.append(str(Path(__file__).parent.parent))
from search_agent.prompts import GRADER_TEMPLATE_QWEN as GRADER_TEMPLATE

GRADER_TEMPLATE = """
Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.

[question]: {question}

[response]: {response}

[correct_answer]: {correct_answer}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response].

[correct_answer]: Repeat the [correct_answer] given above.

reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], in the context of this [question]. You should judge whether the extracted_final_answer is semantically equivalent to [correct_answer], allowing the extracted_final_answer to be string variations of [correct_answer]. You should also allow the extracted_final_answer to be more precise or verbose than [correct_answer], as long as its additional details are correct. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers are semantically equivalent.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()


def load_ground_truth(jsonl_path: Path) -> Dict[str, Dict[str, str]]:
23 changes: 23 additions & 0 deletions search_agent/prompts.py
@@ -61,6 +61,29 @@
correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()

GRADER_TEMPLATE_QWEN = """
Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.

[question]: {question}

[response]: {response}

[correct_answer]: {correct_answer}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response].

[correct_answer]: Repeat the [correct_answer] given above.

reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], in the context of this [question]. You should judge whether the extracted_final_answer is semantically equivalent to [correct_answer], allowing the extracted_final_answer to be string variations of [correct_answer]. You should also allow the extracted_final_answer to be more precise or verbose than [correct_answer], as long as its additional details are correct. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers are semantically equivalent.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()

24 changes: 22 additions & 2 deletions search_agent/search_r1_client.py
@@ -41,6 +41,25 @@

os.makedirs(args.output_dir, exist_ok=True)

def detect_device():
    if torch.cuda.is_available():
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        print("MPS available (Apple Silicon GPU)")
        return torch.device("mps")
    else:
        print("Using CPU only (no GPU backend available)")
        return torch.device("cpu")

def get_model_dtype_and_device():
    if torch.cuda.is_available():
        return torch.bfloat16, "auto"
    elif torch.backends.mps.is_available():
        return torch.float16, None  # Don't use device_map with MPS
    else:
        return torch.float32, None

if args.query.endswith(".tsv"):
    print(f"Loading questions from TSV file: {args.query}...")
    questions_to_process = pd.read_csv(
@@ -53,16 +72,17 @@
print("Processing 1 question")

model_id = args.model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = detect_device()
curr_eos = [151645, 151643] # for Qwen2.5 series models
curr_search_template = (
    "\n\n{output_text}<information>{search_results}</information>\n\n"
)

print("Loading model and tokenizer...")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
dtype, device_map = get_model_dtype_and_device()
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
    model_id, dtype=dtype, device_map=device_map
)


34 changes: 18 additions & 16 deletions searcher/searchers/__init__.py
@@ -5,25 +5,18 @@
from enum import Enum

from .base import BaseSearcher
from .bm25_searcher import BM25Searcher
from .custom_searcher import CustomSearcher
from .faiss_searcher import FaissSearcher, ReasonIrSearcher


class SearcherType(Enum):
"""Enum for managing available searcher types and their CLI mappings."""

BM25 = ("bm25", BM25Searcher)
FAISS = ("faiss", FaissSearcher)
REASONIR = ("reasonir", ReasonIrSearcher)
CUSTOM = (
"custom",
CustomSearcher,
) # Your custom searcher class, yet to be implemented
BM25 = "bm25"
FAISS = "faiss"
REASONIR = "reasonir"
CUSTOM = "custom"

    def __init__(self, cli_name, searcher_class):
    def __init__(self, cli_name):
        self.cli_name = cli_name
        self.searcher_class = searcher_class

    @classmethod
    def get_choices(cls):
@@ -32,10 +25,19 @@ def get_choices(cls):

    @classmethod
    def get_searcher_class(cls, cli_name):
        """Get searcher class by CLI name."""
        for searcher_type in cls:
            if searcher_type.cli_name == cli_name:
                return searcher_type.searcher_class
"""Get searcher class by CLI name, importing only when needed."""
if cli_name == cls.BM25.cli_name:
from .bm25_searcher import BM25Searcher
return BM25Searcher
elif cli_name == cls.FAISS.cli_name:
from .faiss_searcher import FaissSearcher
return FaissSearcher
elif cli_name == cls.REASONIR.cli_name:
from .faiss_searcher import ReasonIrSearcher
return ReasonIrSearcher
elif cli_name == cls.CUSTOM.cli_name:
from .custom_searcher import CustomSearcher
return CustomSearcher
raise ValueError(f"Unknown searcher type: {cli_name}")


60 changes: 49 additions & 11 deletions searcher/searchers/faiss_searcher.py
@@ -6,6 +6,7 @@
import logging
import os
import pickle
from contextlib import nullcontext
from itertools import chain
from typing import Any, Dict, List, Optional

@@ -83,6 +84,7 @@ def __init__(self, args):
        self.tokenizer = None
        self.lookup = None
        self.docid_to_text = None
        self.device = detect_device()

logger.info("Initializing FAISS searcher...")

@@ -176,11 +178,12 @@ def _load_model(self) -> None:
            normalize=model_args.normalize,
            lora_name_or_path=model_args.lora_name_or_path,
            cache_dir=model_args.cache_dir,
            torch_dtype=torch_dtype,
            attn_implementation=model_args.attn_implementation,
            dtype=torch_dtype,
            attn_implementation=get_attn_implementation(self.device),
        )

        self.model = self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(
@@ -260,10 +263,10 @@ def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
return_tensors="pt",
)

device = "cuda" if torch.cuda.is_available() else "cpu"
batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}


with torch.amp.autocast(device):
with self.get_autocast_ctx():
with torch.no_grad():
q_reps = self.model.encode_query(batch_dict)
q_reps = q_reps.cpu().detach().numpy()
@@ -298,6 +301,13 @@ def get_document(self, docid: str) -> Optional[Dict[str, Any]]:
    def search_type(self) -> str:
        return "FAISS"

    def get_autocast_ctx(self):
        if self.device.type in ["cpu", "cuda"]:
            return torch.amp.autocast(device_type=self.device.type)
        else:
            # MPS: autocast not supported -> use nullcontext
            return nullcontext()


class ReasonIrSearcher(FaissSearcher):
    def _load_model(self) -> None:
@@ -326,10 +336,10 @@ def _load_model(self) -> None:
        self.model = AutoModel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            torch_dtype=torch_dtype,
            dtype=torch_dtype,
            trust_remote_code=True,
        )
        self.model = self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()

        logger.info("Model loaded successfully")
@@ -338,9 +348,7 @@ def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
        if not all([self.retriever, self.model, self.lookup]):
            raise RuntimeError("Searcher not properly initialized")

        device = "cuda" if torch.cuda.is_available() else "cpu"

        with torch.amp.autocast(device):
        with self.get_autocast_ctx():
            with torch.no_grad():
                q_reps = self.model.encode(
                    [query],
@@ -359,3 +367,33 @@
                )

        return results

def detect_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("MPS available (Apple Silicon GPU)")
    else:
        device = torch.device("cpu")
        print("Using CPU only (no GPU backend available)")
    return device


def has_flash_attn():
    try:
        import flash_attn
        return True
    except ImportError:
        return False

def get_attn_implementation(device):
    # Prefer flash-attention on CUDA when installed; otherwise fall back to
    # PyTorch's built-in SDPA kernels rather than raising.
    if device.type == "cuda":
        return "flash_attention_2" if has_flash_attn() else "sdpa"
    elif device.type == "mps":
        return "sdpa"
    elif device.type == "cpu":
        return "eager"
    raise ValueError("Unknown device {}".format(device))
