21 changes: 16 additions & 5 deletions README.md
@@ -47,11 +47,18 @@ you can find more information about its installation [here](https://docs.astral.
Then run:

```bash
# Base installation (works on all platforms)
uv sync
source .venv/bin/activate

# With GPU extras (required for local evaluation)
uv sync --extra gpu
source .venv/bin/activate
uv pip install --no-build-isolation flash-attn # Needed for the FAISS searcher's encoder
```

> **Note:** The base installation supports all search agents and retrieval functionality. The `--extra gpu` option adds vllm and deepspeed, which are needed for running local GPU-accelerated evaluation with `scripts_evaluation/evaluate_run.py`.
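
A quick sanity check that the GPU extras are importable (assuming the virtual environment is active):

```bash
python -c "import vllm, deepspeed; print('GPU extras OK')"
```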

Additionally, this repo depends on Java 21. One way to install it is through conda:

```bash
@@ -114,7 +121,7 @@ python scripts_evaluation/evaluate_run.py --input_dir runs/my_model
```
where you may pass `--tensor_parallel_size {num_gpus}` to match the number of GPUs you have available.
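
For example, on a machine with 4 GPUs (`runs/my_model` is a placeholder for your run directory):

```bash
python scripts_evaluation/evaluate_run.py --input_dir runs/my_model --tensor_parallel_size 4
```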

> Note that the script above evaluates using Qwen3-32B as a judge. For more details on evaluation, please refer to [docs/llm_as_judge.md](docs/llm_as_judge.md).
> **Note:** The script above evaluates using Qwen3-32B as a judge and requires `uv sync --extra gpu` installation with GPU access. For alternative evaluation methods including legacy GPT-4.1 evaluation (no GPU required), please refer to [docs/llm_as_judge.md](docs/llm_as_judge.md).

### Submitting to the Leaderboard

@@ -194,14 +201,18 @@ where you can find the decrypted trajectory data in `data/decrypted_run_files/`.
## 📚 Docs

To reproduce results from BrowseComp-Plus, you can refer to the following docs for running the various search agents:

### API-Based Agents (Base Installation, No GPU Required)
- [OpenAI API](docs/openai.md)
- [Gemini API](docs/gemini.md)
- [Anthropic API](docs/anthropic.md)
- [GLM API](docs/glm.md)
- [OSS](docs/oss.md)
- [Qwen-3](docs/qwen.md)
- [Search-R1](docs/search-r1.md)
- [Tongyi-DeepResearch-30B-A3B](docs/tongyi.md)

### Self-Hosted Model Agents (Requires GPU)
- [OSS](docs/oss.md) - Requires GPU for vLLM server
- [Qwen-3](docs/qwen.md) - Requires GPU for model serving
- [Search-R1](docs/search-r1.md) - Requires GPU for model serving
- [Tongyi-DeepResearch-30B-A3B](docs/tongyi.md) - Requires GPU for model serving

As an alternative to deploying the retrieval server yourself locally, you can also use a [pre-deployed MCP server from NetMind](docs/netmind_mcp.md).

25 changes: 22 additions & 3 deletions pyproject.toml
@@ -1,3 +1,7 @@
[build-system]
requires = ["setuptools>=77.0.0"]
build-backend = "setuptools.build_meta"

[project]
name = "browsecomp-plus"
version = "0.1.0"
@@ -7,7 +11,6 @@ dependencies = [
"accelerate>=1.9.0",
"anthropic>=0.58.2",
"datasets>=4.0.0",
"deepspeed>=0.17.2",
"faiss-cpu>=1.11.0.post1",
"fastmcp==2.9.2",
"google-genai>=1.27.0",
@@ -22,9 +25,25 @@ dependencies = [
"tevatron",
"torchvision",
"tqdm>=4.67.1",
"transformers>=4.53.2",
"vllm>=0.9.0",
"transformers>=4.53.2,<5.0",
"tevatron @ git+https://github.com/texttron/tevatron.git@main",
]

[project.optional-dependencies]
gpu = [
"deepspeed>=0.17.2",
"vllm>=0.9.0",
]

[tool.uv.sources]
tevatron = { git = "https://github.com/texttron/tevatron.git", rev = "main" }

[tool.setuptools]
script-files = [
"scripts_build_index/download_indexes.sh",
"scripts_build_index/download_run_files.sh",
]

[tool.setuptools.packages.find]
include = ["searcher*", "search_agent*", "scripts_evaluation*", "scripts_build_index*"]
Empty file added scripts_build_index/__init__.py
Empty file.
24 changes: 1 addition & 23 deletions scripts_evaluation/evaluate_run.py
@@ -11,31 +11,9 @@
import numpy as np
from tqdm import tqdm
from vllm import LLM, SamplingParams

sys.path.append(str(Path(__file__).parent.parent))
from search_agent.prompts import GRADER_TEMPLATE_QWEN as GRADER_TEMPLATE

GRADER_TEMPLATE = """
Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.

[question]: {question}

[response]: {response}

[correct_answer]: {correct_answer}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response].

[correct_answer]: Repeat the [correct_answer] given above.

reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], in the context of this [question]. You should judge whether the extracted_final_answer is semantically equivalent to [correct_answer], allowing the extracted_final_answer to be string variations of [correct_answer]. You should also allow the extracted_final_answer to be more precise or verbose than [correct_answer], as long as its additional details are correct. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers are semantically equivalent.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()


def load_ground_truth(jsonl_path: Path) -> Dict[str, Dict[str, str]]:
23 changes: 23 additions & 0 deletions search_agent/prompts.py
@@ -61,6 +61,29 @@
correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()

GRADER_TEMPLATE_QWEN = """
Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.

[question]: {question}

[response]: {response}

[correct_answer]: {correct_answer}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response].

[correct_answer]: Repeat the [correct_answer] given above.

reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], in the context of this [question]. You should judge whether the extracted_final_answer is semantically equivalent to [correct_answer], allowing the extracted_final_answer to be string variations of [correct_answer]. You should also allow the extracted_final_answer to be more precise or verbose than [correct_answer], as long as its additional details are correct. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers are semantically equivalent.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.


confidence: The extracted confidence score between 0|\%| and 100|\%| from [response]. Put 100 if there is no confidence score available.
""".strip()

24 changes: 22 additions & 2 deletions search_agent/search_r1_client.py
@@ -41,6 +41,25 @@

os.makedirs(args.output_dir, exist_ok=True)

def detect_device():
    if torch.cuda.is_available():
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        print("MPS available (Apple Silicon GPU)")
        return torch.device("mps")
    else:
        print("Using CPU only (no GPU backend available)")
        return torch.device("cpu")

def get_model_dtype_and_device():
    if torch.cuda.is_available():
        return torch.bfloat16, "auto"
    elif torch.backends.mps.is_available():
        return torch.float16, None  # Don't use device_map with MPS
    else:
        return torch.float32, None

if args.query.endswith(".tsv"):
    print(f"Loading questions from TSV file: {args.query}...")
    questions_to_process = pd.read_csv(
@@ -53,16 +72,17 @@
print("Processing 1 question")

model_id = args.model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = detect_device()
curr_eos = [151645, 151643] # for Qwen2.5 series models
curr_search_template = (
    "\n\n{output_text}<information>{search_results}</information>\n\n"
)

print("Loading model and tokenizer...")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
dtype, device_map = get_model_dtype_and_device()
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
    model_id, dtype=dtype, device_map=device_map
)


34 changes: 18 additions & 16 deletions searcher/searchers/__init__.py
@@ -5,25 +5,18 @@
from enum import Enum

from .base import BaseSearcher
from .bm25_searcher import BM25Searcher
from .custom_searcher import CustomSearcher
from .faiss_searcher import FaissSearcher, ReasonIrSearcher


class SearcherType(Enum):
"""Enum for managing available searcher types and their CLI mappings."""

BM25 = ("bm25", BM25Searcher)
FAISS = ("faiss", FaissSearcher)
REASONIR = ("reasonir", ReasonIrSearcher)
CUSTOM = (
"custom",
CustomSearcher,
) # Your custom searcher class, yet to be implemented
BM25 = "bm25"
FAISS = "faiss"
REASONIR = "reasonir"
CUSTOM = "custom"

    def __init__(self, cli_name, searcher_class):
    def __init__(self, cli_name):
        self.cli_name = cli_name
        self.searcher_class = searcher_class

    @classmethod
    def get_choices(cls):
@@ -32,10 +25,19 @@ def get_choices(cls):

    @classmethod
    def get_searcher_class(cls, cli_name):
        """Get searcher class by CLI name."""
        for searcher_type in cls:
            if searcher_type.cli_name == cli_name:
                return searcher_type.searcher_class
"""Get searcher class by CLI name, importing only when needed."""
if cli_name == cls.BM25.cli_name:
from .bm25_searcher import BM25Searcher
return BM25Searcher
elif cli_name == cls.FAISS.cli_name:
from .faiss_searcher import FaissSearcher
return FaissSearcher
elif cli_name == cls.REASONIR.cli_name:
from .faiss_searcher import ReasonIrSearcher
return ReasonIrSearcher
elif cli_name == cls.CUSTOM.cli_name:
from .custom_searcher import CustomSearcher
return CustomSearcher
raise ValueError(f"Unknown searcher type: {cli_name}")


60 changes: 49 additions & 11 deletions searcher/searchers/faiss_searcher.py
@@ -6,6 +6,7 @@
import logging
import os
import pickle
from contextlib import nullcontext
from itertools import chain
from typing import Any, Dict, List, Optional

@@ -83,6 +84,7 @@ def __init__(self, args):
        self.tokenizer = None
        self.lookup = None
        self.docid_to_text = None
        self.device = detect_device()

logger.info("Initializing FAISS searcher...")

@@ -176,11 +178,12 @@ def _load_model(self) -> None:
            normalize=model_args.normalize,
            lora_name_or_path=model_args.lora_name_or_path,
            cache_dir=model_args.cache_dir,
            torch_dtype=torch_dtype,
            attn_implementation=model_args.attn_implementation,
            dtype=torch_dtype,
            attn_implementation=get_attn_implementation(self.device),
        )

        self.model = self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(
@@ -260,10 +263,10 @@ def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
return_tensors="pt",
)

device = "cuda" if torch.cuda.is_available() else "cpu"
batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
batch_dict = {k: v.to(self.device) for k, v in batch_dict.items()}


with torch.amp.autocast(device):
with self.get_autocast_ctx():
with torch.no_grad():
q_reps = self.model.encode_query(batch_dict)
q_reps = q_reps.cpu().detach().numpy()
@@ -298,6 +301,13 @@ def get_document(self, docid: str) -> Optional[Dict[str, Any]]:
    def search_type(self) -> str:
        return "FAISS"

    def get_autocast_ctx(self):
        if self.device.type in ["cpu", "cuda"]:
            return torch.amp.autocast(device_type=self.device.type)
        else:
            # MPS: autocast not supported -> use nullcontext
            return nullcontext()


class ReasonIrSearcher(FaissSearcher):
    def _load_model(self) -> None:
@@ -326,10 +336,10 @@ def _load_model(self) -> None:
        self.model = AutoModel.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            torch_dtype=torch_dtype,
            dtype=torch_dtype,
            trust_remote_code=True,
        )
        self.model = self.model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()

        logger.info("Model loaded successfully")
@@ -338,9 +348,7 @@ def search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
        if not all([self.retriever, self.model, self.lookup]):
            raise RuntimeError("Searcher not properly initialized")

        device = "cuda" if torch.cuda.is_available() else "cpu"

        with torch.amp.autocast(device):
        with self.get_autocast_ctx():
            with torch.no_grad():
                q_reps = self.model.encode(
                    [query],
@@ -359,3 +367,33 @@
                )

        return results

def detect_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("MPS available (Apple Silicon GPU)")
    else:
        device = torch.device("cpu")
        print("Using CPU only (no GPU backend available)")
    return device


def has_flash_attn():
    try:
        import flash_attn
        return True
    except ImportError:
        return False

def get_attn_implementation(device):
    # Prefer flash-attention on CUDA when installed; otherwise fall back to
    # PyTorch's built-in SDPA kernels rather than raising.
    if device.type == "cuda":
        return "flash_attention_2" if has_flash_attn() else "sdpa"
    elif device.type == "mps":
        return "sdpa"
    elif device.type == "cpu":
        return "eager"
    raise ValueError("Unknown device {}".format(device))
