Skip to content

Commit d2d5d5a

Browse files
author
Your Name
committed
dd
1 parent bf481ba commit d2d5d5a

File tree

8 files changed

+1115
-891
lines changed

8 files changed

+1115
-891
lines changed

examples/droid_h5/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
11
results/
22
output/
33
eval_runs/
4+
clip_700_output/
5+
eval_runs_2/
6+
*.png
7+
*.pdf
8+
eval_runs*

examples/droid_h5/siglip2_baseline_pipeline.py renamed to examples/droid_h5/clip_baseline_pipeline.py

Lines changed: 73 additions & 132 deletions
Large diffs are not rendered by default.

examples/droid_h5/evaluate_vlm_configs.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
python evaluate_vlm_configs.py \
2424
--trajectories gs://.../success/... gs://.../failure/... \
2525
--eval-root ./eval_runs
26+
27+
Example SGLang server launch (separate command, not part of the invocation above): CUDA_VISIBLE_DEVICES=4,5,6,7 SGLANG_VLM_CACHE_SIZE_MB=1024 python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-32B-Instruct --host 0.0.0.0 --port 30000 --tp 4 --mem-fraction-static 0.6 --chunked-prefill-size 4096
2628
"""
2729

2830
import argparse
@@ -118,23 +120,36 @@ def main():
118120
parser.add_argument("--balance", type=float, help="Success ratio target in sampling, e.g., 0.5")
119121
parser.add_argument("--seed", type=int, help="Random seed")
120122
parser.add_argument("--max-workers", type=int, default=4, help="Parallel workers for VLM")
121-
parser.add_argument("--eval-root", default="./eval_runs", help="Root folder for evaluation outputs")
123+
parser.add_argument("--eval-root", default="./eval_runs_2", help="Root folder for evaluation outputs")
122124
parser.add_argument("--num-trials", type=int, default=1, help="Number of trials per configuration")
123125

124-
parser.add_argument("--frame-counts", type=int, nargs='+', default=[2, 4, 6, 8, 10],
126+
parser.add_argument("--frame-counts", type=int, nargs='+', default=[2, 4, 8, 16, 32],
125127
help="Frame counts to evaluate")
126-
parser.add_argument("--passing-methods", nargs='+', default=["stream", "concat"],
128+
parser.add_argument("--passing-methods", nargs='+', default=["stream"],
127129
choices=["stream", "concat"], help="Passing methods to evaluate")
128-
parser.add_argument("--video-path-keys", nargs='*', default=None,
129-
help="Video path keys from metadata (e.g., ext1_mp4_path wrist_mp4_path). If omitted, auto-detect.")
130+
parser.add_argument("--video-path-keys", nargs='*', default=["ext1_mp4_path"],
131+
help="Video path keys from metadata (e.g., ext1_mp4_path wrist_mp4_path all). 'all' concatenates ext1_mp4_path and wrist_mp4_path. If omitted, auto-detect.")
130132

131133
parser.add_argument("--language-key", default="metadata/language_instruction",
132134
help="Language key to extract from HDF5 fallback")
133135
parser.add_argument("--question", default="Is this trajectory successful?",
134136
help="VLM question")
137+
parser.add_argument("--use-gpt", action="store_true",
138+
help="Use GPT vision API instead of local VLM")
139+
parser.add_argument("--gpt-api-key",
140+
help="OpenAI API key (or set OPENAI_API_KEY environment variable)")
141+
parser.add_argument("--gpt-model", default="gpt-5-2025-08-07",
142+
# choices=["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"],
143+
help="GPT model to use for vision tasks")
135144

136145
args = parser.parse_args()
137146

147+
# Handle GPT API key
148+
gpt_api_key = args.gpt_api_key or os.environ.get("OPENAI_API_KEY")
149+
if args.use_gpt and not gpt_api_key:
150+
print("❌ GPT API key required when using --use-gpt. Set --gpt-api-key or OPENAI_API_KEY environment variable.")
151+
return 1
152+
138153
# Resolve GCS paths
139154
if args.trajectories:
140155
gcs_paths = list(args.trajectories)
@@ -180,7 +195,11 @@ def main():
180195
configs.append((method, n, None))
181196
else:
182197
for cam_key in args.video_path_keys:
183-
configs.append((method, n, cam_key))
198+
# Handle 'all' option to concatenate ext1_mp4_path and wrist_mp4_path
199+
if cam_key == "all":
200+
configs.append((method, n, "all"))
201+
else:
202+
configs.append((method, n, cam_key))
184203

185204
start_all = time.time()
186205
for (method, n, cam_key) in configs:
@@ -204,7 +223,10 @@ def main():
204223
video_path_key=cam_key,
205224
num_frames=n,
206225
passing_method=method,
207-
concat_grid_cols=None
226+
concat_grid_cols=None,
227+
use_gpt=args.use_gpt,
228+
gpt_api_key=gpt_api_key,
229+
gpt_model=args.gpt_model
208230
)
209231

210232
# Persist raw results per trial

0 commit comments

Comments
 (0)