 python evaluate_vlm_configs.py \
     --trajectories gs://.../success/... gs://.../failure/... \
     --eval-root ./eval_runs
+
+CUDA_VISIBLE_DEVICES=4,5,6,7 SGLANG_VLM_CACHE_SIZE_MB=1024 python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-32B-Instruct --host 0.0.0.0 --port 30000 --tp 4 --mem-fraction-static 0.6 --chunked-prefill-size 4096
 """

 import argparse
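The new docstring line shows how to bring up a local SGLang server for Qwen2.5-VL-32B before running the script. As a quick sanity check (not part of this commit), the server can be probed from Python, assuming the host/port from the command above and the OpenAI-compatible routes that `sglang.launch_server` normally exposes:

```python
# Sketch: verify the locally launched SGLang server is up before starting an evaluation run.
import requests

BASE_URL = "http://localhost:30000"  # host/port from the launch command above

# Health endpoint returns 200 once the model is loaded
health = requests.get(f"{BASE_URL}/health", timeout=5)
print("server healthy:", health.status_code == 200)

# OpenAI-compatible model listing shows which model is being served
models = requests.get(f"{BASE_URL}/v1/models", timeout=5).json()
print("served models:", [m["id"] for m in models.get("data", [])])
```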
@@ -118,23 +120,36 @@ def main():
     parser.add_argument("--balance", type=float, help="Success ratio target in sampling, e.g., 0.5")
     parser.add_argument("--seed", type=int, help="Random seed")
     parser.add_argument("--max-workers", type=int, default=4, help="Parallel workers for VLM")
-    parser.add_argument("--eval-root", default="./eval_runs", help="Root folder for evaluation outputs")
+    parser.add_argument("--eval-root", default="./eval_runs_2", help="Root folder for evaluation outputs")
     parser.add_argument("--num-trials", type=int, default=1, help="Number of trials per configuration")

-    parser.add_argument("--frame-counts", type=int, nargs='+', default=[2, 4, 6, 8, 10],
+    parser.add_argument("--frame-counts", type=int, nargs='+', default=[2, 4, 8, 16, 32],
                         help="Frame counts to evaluate")
-    parser.add_argument("--passing-methods", nargs='+', default=["stream", "concat"],
+    parser.add_argument("--passing-methods", nargs='+', default=["stream"],
                         choices=["stream", "concat"], help="Passing methods to evaluate")
-    parser.add_argument("--video-path-keys", nargs='*', default=None,
-                        help="Video path keys from metadata (e.g., ext1_mp4_path wrist_mp4_path). If omitted, auto-detect.")
+    parser.add_argument("--video-path-keys", nargs='*', default=["ext1_mp4_path"],
+                        help="Video path keys from metadata (e.g., ext1_mp4_path wrist_mp4_path all). 'all' concatenates ext1_mp4_path and wrist_mp4_path. If omitted, auto-detect.")

     parser.add_argument("--language-key", default="metadata/language_instruction",
                         help="Language key to extract from HDF5 fallback")
     parser.add_argument("--question", default="Is this trajectory successful?",
                         help="VLM question")
+    parser.add_argument("--use-gpt", action="store_true",
+                        help="Use GPT vision API instead of local VLM")
+    parser.add_argument("--gpt-api-key",
+                        help="OpenAI API key (or set OPENAI_API_KEY environment variable)")
+    parser.add_argument("--gpt-model", default="gpt-5-2025-08-07",
+                        # choices=["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"],
+                        help="GPT model to use for vision tasks")

     args = parser.parse_args()

+    # Handle GPT API key
+    gpt_api_key = args.gpt_api_key or os.environ.get("OPENAI_API_KEY")
+    if args.use_gpt and not gpt_api_key:
+        print("❌ GPT API key required when using --use-gpt. Set --gpt-api-key or OPENAI_API_KEY environment variable.")
+        return 1
+
     # Resolve GCS paths
     if args.trajectories:
         gcs_paths = list(args.trajectories)
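The new `--use-gpt`, `--gpt-api-key`, and `--gpt-model` flags resolved here are threaded into the evaluation call later in this diff. The evaluator itself is not shown, so the following is only a hedged sketch of how a GPT vision call over sampled frames could look; `ask_gpt_about_frames` is a hypothetical helper name, not the repo's API:

```python
# Hypothetical sketch of the GPT branch (the real evaluator lives outside this diff).
# Frames are passed to the Chat Completions vision API as base64 data URIs.
import base64
from openai import OpenAI

def ask_gpt_about_frames(frame_paths, question, api_key, model):
    client = OpenAI(api_key=api_key)
    content = [{"type": "text", "text": question}]
    for path in frame_paths:
        with open(path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
        })
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
    )
    return resp.choices[0].message.content
```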
@@ -180,7 +195,11 @@ def main():
             configs.append((method, n, None))
         else:
             for cam_key in args.video_path_keys:
-                configs.append((method, n, cam_key))
+                # Handle 'all' option to concatenate ext1_mp4_path and wrist_mp4_path
+                if cam_key == "all":
+                    configs.append((method, n, "all"))
+                else:
+                    configs.append((method, n, cam_key))

     start_all = time.time()
     for (method, n, cam_key) in configs:
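Per the updated help text, the `all` key is meant to concatenate `ext1_mp4_path` and `wrist_mp4_path`. The concatenation itself happens outside this diff; below is an illustrative sketch of one plausible side-by-side stacking, with the helper name assumed for illustration:

```python
# Illustrative sketch of per-frame concatenation for the "all" key (assumed behavior
# based on the help text; the actual implementation is not part of this diff).
import numpy as np

def concat_camera_frames(ext1_frame: np.ndarray, wrist_frame: np.ndarray) -> np.ndarray:
    """Stack an external-camera frame and a wrist-camera frame side by side (H, W1+W2, C)."""
    h = min(ext1_frame.shape[0], wrist_frame.shape[0])
    return np.concatenate([ext1_frame[:h], wrist_frame[:h]], axis=1)
```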
@@ -204,7 +223,10 @@ def main():
             video_path_key=cam_key,
             num_frames=n,
             passing_method=method,
-            concat_grid_cols=None
+            concat_grid_cols=None,
+            use_gpt=args.use_gpt,
+            gpt_api_key=gpt_api_key,
+            gpt_model=args.gpt_model
         )

         # Persist raw results per trial
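For reference, an illustrative invocation exercising the new GPT path with the flags added in this commit (bucket paths elided as in the docstring):

```bash
OPENAI_API_KEY=sk-... python evaluate_vlm_configs.py \
  --trajectories gs://.../success/... gs://.../failure/... \
  --use-gpt --gpt-model gpt-5-2025-08-07 \
  --video-path-keys ext1_mp4_path wrist_mp4_path all \
  --eval-root ./eval_runs_2
```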