# VP-VLA-Robocasa-Tabletop / run_robocasa_visual_prompt_no_subtask_two_image.sh
# Vincent2311's picture
# Add files using upload-large-folder tool
# bc2c774 verified
#!/bin/bash
# RoboCasa visual-prompt training launcher.

# Socket interface and InfiniBand HCAs handed to NCCL.
export NCCL_SOCKET_IFNAME=bond0 \
       NCCL_IB_HCA=mlx5_2,mlx5_3

# Blocking-wait and async-error-handling switches, set under both the NCCL_*
# and the TORCH_NCCL_* spellings. Presumably meant to surface communication
# failures while checkpoints are being saved -- confirm with the author.
export NCCL_BLOCKING_WAIT=1 NCCL_ASYNC_ERROR_HANDLING=1
export TORCH_NCCL_BLOCKING_WAIT=1 TORCH_NCCL_ASYNC_ERROR_HANDLING=1

# Distributed-operation timeout, in seconds (3600 = 1 hour). Saving a large
# model checkpoint with DeepSpeed ZeRO needs more than PyTorch's default of
# 1800 s (30 min). Caveat: NCCL_TIMEOUT may not be honored by every PyTorch
# version.
export NCCL_TIMEOUT=3600

# Extra diagnostic output when a timeout does occur.
export TORCH_DISTRIBUTED_DEBUG=DETAIL
# ---- Model ------------------------------------------------------------------
Framework_name="QwenOFT"
DIT_TYPE="DiT-B"
# Empty value: no module names are listed for freezing.
freeze_module_list=""

# Pretrained weights, datasets and checkpoints all sit under one playground
# directory, so the shared prefix is spelled out only once.
playground_dir="/gpfs/wangzixuan/visual_prompting/starVLA_robocasa/playground"
base_vlm="${playground_dir}/Pretrained_models/Qwen3-VL-4B-Instruct"

# ---- Data -------------------------------------------------------------------
data_root_dir="${playground_dir}/Datasets/nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim"
visual_prompt_dir="${playground_dir}/Datasets/visual_prompt_robocasa_by_frames"
data_mix="fourier_gr1_unified_1000"

# ---- Output -----------------------------------------------------------------
# Each run gets its own directory: <run_root_dir>/<run_id>.
run_root_dir="${playground_dir}/Checkpoints"
run_id="fourier_gr1_unified_1000_visual_prompt_training_QwenOFT_lr1e_5_no_subtask_two_image"
output_dir="${run_root_dir}/${run_id}"
# Create the run's output directory before anything else and stop if that is
# not possible, so an unusable destination is reported before the training job
# is launched. The ${output_dir:?...} form also aborts when the variable is
# unset or empty instead of calling mkdir with no path.
if ! mkdir -p -- "${output_dir:?output_dir must be set}"; then
  printf 'error: cannot create output directory: %s\n' "${output_dir}" >&2
  exit 1
fi

# Keep a copy of this launcher next to the run's outputs. This is best-effort:
# a failed copy is reported on stderr but does not stop the run.
if ! cp -- "$0" "${output_dir}/"; then
  printf 'warning: could not copy %s into %s\n' "$0" "${output_dir}" >&2
fi
# Start training through `accelerate launch`.
#
# The config file, the training script and the YAML below are given as
# relative paths, so the script has to be run from the directory that contains
# starVLA/ and examples/.
#
# Arguments are collected in arrays and every variable expansion is quoted, so
# a value containing whitespace or glob characters is still passed as exactly
# one argument.

# Options consumed by `accelerate launch` itself.
accelerate_opts=(
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml
  --num_processes 8
)

# Arguments forwarded to the training script; the dotted flags override
# entries of the YAML passed via --config_yaml.
train_args=(
  --config_yaml ./examples/Robocasa_tabletop/train_files/starvla_cotrain_robocasa_gr1_visual_prompt.yaml

  # Model
  --framework.name "${Framework_name}"
  --framework.qwenvl.base_vlm "${base_vlm}"
  --framework.action_model.action_model_type "${DIT_TYPE}"

  # Datasets
  --datasets.vla_data.data_root_dir "${data_root_dir}"
  --datasets.vla_data.visual_prompt_dir "${visual_prompt_dir}"
  --datasets.vla_data.data_mix "${data_mix}"
  --datasets.vla_data.per_device_batch_size 32
  --datasets.vla_data.video_backend decord
  --datasets.vp_data.per_device_batch_size 8

  # Trainer. freeze_modules is passed even when the list is empty.
  --trainer.freeze_modules "${freeze_module_list}"
  --trainer.max_train_steps 100000
  --trainer.save_interval 10000
  --trainer.logging_frequency 10
  --trainer.eval_interval 100
  # NOTE(review): run_id contains "lr1e_5"; that matches the
  # qwen_vl_interface rate below, not the 3e-5 base rate -- confirm the
  # naming is intended.
  --trainer.learning_rate.base 3e-5
  --trainer.learning_rate.qwen_vl_interface 1e-5
  --trainer.loss_scale.visual_prompt 0.1

  # Variant selected by this script: no subtask, both images fed for the
  # VLA data only.
  --datasets.vla_data.use_subtask false
  --datasets.vla_data.feed_both_images true
  --datasets.vp_data.feed_both_images false

  # Run identity and experiment tracking
  --run_root_dir "${run_root_dir}"
  --run_id "${run_id}"
  --wandb_project robocasa_visual_prompt
  --wandb_entity zwanggk
  # --is_debug True
)

accelerate launch \
  "${accelerate_opts[@]}" \
  starVLA/training/train_starvla_visual_prompt.py \
  "${train_args[@]}"