#!/bin/bash
#
# Launch script: runs starVLA/training/train_starvla_visual_prompt.py through
# `accelerate launch` with 8 processes and the DeepSpeed ZeRO-2 accelerate
# config, using the QwenOFT framework on the fourier_gr1_unified_1000 data mix.
#
# The accelerate config, training script and YAML config are given as relative
# paths, so they are resolved against the current working directory.
#
# Side effects before launch: creates ${run_root_dir}/${run_id} and copies this
# script into it as a record of the settings used for the run.

# --- NCCL networking -------------------------------------------------------
# Network interface and InfiniBand HCAs NCCL is told to use.
export NCCL_SOCKET_IFNAME=bond0
export NCCL_IB_HCA=mlx5_2,mlx5_3

# --- NCCL error handling ---------------------------------------------------
# Both the NCCL_* and the TORCH_NCCL_* spellings are exported, presumably so
# the setting is honored regardless of which name the installed PyTorch reads.
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
export TORCH_NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1

# NOTE(review): NCCL_TIMEOUT is not obviously a variable that NCCL or PyTorch
# reads; confirm that something in the training code consumes it.
export NCCL_TIMEOUT=3600
export TORCH_DISTRIBUTED_DEBUG=DETAIL

# --- Model -----------------------------------------------------------------
Framework_name=QwenOFT
base_vlm=/gpfs/wangzixuan/visual_prompting/starVLA_robocasa/playground/Pretrained_models/Qwen3-VL-4B-Instruct
# Empty value: an empty string is still passed to --trainer.freeze_modules.
freeze_module_list=''
DIT_TYPE="DiT-B"

# --- Data ------------------------------------------------------------------
data_root_dir=/gpfs/wangzixuan/visual_prompting/starVLA_robocasa/playground/Datasets/nvidia/PhysicalAI-Robotics-GR00T-X-Embodiment-Sim
visual_prompt_dir=/gpfs/wangzixuan/visual_prompting/starVLA_robocasa/playground/Datasets/visual_prompt_robocasa_by_frames
data_mix=fourier_gr1_unified_1000

# --- Output ----------------------------------------------------------------
run_root_dir=/gpfs/wangzixuan/visual_prompting/starVLA_robocasa/playground/Checkpoints
# NOTE(review): the run id says "lr1e_5" while the base learning rate passed
# below is 3e-5 (1e-5 is only the qwen_vl_interface rate) — confirm the naming.
run_id=fourier_gr1_unified_1000_visual_prompt_training_QwenOFT_lr1e_5_no_subtask_two_image

output_dir="${run_root_dir}/${run_id}"

# Without the output directory there is nowhere to keep the script copy, so
# stop here instead of starting an 8-process job.
if ! mkdir -p -- "${output_dir}"; then
  printf 'error: cannot create output directory: %s\n' "${output_dir}" >&2
  exit 1
fi

# Keep a copy of this script next to the run outputs. Best-effort: a failed
# copy (e.g. when $0 is not a file path) must not block training.
if ! cp -- "$0" "${output_dir}/"; then
  printf 'warning: could not copy %s into %s\n' "$0" "${output_dir}" >&2
fi

# Arguments are collected in an array so every value stays a single word and
# the groups can be commented. Order matches the original command line.
launch_args=(
  # accelerate launcher options
  --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml
  --num_processes 8

  # training entry point and its base YAML config
  starVLA/training/train_starvla_visual_prompt.py
  --config_yaml ./examples/Robocasa_tabletop/train_files/starvla_cotrain_robocasa_gr1_visual_prompt.yaml

  # framework / model overrides
  --framework.name "${Framework_name}"
  --framework.qwenvl.base_vlm "${base_vlm}"
  --framework.action_model.action_model_type "${DIT_TYPE}"

  # dataset overrides
  --datasets.vla_data.data_root_dir "${data_root_dir}"
  --datasets.vla_data.visual_prompt_dir "${visual_prompt_dir}"
  --datasets.vla_data.data_mix "${data_mix}"
  --datasets.vla_data.per_device_batch_size 32
  --datasets.vla_data.video_backend decord
  --datasets.vp_data.per_device_batch_size 8

  # trainer overrides
  --trainer.freeze_modules "${freeze_module_list}"
  --trainer.max_train_steps 100000
  --trainer.save_interval 10000
  --trainer.logging_frequency 10
  --trainer.eval_interval 100
  --trainer.learning_rate.base 3e-5
  --trainer.learning_rate.qwen_vl_interface 1e-5
  --trainer.loss_scale.visual_prompt 0.1

  # input-format switches
  --datasets.vla_data.use_subtask false
  --datasets.vla_data.feed_both_images true
  --datasets.vp_data.feed_both_images false

  # run bookkeeping and Weights & Biases destination
  --run_root_dir "${run_root_dir}"
  --run_id "${run_id}"
  --wandb_project robocasa_visual_prompt
  --wandb_entity zwanggk
)

# Last command on purpose: the script's exit status is the launcher's.
accelerate launch "${launch_args[@]}"