#!/usr/bin/env bash
#
# Launches llavavla/training/train_qwenvla.py through `accelerate launch`
# with the DeepSpeed ZeRO-2 config for the LIBERO task suite named in
# `task_name`.
#
# Usage: run this script directly; it takes no arguments. Set DEBUG=True
# below for a single-process run whose run id is "debug".

set -euo pipefail

# Remember where this script lives *before* changing directory, so that the
# snapshot copy further down still works when the script was invoked through
# a relative path.
case "$0" in
  /*) script_path=$0 ;;
  *) script_path="${PWD}/$0" ;;
esac

# --- Environment -----------------------------------------------------------

# Hugging Face cache root (shared location on the cluster file system).
export HF_HOME=/mnt/petrelfs/share/yejinhui/Models/huggingface_cache

# NCCL network selection: socket interface and InfiniBand adapters.
export NCCL_SOCKET_IFNAME=bond0
export NCCL_IB_HCA=mlx5_2,mlx5_3

# NCCL failure-handling / timeout knobs.
# NOTE(review): which of these the installed PyTorch/NCCL versions actually
# honour (and the unit of NCCL_TIMEOUT) is not visible from this script.
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_TIMEOUT=1000

# Weights & Biases logging is switched off for this run.
export WANDB_MODE=disabled

# Every relative path below is resolved against the project checkout, so a
# failed cd must stop the script instead of running in the wrong directory.
cd /mnt/petrelfs/yujunqiu/code/vla-baseline/llavavla-00hf1 || {
  printf 'error: cannot cd to project directory\n' >&2
  exit 1
}

# --- Run configuration -----------------------------------------------------

run_root_dir=./playground/Checkpoints
task_name=libero_goal
run_id=0903_${task_name}_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm

# "True" selects the single-process debug run; any other value selects the
# full 8-process run. The value is also forwarded verbatim as --is_debug.
DEBUG=False

# Apply the debug override *before* deriving output_dir, so the directory
# that receives the script snapshot is the one that matches --run_id.
if [[ "${DEBUG}" == True ]]; then
  num_processes=1
  run_id=debug
else
  num_processes=8
fi

output_dir="${run_root_dir}/${run_id}"
mkdir -p -- "${output_dir}"

# Keep a copy of this launch script next to the checkpoints. This is a
# convenience only, so a failed copy warns instead of aborting the run.
cp -- "${script_path}" "${output_dir}/" \
  || printf 'warning: could not copy %s into %s\n' "${script_path}" "${output_dir}" >&2

# --- Launch ----------------------------------------------------------------

# Arguments are collected in an array so each value stays a single word.
launch_args=(
  --config_file scripts/run_scripts/deepspeed_zero2.yaml
  --num_processes "${num_processes}"
  llavavla/training/train_qwenvla.py
  --config_yaml ./llavavla/config/lerobot_data/qwenvla_cotrain_libero.yaml
  --datasets.vla_data.per_device_batch_size 16
  --datasets.vla_data.data_mix "${task_name}"
  --framework.action_model.future_action_window_size 7
  --trainer.max_train_steps 100_000
  --trainer.save_interval 10_000
  --run_root_dir "${run_root_dir}"
  --run_id "${run_id}"
  --wandb_project Internvla
  --wandb_entity michaelyu-1101-fudanuniversity
  --is_debug "${DEBUG}"
  --framework.qwenvl.base_vlm /mnt/phwfile/efm_t/zhuyangkun_tmp_need_del/exp/exp_08_09/manip_sys2_qwen25_3b_onevision_molmo_a0all_refsp20/checkpoint-20000
)

accelerate launch "${launch_args[@]}"