#!/usr/bin/env python3
"""Complete pipeline script for the T5 Parallel Model.

Runs data preprocessing followed by model training. Either stage can be run
on its own with ``--preprocess-only`` or ``--train-only`` (the two flags are
mutually exclusive).
"""

import argparse
import inspect
import os
import sys

from src.data_preprocess import main as preprocess_data
from src.train import train_model

# Default for ``--sample-size``; kept in one place so the "was it overridden?"
# check in ``main`` stays in sync with the argparse default.
DEFAULT_SAMPLE_SIZE = 10000

# Directories the pipeline stages expect to exist before they run.
_REQUIRED_DIRS = ("data/processed", "checkpoints", "logs")


def _positive_int(text):
    """Convert *text* to an int and reject values that are not positive.

    A non-numeric *text* raises ``ValueError`` from ``int``, which argparse
    reports as an invalid argument value.
    """
    value = int(text)
    if value <= 0:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value}"
        )
    return value


def _accepts_keyword(func, name):
    """Return True if *func* can be called with a keyword argument *name*."""
    try:
        params = inspect.signature(func).parameters
    except (TypeError, ValueError):
        # Signature is not introspectable; be conservative and do not pass
        # anything the callable may not understand.
        return False
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    param = params.get(name)
    if param is not None:
        return param.kind in keyword_kinds
    return any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )


def main(argv=None):
    """Parse command-line options and run the requested pipeline stages.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="T5 Parallel Model Training Pipeline"
    )
    # Passing both flags used to skip *both* stages silently; make the
    # conflict an explicit usage error instead.
    stage = parser.add_mutually_exclusive_group()
    stage.add_argument(
        "--preprocess-only", action="store_true", help="Only preprocess data"
    )
    stage.add_argument(
        "--train-only", action="store_true", help="Only train model"
    )
    parser.add_argument(
        "--sample-size",
        type=_positive_int,
        default=DEFAULT_SAMPLE_SIZE,
        help="Dataset sample size",
    )
    args = parser.parse_args(argv)

    # Create necessary directories
    for directory in _REQUIRED_DIRS:
        os.makedirs(directory, exist_ok=True)

    if not args.train_only:
        print("=== Starting Data Preprocessing ===")
        if _accepts_keyword(preprocess_data, "sample_size"):
            preprocess_data(sample_size=args.sample_size)
        else:
            # The preprocessing entry point takes no sample_size; tell the
            # user rather than silently dropping a value they asked for.
            if args.sample_size != DEFAULT_SAMPLE_SIZE:
                print(
                    "warning: --sample-size is ignored because the "
                    "preprocessing step does not accept a sample_size "
                    "argument",
                    file=sys.stderr,
                )
            preprocess_data()

    if not args.preprocess_only:
        print("=== Starting Model Training ===")
        train_model()


if __name__ == "__main__":
    main()