|
|
|
|
|
""" |
|
|
Complete pipeline script for T5 Parallel Model |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import argparse |
|
|
from src.data_preprocess import main as preprocess_data |
|
|
from src.train import train_model |
|
|
|
|
|
def main(argv=None):
    """Run the T5 Parallel Model pipeline: data preprocessing, then training.

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default) argparse reads the process arguments, so calling
            ``main()`` with no arguments behaves as a normal script entry.

    Command-line flags:
        --preprocess-only: run the preprocessing stage and skip training.
        --train-only: run the training stage and skip preprocessing.
        --sample-size: integer, default 10000 (see the review note below).

    Raises:
        SystemExit: raised by argparse for invalid arguments, including
            passing both ``--preprocess-only`` and ``--train-only``.
    """
    parser = argparse.ArgumentParser(description="T5 Parallel Model Training Pipeline")

    # The two "only" flags contradict each other: given together, both stage
    # guards below would be false and the script would silently do nothing.
    # A mutually exclusive group makes argparse reject that combination.
    stage_flags = parser.add_mutually_exclusive_group()
    stage_flags.add_argument("--preprocess-only", action="store_true", help="Only preprocess data")
    stage_flags.add_argument("--train-only", action="store_true", help="Only train model")

    # NOTE(review): --sample-size is parsed but never forwarded to
    # preprocess_data() or train_model() in this function. Confirm whether
    # those stages are meant to receive it before relying on this flag.
    parser.add_argument("--sample-size", type=int, default=10000, help="Dataset sample size")

    args = parser.parse_args(argv)

    # Create the output directories only after the arguments are accepted,
    # so a rejected command line leaves the filesystem untouched.
    for directory in ("data/processed", "checkpoints", "logs"):
        os.makedirs(directory, exist_ok=True)

    if not args.train_only:
        print("=== Starting Data Preprocessing ===")
        preprocess_data()

    if not args.preprocess_only:
        print("=== Starting Model Training ===")
        train_model()
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()