# check_training_setup.py
import torch
from transformers import AutoTokenizer

from src.data_preprocess import DataProcessor


def check_training_data_quality():
    print("=== CHECKING TRAINING DATA QUALITY ===")

    # Load the actual training data
    processor = DataProcessor()
    processed_data, tokenizer = processor.load_processed_data("./data/processed")

    if not processed_data:
        print("❌ No processed data found!")
        return

    train_data = processed_data['train']
    print(f"Training samples: {len(train_data)}")

    if len(train_data) == 0:
        print("❌ Empty training data!")
        return

    # Analyze the first few samples
    print("\n=== SAMPLE ANALYSIS ===")
    for i in range(min(5, len(train_data))):
        sample = train_data[i]
        # torch.as_tensor tolerates samples stored either as lists or as tensors
        input_ids = torch.as_tensor(sample['input_ids'])
        labels = torch.as_tensor(sample['labels'])

        input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
        # Drop the -100 sentinel before decoding the target text
        label_text = tokenizer.decode(labels[labels != -100], skip_special_tokens=True)

        print(f"\nSample {i}:")
        print(f"  Input: '{input_text}'")
        print(f"  Target: '{label_text}'")
        print(f"  Input length: {len(input_ids)}")
        print(f"  Label length: {len(labels)}")

        # Check label composition
        unique_labels = torch.unique(labels)
        print(f"  Unique label tokens: {len(unique_labels)}")

        # Count -100 entries (tokens ignored by the loss)
        ignored_count = (labels == -100).sum().item()
        print(f"  Ignored tokens (-100): {ignored_count}/{len(labels)}")

        # Identical inputs and labels would be wrong for a seq2seq objective:
        # the model could learn to simply copy its input
        if torch.equal(input_ids, labels):
            print("  ⚠️ WARNING: Input and labels are identical!")


def check_model_vs_data():
    print("\n=== CHECKING MODEL vs DATA COMPATIBILITY ===")

    tokenizer = AutoTokenizer.from_pretrained("t5-small")

    # Check that the model's vocab size matches the tokenizer. Note this
    # verifies the construction wiring; a loaded checkpoint's embedding size
    # could still differ from tokenizer.vocab_size.
    from src.model import ParallelT5Small
    model = ParallelT5Small(vocab_size=tokenizer.vocab_size)

    print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"Model vocab size: {model.token_embedding.num_embeddings}")
    print(f"LM head vocab size: {model.lm_head.out_features}")

    if tokenizer.vocab_size != model.token_embedding.num_embeddings:
        print("❌ VOCAB SIZE MISMATCH!")
    else:
        print("✅ Vocab sizes match")


if __name__ == "__main__":
    check_training_data_quality()
    check_model_vs_data()
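

# --- Optional extra check (sketch, not part of the original script) ---
# Label ids that fall outside [0, vocab_size) make cross-entropy fail with an
# opaque device-side assert on GPU, so they are worth scanning for up front.
# This assumes the same DataProcessor layout used above; the function name and
# the vocab_size parameter are illustrative. Wire it into the __main__ block
# if useful, e.g.:
#   check_label_token_range(AutoTokenizer.from_pretrained("t5-small").vocab_size)
def check_label_token_range(vocab_size):
    processor = DataProcessor()
    processed_data, _ = processor.load_processed_data("./data/processed")
    if not processed_data:
        return
    for i, sample in enumerate(processed_data['train']):
        labels = torch.as_tensor(sample['labels'])
        # Ignore the -100 loss-masking sentinel, then look for out-of-range ids
        valid = labels[labels != -100]
        bad = valid[(valid < 0) | (valid >= vocab_size)]
        if bad.numel() > 0:
            print(f"❌ Sample {i} has out-of-range label ids: {bad.tolist()[:10]}")
            return
    print("✅ All label ids are within the vocab range")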