# Benchmark run configuration for the Qwen/Qwen2.5-3B base model.
# TOML requires one key/value pair per line and each table header on its own line.

# Model loading and generation settings.
model = "Qwen/Qwen2.5-3B"
seed = 42
quantization = "none"
# NOTE(review): presumably 0 means "pick the batch size automatically", bounded
# by max_batch_size — confirm against the consuming tool.
batch_size = 0
max_batch_size = 8
max_response_length = 96

# Search budget.
n_trials = 48
n_startup_trials = 4

# Direction-extraction settings.
orthogonalize_direction = true
benign_subspace_rank = 8
row_normalization = "pre"
direction_variance_floor = 1e-6

# Objective target and penalty weights.
kl_divergence_target = 0.10
overrefusal_penalty = 0.32
harmful_marker_penalty = 0.18
compliance_gap_penalty = 0.42

study_checkpoint_dir = "checkpoints_qwen2_5_3b_base_benchmark"

# First 240 rows of the "train" split.
[good_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "train[:240]"
column = "text"
residual_plot_label = '"Harmless" prompts'
residual_plot_color = "royalblue"

# Rows 0-79 of the "harmful" split; rows 80-99 are used by
# [bad_evaluation_prompts], so the two slices do not overlap.
[bad_prompts]
dataset = "JailbreakBench/JBB-Behaviors"
name = "behaviors"
split = "harmful[:80]"
column = "Goal"
residual_plot_label = '"Direct harmful" prompts'
residual_plot_color = "darkorange"

# Drawn from the "test" split, whereas [good_prompts] reads from "train".
[good_evaluation_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "test[:64]"
column = "text"

# Rows 80-99 of the "harmful" split, disjoint from the [bad_prompts] slice.
[bad_evaluation_prompts]
dataset = "JailbreakBench/JBB-Behaviors"
name = "behaviors"
split = "harmful[80:100]"
column = "Goal"