model = "meta-llama/Llama-3.1-8B-Instruct" seed = 42 quantization = "bnb_4bit" batch_size = 0 max_batch_size = 8 max_response_length = 96 n_trials = 48 n_startup_trials = 4 orthogonalize_direction = true benign_subspace_rank = 8 row_normalization = "pre" direction_variance_floor = 1e-6 kl_divergence_target = 0.10 overrefusal_penalty = 0.32 harmful_marker_penalty = 0.18 compliance_gap_penalty = 0.42 study_checkpoint_dir = "checkpoints_llama3_1_8b_benchmark" [good_prompts] dataset = "mlabonne/harmless_alpaca" split = "train[:240]" column = "text" residual_plot_label = '"Harmless" prompts' residual_plot_color = "royalblue" [bad_prompts] dataset = "JailbreakBench/JBB-Behaviors" name = "behaviors" split = "harmful[:80]" column = "Goal" residual_plot_label = '"Direct harmful" prompts' residual_plot_color = "darkorange" [good_evaluation_prompts] dataset = "mlabonne/harmless_alpaca" split = "test[:64]" column = "text" [bad_evaluation_prompts] dataset = "JailbreakBench/JBB-Behaviors" name = "behaviors" split = "harmful[80:100]" column = "Goal"