# This script fine-tunes a Mixtral model to predict ICD-10 codes. It runs well on a single GPU; the goal now is to make sure it also runs correctly in a multi-GPU environment.

import warnings
warnings.filterwarnings("ignore")

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from datasets import load_dataset
import torch
import transformers
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training , LoraConfig, get_peft_model

# FSDP settings, intended to distribute the weights across a multi-GPU
# environment. Both the model and the optimizer full-state-dict configs are
# created with offload_to_cpu=True and rank0_only=False.
_model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
_optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=_model_state_cfg,
    optim_state_dict_config=_optim_state_cfg,
)

# Accelerator instance configured with the FSDP plugin built above.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

## Loading the dataset
def Profiler_load_dataset(data_files, field='train'):
    """Load a JSON dataset via ``datasets.load_dataset``.

    Args:
        data_files: Path (or paths) forwarded unchanged as ``data_files``.
        field: Forwarded unchanged as the JSON loader's ``field`` argument.

    Returns:
        Whatever ``load_dataset('json', ...)`` returns for these arguments.
    """
    # NOTE(review): ``field`` is passed to the loader as ``field``, not as
    # ``split`` -- confirm this matches the layout of the data file.
    loader_options = {'data_files': data_files, 'field': field}
    return load_dataset('json', **loader_options)


## NOTE: RAM usage is high while the datasets are loaded.
# Both datasets come from the same file; only the ``field`` value differs
# ('train' for training data, 'test' for evaluation data).
_DATA_FILE = '/content/prov_data2.jsonl'
train_dataset, eval_dataset = (
    Profiler_load_dataset(data_files=_DATA_FILE, field=json_field)
    for json_field in ('train', 'test')
)


### Purpose of the formatting function:
## it renders one record as a single sentence so the data can be used directly
## for instruction-style fine-tuning of the Mixtral model.
def format_fun(example):
    """Build the training text for one dataset record.

    Args:
        example: Mapping that provides the keys ``'Input'`` and ``'Output'``.

    Returns:
        The sentence `` The ICD10 code for <Input> is , <Output> `` with the
        two values substituted in (leading/trailing spaces included).

    Raises:
        KeyError: If ``'Input'`` or ``'Output'`` is missing from ``example``.
    """
    source_text = example['Input']
    target_text = example['Output']
    return " The ICD10 code for {} is , {} ".format(source_text, target_text)

# Base checkpoint to fine-tune. Alternative: "mistralai/Mixtral-8x7B-v0.1".
#
# The checkpoint must be a regular Hugging Face (safetensors / PyTorch) repo:
# the BitsAndBytesConfig used when loading the model quantizes the weights to
# 4-bit at load time, so an unquantized release is exactly what is needed.
# GGUF repos (e.g. 'TheBloke/dolphin-2.5-mixtral-8x7b-GGUF') only ship llama.cpp
# files and cannot be loaded with AutoModelForCausalLM + bitsandbytes.
# NOTE(review): confirm the upstream repo id below before a long run.
base_model_id = 'cognitivecomputations/dolphin-2.5-mixtral-8x7b'  # this is passed in as arg -> args.model_id

# 4-bit quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model onto the GPU that belongs to THIS process.
# A bare "cuda" carries no device index. Pinning the whole model ("" = root
# module) to ``accelerator.local_process_index`` makes the placement explicit:
# under a multi-process launch (``accelerate launch`` / ``torchrun``) each rank
# uses its own GPU, and in a single-process run the index is 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map={"": accelerator.local_process_index},
)


# Tokenizer for the same checkpoint as the model: pads on the left and is asked
# to add both BOS and EOS tokens to every encoded sequence.
tokenizer_options = {
    "padding_side": "left",
    "add_eos_token": True,
    "add_bos_token": True,
}
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **tokenizer_options)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


# Length passed to the tokenizer for every example: longer texts are truncated
# to it and shorter ones are padded up to it.
max_length = 50


def generate_and_tokenize_prompt(prompt):
    """Format one dataset record and tokenize it for causal-LM training.

    Args:
        prompt: A dataset record; it is rendered to text by ``format_fun``.

    Returns:
        The tokenizer output for the formatted text, with an extra ``labels``
        entry that is a copy of ``input_ids``.
    """
    prompt_text = format_fun(prompt)
    encoded = tokenizer(
        prompt_text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
    # The labels start out as a copy of the input ids (a separate object, so
    # later changes to one do not affect the other).
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize the training data first, then the evaluation data, record by record.
tokenized_train_dataset, tokenized_val_dataset = (
    raw_dataset.map(generate_and_tokenize_prompt)
    for raw_dataset in (train_dataset, eval_dataset)
)


# Fine-tuning setup: turn on gradient checkpointing, prepare the quantized
# model for k-bit training, then wrap it with LoRA adapters.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "w1", "w2", "w3",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

model = get_peft_model(model, config)

# When more than one GPU is visible, mark the model as parallelizable and
# model-parallel.
# NOTE(review): presumably these flags are read by the Trainer -- confirm.
if torch.cuda.device_count() > 1:
    for parallel_flag in ("is_parallelizable", "model_parallel"):
        setattr(model, parallel_flag, True)
    

# Run naming: "<base model>-<project>", with the output directory placed under
# the current working directory.
project = "icd-finetune"
base_model_name = "mixtral"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

# Training hyper-parameters. Logging, checkpointing and evaluation all happen
# every 25 steps; training stops after 300 steps.
# NOTE(review): fp16 mixed precision is requested here while the quantization
# config computes in bfloat16 -- confirm this combination is intended.
# NOTE(review): `evaluation_strategy` may have been renamed `eval_strategy` in
# newer transformers releases -- check the installed version.
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    # Optimisation
    max_steps=300,
    warmup_steps=1,
    learning_rate=2.5e-5,  # Want a small lr for finetuning
    optim="paged_adamw_8bit",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    fp16=True,
    # Logging
    logging_steps=25,             # Report the loss every 25 steps
    logging_dir="./logs",         # Directory for storing logs
    # Checkpointing
    save_strategy="steps",        # Save checkpoints on a step schedule
    save_steps=25,                # Save a checkpoint every 25 steps
    # Evaluation
    do_eval=True,                 # Enable evaluation
    evaluation_strategy="steps",  # Evaluate on a step schedule
    eval_steps=25,                # Evaluate every 25 steps
)

# Language-modeling collator with masked-LM disabled (mlm=False).
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=lm_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


# TODO: implement RAG on top of the fine-tuned model.


# Final model preparation checklist
'''
1) Make sure the script runs in a multi-GPU environment.
2) The dataset is loaded.
3) Add the LangChain implementation to oversee/guide prompt generation.
4) Also try BERT models rather than using the Mixtral model directly.
5) Once the model is trained, copy the checkpoint folder and paste it into a local environment.
'''