print('this is the main file called main.py')

# This repo contains the code for fine-tuning a Mixtral model to find ICD-10 codes.
# The script runs well on a single GPU; the goal now is to make sure it also runs in a
# multi-GPU environment.
import warnings
warnings.filterwarnings("ignore")

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from datasets import load_dataset
import torch
import transformers
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)  # FSDP plugin so that weights and optimizer state are sharded across the multi-GPU environment
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
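# How this is typically launched (a sketch; the process count below is only an illustrative
# assumption for a 2-GPU node). With an Accelerator plus an FSDP plugin, one process per GPU
# is started through the accelerate CLI; plain `python main.py` falls back to a single process:
#
#   accelerate launch --num_processes 2 main.py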
## Loading the dataset
def Profiler_load_dataset(data_files, field='train'):
    return load_dataset('json', data_files=data_files, field=field)

## high RAM is used here
train_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='train')
eval_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='test')
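# Assumed shape of a single training record, inferred from the keys used in format_fun below
# (the field names come from the script; the values are illustrative only):
#   {"Input": "acute nasopharyngitis", "Output": "J00"}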
### What is the use of the formatting function?
## It formats each record into a single prompt string for the Mixtral model,
## which makes it easy to use in an instruction fine-tuning scenario.
def format_fun(example):
    text = f" The ICD10 code for {example['Input']} is , {example['Output']} "
    return text
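# For an illustrative record such as {"Input": "acute nasopharyngitis", "Output": "J00"},
# format_fun returns:
#   " The ICD10 code for acute nasopharyngitis is , J00 "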
# base_model_id = "mistralai/Mixtral-8x7B-v0.1"
# Try out different models from the Hugging Face Hub. The best would have been the one released
# by the authors, but that one is not quantised, so it probably would not work well here!
base_model_id = 'TheBloke/dolphin-2.5-mixtral-8x7b-GGUF'  # this is passed in as an arg -> args.model_id

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="cuda")
## The model got loaded and works!
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

max_length = 50  # maximum number of tokens per tokenized training example
def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        format_fun(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # For causal-LM fine-tuning the labels are simply a copy of the input ids;
    # the model shifts them internally when computing the loss.
    result["labels"] = result["input_ids"].copy()
    return result
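# Quick sanity check (a sketch using the assumed record shape above): every tokenized example
# is padded/truncated to max_length and its labels match its input_ids.
# sample = generate_and_tokenize_prompt({"Input": "acute nasopharyngitis", "Output": "J00"})
# assert len(sample["input_ids"]) == max_length
# assert sample["labels"] == sample["input_ids"]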
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

# Fine-tuning the model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "w1",
        "w2",
        "w3",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # conventional value
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
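# Optional check that LoRA attached and that only a small fraction of parameters is trainable
# (print_trainable_parameters is provided by the PEFT model wrapper).
model.print_trainable_parameters()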
if torch.cuda.device_count() > 1:  # if more than one GPU is available
    model.is_parallelizable = True
    model.model_parallel = True

project = "icd-finetune"
base_model_name = "mixtral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=300,
        learning_rate=2.5e-5,         # want a small lr for fine-tuning
        fp16=True,
        optim="paged_adamw_8bit",
        logging_steps=25,             # report the loss every 25 steps
        logging_dir="./logs",         # directory for storing logs
        save_strategy="steps",        # save a model checkpoint every save_steps
        save_steps=25,                # save checkpoints every 25 steps
        evaluation_strategy="steps",  # evaluate the model every eval_steps
        eval_steps=25,                # evaluate every 25 steps
        do_eval=True,                 # also perform evaluation at the end of training
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
# Implement RAG on the fine-tuned model
# final model prepared
'''
# 1) Make sure the script runs in a multi-GPU environment!
# 2) The dataset is loaded.
# 3) Add the LangChain implementation to oversee the prompt generation.
# 4) Also try BERT models rather than directly using the Mixtral model.
# 5) Once the model is trained, copy the checkpoint folder into a local env (see the sketch below).
'''
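# A minimal sketch for TODO item 5: loading a copied checkpoint folder locally for inference.
# The checkpoint path is an assumption; point it at whichever step folder was actually copied over.
#
# from peft import PeftModel
#
# base = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")
# ft_model = PeftModel.from_pretrained(base, "./mixtral-icd-finetune/checkpoint-300")
# ft_model.eval()
# ft_model.config.use_cache = True  # re-enable the cache for generation
# prompt = " The ICD10 code for acute nasopharyngitis is , "
# inputs = tokenizer(prompt, return_tensors="pt").to(ft_model.device)
# with torch.no_grad():
#     output = ft_model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(output[0], skip_special_tokens=True))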