Spaces:

mohitFlooid
/

randomSpace

Runtime error

App Files Files Community

complete-dope committed on Jan 31, 2024

Commit

e29386b

1 Parent(s): 8a8d3b0

updated

Browse files

Files changed (3) hide show

main.py +155 -0
prov_data2.jsonl +0 -0
requirements.txt +9 -0

main.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# This repo contains the code for fine-tuning a Mixtral model to find ICD-10 codes. The script runs well on a single GPU; it is now being adapted to run on multiple GPUs, and it still needs to be verified in a multi-GPU environment.
+import warnings
+warnings.filterwarnings("ignore")
+from accelerate import FullyShardedDataParallelPlugin, Accelerator
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
+from datasets import load_dataset
+import torch
+import transformers
+from datetime import datetime
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import prepare_model_for_kbit_training , LoraConfig, get_peft_model
+# FSDP plugin configuration handed to the Accelerator below. Both the model and
+# the optimizer state-dict configs offload to CPU and are not restricted to rank 0.
+fsdp_plugin = FullyShardedDataParallelPlugin(
+    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
+    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
+) # intended to distribute the weights across a multi-GPU environment
+# NOTE(review): `accelerator` is not referenced again in the visible part of
+# this script (the model is never passed through it) -- confirm this is intended.
+accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
+## Loading the dataset
+def Profiler_load_dataset(data_files, field='train'):
+    """Load JSON data through ``datasets.load_dataset``.
+
+    Args:
+        data_files: Path (or paths) forwarded to ``load_dataset`` as ``data_files``.
+        field: Name of the JSON field forwarded to ``load_dataset`` as ``field``.
+
+    Returns:
+        Whatever ``load_dataset('json', ...)`` returns for these arguments.
+    """
+    loader_kwargs = {'data_files': data_files, 'field': field}
+    return load_dataset('json', **loader_kwargs)
+## high RAM used here
+# `load_dataset` without a `split` argument returns a DatasetDict. For a single
+# `data_files` path its only key is "train", regardless of which JSON `field`
+# was read. Select that split so the Trainer receives a plain Dataset: a
+# DatasetDict cannot be indexed by row number, which the training DataLoader needs.
+train_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='train')["train"]
+eval_dataset = Profiler_load_dataset(data_files='/content/prov_data2.jsonl', field='test')["train"]
+### What is the use of the formatting function?
+## It formats each example as a prompt string for the Mixtral model (i.e. a form that is easy to use in an instruction fine-tuning scenario).
+def format_fun(example):
+    """Build the training prompt for a single record.
+
+    Args:
+        example: Mapping with an ``'Input'`` entry and an ``'Output'`` entry.
+
+    Returns:
+        The sentence pairing the input with its output, including the
+        leading and trailing spaces.
+    """
+    return f" The ICD10 code for {example['Input']} is , {example['Output']} "
+# base_model_id = "mistralai/Mixtral-8x7B-v0.1"
+# Try out different models from the Hugging Face hub (the best would have been the ones released by the authors, but those won't be quantised, so they probably won't work well).
+# NOTE(review): no argument parsing is visible in this script, so this stays an
+# empty string as written -- confirm how args.model_id is meant to be supplied.
+base_model_id = '' # this is passed in as arg -> args.model_id
+# Quantisation settings: 4-bit loading with double quantisation and bfloat16 as the compute dtype.
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+# Load the causal-LM checkpoint named by base_model_id with the quantisation config above.
+model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="cuda")
+## The model got loaded and works!
+# Tokenizer for the same checkpoint, configured for left padding and with the
+# EOS/BOS-token options switched on.
+tokenizer = AutoTokenizer.from_pretrained(
+    base_model_id,
+    padding_side="left",
+    add_eos_token=True,
+    add_bos_token=True,
+)
+# Reuse the EOS token as the padding token.
+tokenizer.pad_token = tokenizer.eos_token
+max_length = 50 # length handed to the tokenizer as max_length (truncation and padding) in generate_and_tokenize_prompt
+def generate_and_tokenize_prompt(prompt):
+    """Tokenize one formatted example and attach labels.
+
+    The example is rendered with ``format_fun``, then truncated and padded to
+    ``max_length``. ``labels`` is a copy of ``input_ids``.
+
+    Args:
+        prompt: A record accepted by ``format_fun``.
+
+    Returns:
+        The tokenizer output with an extra ``"labels"`` entry.
+    """
+    text = format_fun(prompt)
+    encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
+    # Labels are a separate copy of the input ids, so later changes to one do
+    # not affect the other.
+    encoded["labels"] = encoded["input_ids"].copy()
+    return encoded
+# Tokenize the training and evaluation data with the same prompt function.
+tokenized_train_dataset, tokenized_val_dataset = (
+    dataset.map(generate_and_tokenize_prompt)
+    for dataset in (train_dataset, eval_dataset)
+)
+# Fine-tuning the model
+# Turn on gradient checkpointing, then let PEFT prepare the quantised model for training.
+model.gradient_checkpointing_enable()
+model = prepare_model_for_kbit_training(model)
+# LoRA adapter settings: rank 32, alpha 64, applied to the modules named below.
+config = LoraConfig(
+    r=32,
+    lora_alpha=64,
+    target_modules=[
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "w1",
+        "w2",
+        "w3",
+        "lm_head",
+    ],
+    bias="none",
+    lora_dropout=0.05,  # Conventional
+    task_type="CAUSAL_LM",
+)
+# Wrap the prepared model with the LoRA configuration.
+model = get_peft_model(model, config)
+if torch.cuda.device_count() > 1: # If more than 1 GPU
+    # Set the model's parallel flags when several GPUs are visible.
+    model.is_parallelizable = True
+    model.model_parallel = True
+# Names used to build the run name and the checkpoint output directory.
+project, base_model_name = "icd-finetune", "mixtral"
+run_name = f"{base_model_name}-{project}"
+output_dir = f"./{run_name}"
+# Trainer wiring together the PEFT model, both tokenized datasets and the training arguments.
+trainer = transformers.Trainer(
+    model=model,
+    train_dataset=tokenized_train_dataset,
+    eval_dataset=tokenized_val_dataset,
+    args=transformers.TrainingArguments(
+        output_dir=output_dir,
+        warmup_steps=1,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=1,
+        gradient_checkpointing=True,
+        max_steps=300,
+        learning_rate=2.5e-5, # Want a small lr for finetuning
+        fp16=True,                   # NOTE(review): bnb_config uses torch.bfloat16 as compute dtype -- confirm fp16 (rather than bf16) is intended
+        optim="paged_adamw_8bit",
+        logging_steps=25,              # Report the loss every 25 steps
+        logging_dir="./logs",        # Directory for storing logs
+        save_strategy="steps",       # Save checkpoints on a step schedule
+        save_steps=25,                # Save a checkpoint every 25 steps
+        evaluation_strategy="steps", # Evaluate on a step schedule
+        eval_steps=25,               # Evaluate every 25 steps
+        do_eval=True,                # Enable evaluation on eval_dataset
+    ),
+    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+)
+model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+trainer.train()
+# Implement RAG on the fine-tuned model
+# Final model prepared
+'''
+1) Make sure the script runs on multiple GPUs!
+2) The dataset is loaded
+3) The LangChain implementation to oversee the prompt generation guide
+4) Also try BERT models rather than directly using the Mixtral model
+5) Once the model is trained, copy the checkpoint folder into a local environment
+'''

prov_data2.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch
+numpy
+git+https://github.com/huggingface/transformers.git
+git+https://github.com/huggingface/peft.git
+git+https://github.com/huggingface/accelerate.git
+datasets
+scipy
+ipywidgets
+matplotlib