ritvik77 committed on
Commit fd265eb · verified · 1 Parent(s): 66fe6a9

Update README.md

Files changed (1)
  1. README.md +110 -7
README.md CHANGED
@@ -5,8 +5,8 @@ model_name: Doctor_AI_LoRA-Mistral-7B-Instructritvik77
  tags:
  - generated_from_trainer
  - trl
- - sft
  licence: license
+ license: apache-2.0
  ---

  # Model Card for Doctor_AI_LoRA-Mistral-7B-Instructritvik77
@@ -17,13 +17,116 @@ It has been trained using [TRL](https://github.com/huggingface/trl).
  ## Quick start

  ```python
- from transformers import pipeline

- question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
- generator = pipeline("text-generation", model="ritvik77/Doctor_AI_LoRA-Mistral-7B-Instructritvik77", device="cuda")
- output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
- print(output["generated_text"])
+ # from peft import PeftModel, PeftConfig
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ # from datasets import load_dataset
+ # import torch
+
+ # # Quantization config for 4-bit loading
+ # bnb_config = BitsAndBytesConfig(
+ #     load_in_4bit=True,
+ #     bnb_4bit_quant_type="nf4",
+ #     bnb_4bit_compute_dtype=torch.bfloat16,
+ #     bnb_4bit_use_double_quant=True,
+ # )
+
+ # # Repo ID for the PEFT model
+ # peft_model_id = f"{username}/{output_dir}"  # e.g., ritvik77/Mixtral-7B-LoRA-Salesforce-Optimized-AI-AgentCall
+ # device = "auto"
+
+ # # Load PEFT config from the Hub
+ # config = PeftConfig.from_pretrained(peft_model_id)
+
+ # # Load the base model (e.g., Mistral-7B) with quantization
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     config.base_model_name_or_path,  # Base model ID stored in PEFT config
+ #     device_map="auto",
+ #     quantization_config=bnb_config,  # Apply 4-bit quantization
+ # )
+
+ # # Load tokenizer from the PEFT model repo
+ # tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
+
+ # # Resize token embeddings to match tokenizer (if needed)
+ # model.resize_token_embeddings(len(tokenizer))
+
+ # # Load PEFT adapters and apply them to the base model
+ # model = PeftModel.from_pretrained(model, peft_model_id)
+
+ # # Convert model to bfloat16 and set to evaluation mode
+ # model.to(torch.bfloat16)
+ # model.eval()
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from peft import PeftModel, PeftConfig
+
+ # ✅ Quantization config for 4-bit loading (memory optimization)
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",               # ✅ Improved precision for LoRA weights
+     bnb_4bit_compute_dtype=torch.bfloat16,
+     bnb_4bit_use_double_quant=True,          # ✅ Reduces VRAM overhead
+ )
+
+ # ✅ Load the tokenizer from the fine-tuned checkpoint (ensures token consistency)
+ peft_model_id = "ritvik77/Doctor_AI_LoRA-Mistral-7B-Instructritvik77"
+ tokenizer = AutoTokenizer.from_pretrained(peft_model_id, trust_remote_code=True)
+
+ # ✅ Ensure `pad_token` is correctly assigned
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ # ✅ Load the base model with quantization for memory efficiency
+ model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     device_map="auto",                       # ✅ Maps layers to the available GPUs
+     quantization_config=bnb_config,          # ✅ Efficient quantization for large models
+     torch_dtype=torch.bfloat16,
+ )
+
+ # ✅ Resize token embeddings BEFORE loading the LoRA adapter (prevents a size mismatch)
+ model.resize_token_embeddings(len(tokenizer))
+
+ # ✅ Load the PEFT adapter (LoRA weights)
+ model = PeftModel.from_pretrained(model, peft_model_id)
+
+ # ✅ Unfreeze the LoRA layers so they stay trainable (only needed for further fine-tuning)
+ for name, param in model.named_parameters():
+     if "lora" in name:
+         param.requires_grad = True
+
+ # ✅ Confirm the LoRA layers are active
+ if hasattr(model, "print_trainable_parameters"):
+     model.print_trainable_parameters()
+ else:
+     print("❗ Warning: LoRA adapter may not have loaded correctly.")
+
+ # ✅ Put the model in evaluation mode for inference
+ model.eval()
+
+ # ✅ Sample inference code
+ def generate_response(prompt, max_new_tokens=300, temperature=0.7):
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+     )
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ # ✅ Sample prompt for medical diagnosis
+ prompt = "Patient reports chest pain and shortness of breath. What might be the diagnosis?"
+ response = generate_response(prompt)
+ print("\n🩺 Diagnosis:", response)
+
+ print("🚀 PEFT model loaded successfully with resized embeddings!")
  ```

  ## Training procedure