# finetune/app.py
import os
import logging

# ✅ Set a writable cache directory inside the container.
# These must be set before transformers/datasets are imported, since the
# libraries resolve their cache paths at import time.
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"

# Ensure cache directory exists
os.makedirs("/app/hf_cache", exist_ok=True)

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
# Set verbose logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Loading dataset...")
ds = load_dataset("facebook/natural_reasoning") # Replace with your dataset
logger.info(f"Dataset loaded successfully! Dataset info:\n{ds}")
# Load tokenizer
logger.info("Loading tokenizer...")
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Causal-LM tokenizers often define no pad token; fall back to EOS so that
# padding="max_length" below does not fail.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
logger.info("Tokenizer loaded successfully!")
# Tokenization function
def preprocess_function(examples):
    input_texts = [
        f"Q: {q} A: {a}"
        for q, a in zip(examples["question"], examples["reference_answer"])
    ]
    return tokenizer(input_texts, truncation=True, padding="max_length", max_length=512)
# Tokenize dataset
logger.info("Tokenizing dataset...")
# Drop the raw text columns so only the tokenized fields remain.
tokenized_datasets = ds.map(
    preprocess_function, batched=True, remove_columns=ds["train"].column_names
)
logger.info("Dataset tokenized successfully!")
# Load model
logger.info("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
logger.info("Model loaded successfully!")
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust based on available RAM
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,
    report_to="none",
    logging_first_step=True,
)
# Trainer
# DataCollatorForLanguageModeling with mlm=False copies input_ids into "labels"
# (masking padding with -100), which the causal-LM Trainer needs to compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Start training
logger.info("Starting training...")
trainer.train()
logger.info("Training completed!")
# Push trained model to Hugging Face Hub
logger.info("Pushing trained model to Hugging Face Hub...")
trainer.push_to_hub()
logger.info("Model push completed! Training process finished successfully.")