Luigi's picture
add train script
e44c5d6
#!/usr/bin/env python
# fine_tune_taiwan_restaurant_intents.py
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
Trainer, TrainingArguments)
import evaluate
import numpy as np
import torch
from collections import Counter
# Load the dataset from Hugging Face Hub
dataset = load_dataset("Luigi/dinercall-intent")
# Choose a pretrained model checkpoint.
model_checkpoint = "ckiplab/albert-base-chinese"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenization function
def tokenize_function(examples):
return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
# Tokenize
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Number of intent labels
num_labels = tokenized_datasets["train"].features["label"].num_classes
# ----------- 🛡️ Compute class weights for imbalanced training set -----------
label_list = tokenized_datasets["train"]["label"]
label_freq = Counter(label_list)
total = len(label_list)
# Inverse frequency as weight
class_weights = [total / label_freq[i] for i in range(num_labels)]
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)
# ----------- 🧠 Load model with weighted loss function -----------
from torch import nn
from transformers import BertPreTrainedModel, BertModel
class CustomModel(AutoModelForSequenceClassification):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
outputs = self.bert(input_ids, attention_mask=attention_mask)
pooled_output = outputs[1] # CLS token
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
loss = self.loss_fn(logits, labels)
return {"loss": loss, "logits": logits}
# Initialize model
model = CustomModel.from_pretrained(model_checkpoint, num_labels=num_labels)
# ----------- 📊 Metrics (precision, recall, F1, accuracy) -----------
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return {
"accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
"precision": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
"recall": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
"f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"]
}
# ----------- ⚙️ Training arguments -----------
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=100,
weight_decay=0.01,
logging_dir='./logs',
logging_steps=10,
save_total_limit=2,
push_to_hub=True,# 👈 This will push your model after training
hub_model_id="Luigi/albert-base-chinese-dinercall-intent", # optional, use if you want a custom name
hub_private_repo=False # optional, if you want a private repo
)
# ----------- 🧪 Trainer -----------
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
# Train!
trainer.train()
# Save final model
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")