from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
import evaluate
import numpy as np
import torch
from collections import Counter
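
# Load the DinerCall intent-classification dataset from the Hugging Face Hub.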
dataset = load_dataset("Luigi/dinercall-intent")
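
# Chinese ALBERT checkpoint released by CKIP Lab.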
model_checkpoint = "ckiplab/albert-base-chinese"
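
# Load the tokenizer that matches the checkpoint.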
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
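

# Tokenize each utterance, padding/truncating to a fixed length of 64 tokens.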
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
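

# Apply tokenization to every split in batches.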
tokenized_datasets = dataset.map(tokenize_function, batched=True)
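
# Number of intent classes, read from the dataset's ClassLabel feature.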
num_labels = tokenized_datasets["train"].features["label"].num_classes
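
# Compute inverse-frequency class weights to compensate for label imbalance.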
label_list = tokenized_datasets["train"]["label"]
label_freq = Counter(label_list)
total = len(label_list)

class_weights = [total / label_freq[i] for i in range(num_labels)]
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("Class weights:", class_weights)

from torch import nn
from transformers import AlbertModel, AlbertPreTrainedModel
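

# Classification head on top of ALBERT with a class-weighted cross-entropy loss.
# Subclassing AlbertPreTrainedModel lets from_pretrained() load the pretrained
# encoder weights while the classifier layer is newly initialised.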
class CustomModel(AlbertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # The weight tensor is registered as a buffer inside CrossEntropyLoss,
        # so it moves to the training device together with the model.
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.albert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # pooler output for the [CLS] token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}
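

# Instantiate the model from the pretrained checkpoint with the right number of labels;
# the classification head is randomly initialised and trained from scratch.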
model = CustomModel.from_pretrained(model_checkpoint, num_labels=num_labels)
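
# Evaluation metrics: accuracy plus macro-averaged precision, recall, and F1.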
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
        "f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"],
    }
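

# Training configuration; with push_to_hub=True, checkpoints are uploaded to the Hub at each save.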
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="Luigi/albert-base-chinese-dinercall-intent",
    hub_private_repo=False,
)
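
# The dataset provides train/test splits; evaluate on the test split each epoch.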
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
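
# Save the fine-tuned model and tokenizer locally.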
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")