# -*- coding: utf-8 -*-
"""Copy of AudioCourse_MusicGenreClassifier_P2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
"""
!pip install --upgrade transformers
!pip install datasets
!pip install gradio
from datasets import load_dataset
gtzan = load_dataset("marsyas/gtzan", "all")
gtzan
# GTZAN does not provide a split in the dataset, so we are creating one
# ourselves
gtzan = gtzan['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
gtzan
gtzan['train'][0]
# genre is represented as an integer, so let’s use the int2str() method of
# the genre feature to map these integers to human-readable names
id2label_fn = gtzan['train'].features['genre'].int2str
id2label_fn(gtzan['train'][0]['genre'])
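# For reference, the full label set can be pulled from the ClassLabel feature;
# GTZAN ships with 10 genres (blues, classical, country, ...).
gtzan['train'].features['genre'].names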
# Let’s now listen to a few more examples by using Gradio to create a simple
# interface with the Blocks API
import gradio as gr
def generate_audio():
    example = gtzan["train"].shuffle()[0]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])

with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

demo.launch(debug=True)
from transformers import AutoFeatureExtractor
model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_id, do_normalize=True, return_attention_mask=True
)
# As we saw above, the sampling rate of the audio samples in the dataset is
# roughly 22 kHz (22,050 Hz). Let's find the sampling rate the model expects.
sampling_rate = feature_extractor.sampling_rate
sampling_rate
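# (Sanity check) Compare with the native sampling rate of a dataset example;
# GTZAN audio is stored at 22,050 Hz, so it will need resampling.
gtzan['train'][0]['audio']['sampling_rate']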
# The model expects 16 kHz audio, so we can use the cast_column() method to
# resample the examples to match the model's requirements.
from datasets import Audio
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))
# Let's verify if the changes were successful.
gtzan['train'][0]
# Works! However, I noticed that the 1-D NumPy array of the audio has changed.
# What exactly does the feature extractor do? Let's look into this by comparing
# the statistics of a raw sample with those of its processed version.
import numpy as np
test_sample = gtzan['train'][0]['audio']
print(f"Mean: {np.mean(test_sample['array']):.3},\n",
f"Variance: {np.var(test_sample['array']):.3}")
inputs = feature_extractor(test_sample["array"],
sampling_rate=test_sample["sampling_rate"])
print(f"inputs keys: {list(inputs.keys())}")
print(
f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)
# The model cannot process audio clips longer than 30 seconds, so we need to
# truncate examples with longer durations.
# Let's define a function that sets max_duration and applies the feature
# extractor, try it on a couple of samples (see the sanity check below), and
# then use the .map() method to apply it to all samples.
max_duration = 30.0
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs
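# Quick sanity check on a couple of examples before mapping over the whole
# dataset; each processed array should contain at most
# sampling_rate * max_duration = 16,000 * 30 = 480,000 values.
sample_inputs = preprocess_function(gtzan["train"][:2])
print([len(x) for x in sample_inputs["input_values"]])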
gtzan_encoded = gtzan.map(
preprocess_function,
remove_columns=["audio", "file"],
batched=True,
batch_size=100,
num_proc=1,
)
gtzan_encoded
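# The encoded dataset should now expose input_values and attention_mask columns
# in place of the raw audio.
gtzan_encoded["train"].column_names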
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")
id2label = {
str(i): id2label_fn(i)
for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}
id2label["7"]
# Begin fine-tuning the model
from transformers import AutoModelForAudioClassification
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
model_id,
num_labels=num_labels,
label2id=label2id,
id2label=id2label,
)
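# (Optional) A quick look at the model size; DistilHuBERT is a distilled version
# of HuBERT, so the count should be a small fraction of HuBERT-base's ~95M
# parameters, plus the newly initialized classification head.
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")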
from huggingface_hub import notebook_login
notebook_login()
!pip install transformers[torch]
!pip install accelerate -U
from transformers import TrainingArguments
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
training_args = TrainingArguments(
f"{model_name}-finetuned-gtzan",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
per_device_eval_batch_size=batch_size,
num_train_epochs=num_train_epochs,
warmup_ratio=0.1,
logging_steps=5,
load_best_model_at_end=True,
metric_for_best_model="accuracy",
fp16=True,
push_to_hub=False,
)
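# fp16=True requires a CUDA GPU (this notebook assumes a Colab GPU runtime);
# a quick check before training.
import torch
print(torch.cuda.is_available())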
!pip install evaluate
import evaluate
import numpy as np
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
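# (Optional) A small smoke test of compute_metrics with dummy logits; both
# argmax predictions match the references, so accuracy should come out as 1.0.
from transformers import EvalPrediction
compute_metrics(EvalPrediction(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),
                               label_ids=np.array([1, 0])))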
# Now we have all the required pieces.
# Let's instantiate the Trainer class and train the model.
from transformers import Trainer
trainer = Trainer(
model,
training_args,
train_dataset=gtzan_encoded["train"],
eval_dataset=gtzan_encoded["test"],
tokenizer=feature_extractor,
compute_metrics=compute_metrics,
)
trainer.train()
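# (Optional) Run a final evaluation on the held-out test split after training.
trainer.evaluate()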
# The huggingface-cli tool ships with the huggingface_hub package.
!pip install -U huggingface_hub
!huggingface-cli login
kwargs = {
"dataset_tags": "marsyas/gtzan",
"dataset": "GTZAN",
"model_name": f"{model_name}-finetuned-gtzan",
"finetuned_from": model_id,
"tasks": "audio-classification",
}
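# These kwargs describe the run for the auto-generated model card; assuming we
# are logged in, they can be passed along when pushing the fine-tuned model to
# the Hub.
trainer.push_to_hub(**kwargs)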