# -*- coding: utf-8 -*-
"""Copy of AudioCourse_MusicGenreClassifier_P2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
"""
!pip install --upgrade transformers
!pip install datasets
!pip install gradio
from datasets import load_dataset
gtzan = load_dataset("marsyas/gtzan", "all")
gtzan
# GTZAN does not provide a split in the dataset, so we are creating one
# ourselves
gtzan = gtzan['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
gtzan
gtzan['train'][0]
# genre is represented as an integer, so let’s use the int2str() method of
# the genre feature to map these integers to human-readable names
id2label_fn = gtzan['train'].features['genre'].int2str
id2label_fn(gtzan['train'][0]['genre'])
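# For reference, the full label set can be pulled from the ClassLabel feature;
# GTZAN ships with 10 genres (blues, classical, country, ...).
gtzan['train'].features['genre'].names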
# Let’s now listen to a few more examples by using Gradio to create a simple
# interface with the Blocks API
import gradio as gr
def generate_audio():
    example = gtzan["train"].shuffle()[0]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])

with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

demo.launch(debug=True)
from transformers import AutoFeatureExtractor
model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_id, do_normalize=True, return_attention_mask=True
)
# As we saw above, the sampling rate of the audio samples in the dataset is
# roughly 22 kHz (22,050 Hz). Let's find the sampling rate the model expects.
sampling_rate = feature_extractor.sampling_rate
sampling_rate
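# (Sanity check) Compare with the native sampling rate of a dataset example;
# GTZAN audio is stored at 22,050 Hz, so it will need resampling.
gtzan['train'][0]['audio']['sampling_rate']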
# The model expects 16 kHz audio, so we can use the cast_column() method to
# resample the examples to match the model's requirements.
from datasets import Audio
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))
# Let's verify if the changes were successful.
gtzan['train'][0]
# Works! However, I noticed that the 1-D NumPy array of the audio has changed.
# What exactly does the feature extractor do? Let's look into this by comparing
# the statistics of a raw sample with those of its processed version.
import numpy as np
test_sample = gtzan['train'][0]['audio']
print(f"Mean: {np.mean(test_sample['array']):.3},\n",
f"Variance: {np.var(test_sample['array']):.3}")
inputs = feature_extractor(test_sample["array"],
sampling_rate=test_sample["sampling_rate"])
print(f"inputs keys: {list(inputs.keys())}")
print(
f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)
# The model cannot process audio clips longer than 30 seconds, so we need to
# truncate examples with longer durations.
# Let's define a function that sets max_duration and applies the feature
# extractor, try it on a couple of samples (see the sanity check below), and
# then use the .map() method to apply it to all samples.
max_duration = 30.0
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs
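# Quick sanity check on a couple of examples before mapping over the whole
# dataset; each processed array should contain at most
# sampling_rate * max_duration = 16,000 * 30 = 480,000 values.
sample_inputs = preprocess_function(gtzan["train"][:2])
print([len(x) for x in sample_inputs["input_values"]])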
gtzan_encoded = gtzan.map(
preprocess_function,
remove_columns=["audio", "file"],
batched=True,
batch_size=100,
num_proc=1,
)
gtzan_encoded
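# The encoded dataset should now expose input_values and attention_mask columns
# in place of the raw audio.
gtzan_encoded["train"].column_names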
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")
id2label = {
str(i): id2label_fn(i)
for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}
id2label["7"]
# Begin fine-tuning the model
from transformers import AutoModelForAudioClassification
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
model_id,
num_labels=num_labels,
label2id=label2id,
id2label=id2label,
)
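# (Optional) A quick look at the model size; DistilHuBERT is a distilled version
# of HuBERT, so the count should be a small fraction of HuBERT-base's ~95M
# parameters, plus the newly initialized classification head.
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")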
from huggingface_hub import notebook_login
notebook_login()
!pip install transformers[torch]
!pip install accelerate -U
from transformers import TrainingArguments
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
training_args = TrainingArguments(
f"{model_name}-finetuned-gtzan",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
per_device_eval_batch_size=batch_size,
num_train_epochs=num_train_epochs,
warmup_ratio=0.1,
logging_steps=5,
load_best_model_at_end=True,
metric_for_best_model="accuracy",
fp16=True,
push_to_hub=False,
)
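# fp16=True requires a CUDA GPU (this notebook assumes a Colab GPU runtime);
# a quick check before training.
import torch
print(torch.cuda.is_available())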
!pip install evaluate
import evaluate
import numpy as np
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
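# (Optional) A small smoke test of compute_metrics with dummy logits; both
# argmax predictions match the references, so accuracy should come out as 1.0.
from transformers import EvalPrediction
compute_metrics(EvalPrediction(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),
                               label_ids=np.array([1, 0])))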
# Now we have all the required pieces.
# Let's instantiate the Trainer class and train the model.
from transformers import Trainer
trainer = Trainer(
model,
training_args,
train_dataset=gtzan_encoded["train"],
eval_dataset=gtzan_encoded["test"],
tokenizer=feature_extractor,
compute_metrics=compute_metrics,
)
trainer.train()
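# (Optional) Run a final evaluation on the held-out test split after training.
trainer.evaluate()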
# The huggingface-cli tool ships with the huggingface_hub package.
!pip install -U huggingface_hub
!huggingface-cli login
kwargs = {
"dataset_tags": "marsyas/gtzan",
"dataset": "GTZAN",
"model_name": f"{model_name}-finetuned-gtzan",
"finetuned_from": model_id,
"tasks": "audio-classification",
}
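# These kwargs describe the run for the auto-generated model card; assuming we
# are logged in, they can be passed along when pushing the fine-tuned model to
# the Hub.
trainer.push_to_hub(**kwargs)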