# HuggingFace challenge - Debugger notebook
Run this notebook to verify your library versions, check the GPU configuration, and run a quick training job.

In [1]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main information

In [2]:
# Gather every diagnostic line first, then emit the whole report in one call.
report_lines = [
    f"Platform: {platform.platform()}",
    f"CPU cores: {multiprocessing.cpu_count()}",
    f"Python version: {platform.python_version()}",
    f"PyTorch version: {torch.__version__}",
    f"GPU is visible: {torch.cuda.is_available()}",
    f"Transformers version: {transformers.__version__}",
    f"Datasets version: {datasets.__version__}",
    f"soundfile version: {soundfile.__version__}",
]
print("\n".join(report_lines))

Platform: Linux-5.11.0-37-generic-x86_64-with-glibc2.10
CPU cores: 60
Python version: 3.8.8
PyTorch version: 1.10.1+cu102
GPU is visible: True
Transformers version: 4.16.0.dev0
Datasets version: 1.17.1.dev0
soundfile version: 0.10.3


## Check your GPU information (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100S 32GB).
The driver and CUDA versions are shown in the header of the table.

In [3]:
# Shell out to nvidia-smi to list the visible GPUs with their driver/CUDA
# versions, memory usage and utilisation.
!nvidia-smi

Fri Jan 21 17:23:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100S-PCI...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      4MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget instead of hard-coding a token.
# NOTE(review): later cells pass use_auth_token / --push_to_hub and presumably
# rely on the credentials stored by this login — confirm.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quick training run with a dummy model and data
More information: https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [4]:
# Download the CTC speech-recognition example script next to this notebook
# (-O overwrites any existing copy).
# NOTE(review): the URL tracks the moving `master` ref, so the downloaded script
# can change between runs — consider pinning a release tag or commit.
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-21 13:32:51--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30348 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-21 13:32:51 (18.2 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30348/30348]



In [34]:
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="hi" \
	--output_dir="./wav2vec2-large-xls-r-300m-hindi" \
	--overwrite_output_dir \
	--num_train_epochs="100" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="7.5e-5" \
	--warmup_steps="2000" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — \’ … \– \' \’ \– � \' \’ \– \& a-z A-Z \। \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="3" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/21/2022 17:27:56 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7.5e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log

In [None]:
import pandas as pd

# NOTE(review): unexecuted placeholder — builds a frame from one empty record,
# and no later cell visible in this notebook reads the module-level `df`.
# Fill it in or drop the cell.
df = pd.DataFrame([
    {}
])

In [13]:
# NOTE(review): disabled archive/clean-up commands. They target an "odia" output
# directory, while the training cell in this notebook writes to
# "wav2vec2-large-xls-r-300m-hindi" — update the path or remove the cell.
# !zip -r wav2vec2-large-xls-r-300m-odia.zip wav2vec2-large-xls-r-300m-odia/
# !rm wav2vec2-large-xls-r-300m-odia.zip

In [10]:
# Report free/used space per mounted filesystem in human-readable units.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  557G  2.8T  17% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  557G  2.8T  17% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.6G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [4]:
from datasets import load_dataset, load_metric, Audio

# Both splits come from the same gated Hub dataset and language configuration;
# training uses train+validation, evaluation uses test.
cv_repo, cv_config = "mozilla-foundation/common_voice_7_0", "hi"
common_voice_train = load_dataset(cv_repo, cv_config, split="train+validation", use_auth_token=True)
common_voice_test = load_dataset(cv_repo, cv_config, split="test", use_auth_token=True)

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)


In [6]:
# Presumably a rough optimizer-step estimate: rows x epochs / batch size.
# NOTE(review): uses 50 epochs, whereas the training cell passes 100 — confirm.
planned_epochs, batch_size = 50, 32
len(common_voice_train) * planned_epochs / batch_size

7360.9375

In [7]:
# Columns dropped from both splits before the text clean-up below.
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
common_voice_train = common_voice_train.remove_columns(unused_columns)
common_voice_test = common_voice_test.remove_columns(unused_columns)

In [8]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()` and indexing with a list of integer
            positions; the indexed result is passed to `pd.DataFrame`.
        num_examples: How many distinct rows to display.

    Raises:
        AssertionError: If `num_examples` is larger than `len(dataset)`.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are distinct without
    # the retry loop and repeated list-membership scans of a hand-rolled sampler.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [32]:
# Drop the path/audio columns first so only the remaining columns are rendered.
preview_split = common_voice_train.remove_columns(["path", "audio"])
show_random_elements(preview_split, num_examples=10)

Unnamed: 0,sentence
0,रोज़ेटा यूरोपीय अंतरिक्ष एजेंसी ने बनाया था
1,आप थोड़ा धीरे बोल सकते हैं क्या
2,पाक में बाढ़ की स्थिति का फायदा उठा सकते हैं चरमपंथीअमेरिका
3,दिल्ली पुलिस के हत्थे चढ़ा माओवादियों का कारतूस सप्लायर
4,यूपी में बंदर बना ड्राइवर दो गाड़ियों में मारी टक्कर
5,उत्तराखंड सरकारी कार्यालयों में सिंगल यूज प्लास्टिक पर लगेगा प्रतिबंध
6,वही तो
7,उसेन बोल्ट का गेंदबाजी एक्शन देखकर हैरान रह गयाः हरभजन सिंह
8,इटली एक बहुत सुंदर देश है
9,तुम्हारी बंदूक कहाँ है


In [18]:
import re
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\’\–\&a-zA-Z\।]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [19]:
# Run the character clean-up over every example of both splits (train first).
common_voice_train, common_voice_test = [
    split.map(remove_special_characters)
    for split in (common_voice_train, common_voice_test)
]

  0%|          | 0/4711 [00:00<?, ?ex/s]

  0%|          | 0/2095 [00:00<?, ?ex/s]

In [20]:
def replace_hatted_characters(batch):
    """Replace circumflexed vowels in ``batch["sentence"]`` with their plain letters.

    The four substitutions are applied one after another (â, î, ô, û) and the
    updated ``batch`` is returned.
    """
    sentence = batch["sentence"]
    for pattern, plain in (('[â]', 'a'), ('[î]', 'i'), ('[ô]', 'o'), ('[û]', 'u')):
        sentence = re.sub(pattern, plain, sentence)
    batch["sentence"] = sentence
    return batch

In [21]:
# Normalise circumflexed vowels in both splits (train first, then test).
common_voice_train, common_voice_test = [
    split.map(replace_hatted_characters)
    for split in (common_voice_train, common_voice_test)
]

  0%|          | 0/4711 [00:00<?, ?ex/s]

  0%|          | 0/2095 [00:00<?, ?ex/s]

In [22]:
def extract_all_chars(batch):
    """Collect the distinct characters that occur in a batch of sentences.

    The sentences are joined with single spaces; the result carries the list of
    unique characters under "vocab" and the joined text under "all_text", each
    wrapped in a one-element list.
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = [*set(joined_text)]
    return dict(vocab=[unique_chars], all_text=[joined_text])

In [23]:
# batch_size=-1 hands each split to extract_all_chars as a single batch; the
# original columns are dropped so only "vocab" and "all_text" remain.
whole_split_options = dict(batched=True, batch_size=-1, keep_in_memory=True)
vocab_train = common_voice_train.map(
    extract_all_chars, remove_columns=common_voice_train.column_names, **whole_split_options
)
vocab_test = common_voice_test.map(
    extract_all_chars, remove_columns=common_voice_test.column_names, **whole_split_options
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
# Union of the characters seen in either split.
train_chars = set(vocab_train["vocab"][0])
test_chars = set(vocab_test["vocab"][0])
vocab_list = list(train_chars | test_chars)

In [25]:
# Map each character to its position in sorted order.
ordered_chars = sorted(vocab_list)
vocab_dict = dict(zip(ordered_chars, range(len(ordered_chars))))
vocab_dict

{' ': 0,
 '|': 1,
 'ँ': 2,
 'ं': 3,
 'ः': 4,
 'अ': 5,
 'आ': 6,
 'इ': 7,
 'ई': 8,
 'उ': 9,
 'ऊ': 10,
 'ऋ': 11,
 'ए': 12,
 'ऐ': 13,
 'ऑ': 14,
 'ओ': 15,
 'औ': 16,
 'क': 17,
 'ख': 18,
 'ग': 19,
 'घ': 20,
 'च': 21,
 'छ': 22,
 'ज': 23,
 'झ': 24,
 'ञ': 25,
 'ट': 26,
 'ठ': 27,
 'ड': 28,
 'ढ': 29,
 'ण': 30,
 'त': 31,
 'थ': 32,
 'द': 33,
 'ध': 34,
 'न': 35,
 'प': 36,
 'फ': 37,
 'ब': 38,
 'भ': 39,
 'म': 40,
 'य': 41,
 'र': 42,
 'ल': 43,
 'व': 44,
 'श': 45,
 'ष': 46,
 'स': 47,
 'ह': 48,
 '़': 49,
 'ा': 50,
 'ि': 51,
 'ी': 52,
 'ु': 53,
 'ू': 54,
 'ृ': 55,
 'ॅ': 56,
 'े': 57,
 'ै': 58,
 'ॉ': 59,
 'ो': 60,
 'ौ': 61,
 '्': 62,
 'क़': 63,
 'ग़': 64,
 'ज़': 65,
 'ड़': 66,
 'ढ़': 67}

In [35]:
# Download the evaluation script, copy it into the trained model directory and
# list that directory (oldest entries first).
# NOTE(review): the URL tracks the moving `master` ref — consider pinning a
# release tag or commit for reproducibility.
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-hindi
!ls -ltr wav2vec2-large-xls-r-300m-hindi

--2022-01-22 02:39:02--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4419 (4.3K) [text/plain]
Saving to: ‘eval.py’


2022-01-22 02:39:02 (11.4 MB/s) - ‘eval.py’ saved [4419/4419]

total 1232728
drwxr-xr-x 5 ovh ovh       4096 Jan 21 17:29 runs
-rw-r--r-- 1 ovh ovh         23 Jan 21 17:29 added_tokens.json
-rw-r--r-- 1 ovh ovh        771 Jan 21 17:29 vocab.json
-rw-r--r-- 1 ovh ovh        260 Jan 21 17:29 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        309 Jan 21 17:29 special_tokens_map.json
drwxr-xr-x 2 ovh ovh       4096 Jan 21 23:40 checkpoint-13500
drwxr-xr-x 2 ovh ovh       4096 Jan 21 23:53 checkpoint-14000
drwxr-xr-x 2 ovh ovh

In [36]:
!cd wav2vec2-large-xls-r-300m-hindi; python eval.py --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config hi --split test --log_outputs

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/hi/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)
100%|███████████████████████████████████████████| 10/10 [00:05<00:00,  1.99ex/s]
WER: 1.0166666666666666
CER: 1.0327272727272727
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 12905.55ex/s]
