Spaces:

sawadogosalif
/

Transliterate

Running

App Files Files Community

sawadogosalif commited on 20 days ago

Commit

fbe0b46

1 Parent(s): e564c3e

end

Browse files

Files changed (13) hide show

.gitattributes +35 -0
.gitignore +127 -0
Dockerfile +3 -6
Home.py +50 -0
LICENSE +21 -0
README.md +1 -1
assets/css/style.css +169 -0
pages/1_🎧_Transcriptions.py +210 -0
pages/2_📊_Statistiques.py +89 -0
requirements.txt +11 -0
rocket_pipeline/youtuber.py +232 -0
utils/utils_stats.py +132 -0
utils/utils_trad.py +126 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,127 @@

+**/*.json
+**/*.ipynb
+**/*.xlsx
+**/*.sh
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+testing.ipynb
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+*images/

Dockerfile CHANGED Viewed

@@ -2,7 +2,6 @@ FROM python:3.11-slim
 ENV DEBIAN_FRONTEND=noninteractive
 # Combine apt-get update, install, clean, and remove apt lists into a single RUN statement
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
@@ -10,18 +9,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-# Create a non-root user and set up the environment
 RUN useradd -m -u 1000 user
 USER user
 ENV HOME="/home/user"
 ENV PATH="${HOME}/.local/bin:$PATH"
 WORKDIR $HOME/app
-# Clone the repository and install dependencies in one step
-RUN git clone https://github.com/sawadogosalif/TransLiterate.git . \
-    && pip install --no-cache-dir -r requirements.txt \
-    && pip install s3fs
 EXPOSE 7860

 ENV DEBIAN_FRONTEND=noninteractive
 # Combine apt-get update, install, clean, and remove apt lists into a single RUN statement
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 RUN useradd -m -u 1000 user
 USER user
 ENV HOME="/home/user"
 ENV PATH="${HOME}/.local/bin:$PATH"
+# Set the working directory
 WORKDIR $HOME/app
+# Copy the project files into the container
+COPY --chown=user:user . .
 EXPOSE 7860

Home.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import pandas as pd
+import streamlit as st
+st.set_page_config(
+    page_title="MooreFrCollection",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+with open("assets/css/style.css") as f:
+    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+# Landing-page title and instructions (user-facing copy is in French).
+st.title("🚀 Outil de Traduction et Transcription pour MooreFrCollection")
+# The markdown below explains the project goal, the annotation guidelines
+# (non-verbal tags such as #rires, #MUSIQUE, ...) and gives one worked example.
+st.markdown("""
+    ### Bienvenue sur MooreFrCollection
+    Aidez-nous à casser la barrière de la langue et à améliorer l'accès aux ressources en mooré.
+    MooreFrCollection a pour but de collecter des ressources en mooré pour la mise en place de plusieurs IA locaux.
+    Votre participation est essentielle pour enrichir la base de données et faciliter la traduction de la langue mooré.
+    ### Points importants à connaître:
+    Pour la transcription et traduction des fichiers audio, gardez en tête les consignes suivantes:
+    1. **Simplicité d'abord**: Pas besoin de faire une traduction parfaite, restituez simplement le contenu de manière claire et compréhensible.
+    2. **Éléments spéciaux**: N'hésitez pas à mentionner les éléments non-verbaux dans la transcription:
+       - `#rires` - Pour indiquer des rires
+       - `#pleurs` - Pour indiquer des pleurs
+       - `#MUSIQUE` - Pour indiquer de la musique
+       - `#BRUIT` - Pour indiquer des bruits de fond significatifs
+       - `#silence` - Pour indiquer un silence prolongé
+    3. Exemple:
+        - **transcription**: `#rires` Gɛɛla karẽn-biisa naan maana wags-taaba rasem a yiib pʋgẽ
+        - **traduction** : `#rires` Les étudiants en mathématiques feront un examen dans deux jours
+    ### L'Alphabet Mooré
+    Voici l'alphabet mooré attendu :
+""")
+# Letters of the Mooré alphabet, rendered as one comma-separated line.
+alphabet = ["a", "ã", "b", "d", "e", "ẽ", "ɛ", "f", "g", "h", "i", "ĩ", "ɩ", "k", "l", "m", "n", "o", "õ", "p", "r", "s", "t", "u", "ũ", "ʋ", "v", "w", "y", "z"]
+st.write(", ".join(alphabet))

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Gaël Penessot
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -3,7 +3,7 @@ title: Transliterate
 emoji: 👁
 colorFrom: green
 colorTo: yellow
-sdk: docker
 sdk_version: 1.44.1
 app_file: app.py
 pinned: false

 emoji: 👁
 colorFrom: green
 colorTo: yellow
+sdk: streamlit
 sdk_version: 1.44.1
 app_file: app.py
 pinned: false

assets/css/style.css ADDED Viewed

	@@ -0,0 +1,169 @@

+/* Import des polices Google Fonts */
+@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;500;600;700&family=Lato:wght@300;400;700&display=swap');
+/* Styles globaux */
+html, body, [class*="css"] {
+    font-family: 'Lato', sans-serif;
+    font-weight: 400;
+    color: #333333;
+}
+/* Entêtes */
+h1, h2, h3, h4, h5, h6, .stTitle {
+    font-family: 'Poppins', sans-serif !important;
+    font-weight: 600 !important;
+    color: #1E1E1E !important;
+    letter-spacing: -0.01em;
+}
+/* Titre principal spécifique */
+h1, .stTitle > h1 {
+    font-weight: 700 !important;
+    font-size: 2.2rem !important;
+    margin-bottom: 0.5rem;
+}
+h2 {
+    font-size: 1.8rem !important;
+    margin-top: 1.5rem !important;
+}
+h3 {
+    font-size: 1.5rem !important;
+}
+/* Texte normal */
+p, span, li, div:not(.stTitle):not(.stAlert) {
+    font-family: 'Lato', sans-serif !important;
+    font-size: 1rem;
+    line-height: 1.6;
+}
+/* Buttons and widgets (class names are case-sensitive: stMultiSelect) */
+button, .stButton>button, .stSelectbox, .stMultiSelect, .stSlider {
+    font-family: 'Lato', sans-serif !important;
+}
+/* Métriques */
+.css-1wivap2, [data-testid="stMetricValue"] {
+    font-family: 'Poppins', sans-serif !important;
+    font-weight: 600 !important;
+    font-size: 1.5rem !important;
+    background-color: rgba(79, 139, 249, 0.1);
+    border-radius: 10px;
+    padding: 10px !important;
+    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
+}
+/* Label des métriques */
+[data-testid="stMetricLabel"] {
+    font-family: 'Lato', sans-serif !important;
+    font-weight: 700 !important;
+    font-size: 0.9rem !important;
+}
+/* Sidebar */
+.sidebar .sidebar-content {
+    font-family: 'Lato', sans-serif !important;
+}
+/* En-têtes de la sidebar */
+.sidebar .sidebar-content h1,
+.sidebar .sidebar-content h2,
+.sidebar .sidebar-content h3 {
+    font-family: 'Poppins', sans-serif !important;
+    font-weight: 600 !important;
+}
+/* Accordéons */
+.streamlit-expanderHeader {
+    font-family: 'Poppins', sans-serif !important;
+    font-weight: 600 !important;
+    color: #4F8BF9 !important;
+}
+/* Badges */
+.stAlert {
+    border-radius: 8px;
+    font-family: 'Lato', sans-serif !important;
+}
+/* Cartes d'information */
+div[data-testid="stDecoration"] {
+    background-image: linear-gradient(90deg, #4F8BF9, #1EAEDB);
+}
+/* Personnalisation des widgets de la sidebar */
+.css-1adrfps {
+    padding-top: 2rem;
+}
+/* Labels des widgets */
+label, .stRadio label, .stCheckbox label {
+    font-family: 'Lato', sans-serif !important;
+    font-weight: 700 !important;
+}
+/* Tableaux - styles améliorés et spécifiques */
+.stDataFrame {
+    border-radius: 8px;
+    overflow: hidden;
+}
+/* Sélecteurs spécifiques pour les tableaux et DataFrames */
+.stDataFrame table,
+div[data-testid="stTable"] table,
+[data-testid="stDataFrame"] table,
+.dataframe {
+    font-family: 'Lato', sans-serif !important;
+}
+/* En-têtes de tableaux */
+.stDataFrame th,
+div[data-testid="stTable"] th,
+[data-testid="stDataFrame"] th,
+.dataframe th,
+thead tr th,
+table thead th,
+table tr th {
+    font-family: 'Poppins', sans-serif !important;
+    font-weight: 600 !important;
+    background-color: #f0f2f6 !important;
+    font-size: 0.9rem !important;
+}
+/* Cellules de données de tableaux */
+.stDataFrame td,
+div[data-testid="stTable"] td,
+[data-testid="stDataFrame"] td,
+.dataframe td,
+table tbody td,
+table tr td {
+    font-family: 'Lato', sans-serif !important;
+    font-size: 0.9rem !important;
+}
+/* Style spécifique pour le contenu des cellules */
+.stDataFrame td div,
+div[data-testid="stTable"] td div,
+[data-testid="stDataFrame"] td div,
+.dataframe td div {
+    font-family: 'Lato', sans-serif !important;
+}
+/* Bloc de code */
+code {
+    font-family: 'Courier New', monospace !important;
+}
+/* Sélecteurs pour les tableaux dans les sections de widgets (multiselect, etc.) */
+.stMultiSelect td, .stMultiSelect th,
+[data-baseweb="table"] td, [data-baseweb="table"] th {
+    font-family: 'Lato', sans-serif !important;
+}
+/* Style spécifique pour les valeurs dans les cellules */
+td [data-testid*="StyledDataFrameDataCell"],
+div[data-testid*="column-header"] {
+    font-family: 'Lato', sans-serif !important;
+}

pages/1_🎧_Transcriptions.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import streamlit as st
+from urllib.parse import unquote
+import os
+import json
+from utils.utils_trad import get_total_audio_duration_by_user, list_audio_files_by_title, get_processed_audio_files_by_user_and_title, get_audio_url, save_annotation
+from dotenv import load_dotenv
+load_dotenv(".env")
+S3_BUCKET = os.getenv("S3_BUCKET")
+S3_PREFIX = os.getenv("S3_PREFIX")
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
+ANNOTATIONS_PREFIX = "annotations"
+import s3fs
+access_key = os.getenv("AWS_ACCESS_KEY_ID")
+secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
+endpoint_url = os.getenv("AWS_ENDPOINT_URL_S3")
+fs = s3fs.S3FileSystem(
+    key=AWS_ACCESS_KEY_ID,
+    secret=AWS_SECRET_ACCESS_KEY,
+    endpoint_url=ENDPOINT_URL)
+if not all([S3_BUCKET, S3_PREFIX, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, ENDPOINT_URL]):
+    st.error("Veuillez configurer correctement les variables d'environnement S3.")
+    st.stop()
+# Fonction pour vérifier les titres complètement traités
+def get_completed_titles():
+    """Renvoie la liste des titres qui n'ont plus d'audios à traiter."""
+    status_file = "title_completion_status.json"
+    if os.path.exists(status_file):
+        with open(status_file, 'r') as f:
+            status = json.load(f)
+        return [title for title, is_completed in status.items() if is_completed]
+    else:
+        return []
+def save_title_completion_status(title, is_completed):
+    """Sauvegarde l'état de traitement d'un titre dans un fichier JSON."""
+    status_file = "title_completion_status.json"
+    with fs.open(status_file, 'r') as f:
+        status = json.load(f)
+    status[title] = is_completed
+    with fs.open(status_file, 'w') as f:
+        json.dump(status, f)
+# Page configuration and header of the transcription page.
+st.set_page_config(page_title="Travaux Audio", layout="wide")
+st.title("🗣️ Travaux Audio - Transcription & Traduction")
+st.markdown("""
+Bienvenue sur la page des **Travaux Audio** du projet **MooreFrCollection**.
+> 📝 Votre mission : écouter les audios mooré, écrire leur **transcription** (en mooré) et leur **traduction** (en français).
+""")
+# Initialise the per-session state on the first run of the script.
+if "user_logged_in" not in st.session_state:
+    st.session_state.user_logged_in = False
+if "current_username" not in st.session_state:
+    st.session_state.current_username = ""
+if "completed_titles" not in st.session_state:
+    st.session_state.completed_titles = set()
+# Name-only login gate: nothing below is rendered until a non-empty name has
+# been submitted (the script stops after drawing the form).
+if not st.session_state.user_logged_in:
+    with st.form("login_form"):
+        input_username = st.text_input("Entrez votre nom ou pseudo pour contribuer :", key="input_username")
+        submit_button = st.form_submit_button("✅ Commencer à contribuer")
+        if submit_button:
+            if not input_username:
+                st.error("Merci d'entrer un nom avant de continuer.")
+            else:
+                st.session_state.user_logged_in = True
+                st.session_state.current_username = input_username
+                st.rerun()
+    st.stop()
+username = st.session_state.current_username
+st.success(f"👤 Connecté en tant que: **{username}**")
+# Formatted below as minutes with one decimal.
+user_duration_minutes = get_total_audio_duration_by_user(username)
+st.info(f"🎯 Vous avez déjà traité environ **{user_duration_minutes:.1f} minutes** d'audio.")
+# Logging out only resets the session flags, then reruns to show the form.
+if st.button("👋 Changer d'utilisateur"):
+    st.session_state.user_logged_in = False
+    st.session_state.current_username = ""
+    st.rerun()
+# Load the available titles (mapping title -> list of audio paths, as used below)
+audio_titles = list_audio_files_by_title()
+if not audio_titles:
+    st.warning("Aucun audio disponible pour l'instant.")
+    st.stop()
+# Titles flagged as completed in the status file
+globally_completed_titles = get_completed_titles()
+# Keep only titles finished neither in this session nor globally
+available_titles = [title for title in audio_titles.keys()
+                   if title not in st.session_state.completed_titles
+                   and title not in globally_completed_titles]
+if not available_titles:
+    st.success("🎉 Félicitations ! Tous les groupes d'audio disponibles sont terminés.")
+    st.stop()
+# Audio title selection; re-select the previous choice when it is still offered
+default_index = 0
+if "selected_title" in st.session_state and st.session_state["selected_title"] in available_titles:
+    default_index = available_titles.index(st.session_state["selected_title"])
+selected_title = st.selectbox(
+    "Choisissez un groupe audio :",
+    available_titles,
+    key="audio_group",
+    index=default_index
+)
+st.session_state["selected_title"] = selected_title
+audio_paths = audio_titles[selected_title]
+# Files already processed by this user for this title (compared by base name)
+processed_files = get_processed_audio_files_by_user_and_title(username, selected_title)
+# Keep only the audios that are not processed yet
+unprocessed_audio_paths = [path for path in audio_paths if os.path.basename(path) not in processed_files]
+if not unprocessed_audio_paths:
+    st.success(f"🎉 Vous avez déjà terminé tous les audios du groupe '{selected_title}'!")
+    st.session_state.completed_titles.add(selected_title)
+    # Check whether every audio of this title has an annotation file.
+    # NOTE(review): os.path.exists looks at the local filesystem; where
+    # save_annotation writes is not visible here — confirm both agree.
+    all_files_processed = True
+    for audio_path in audio_paths:
+        audio_filename = os.path.basename(audio_path)
+        annotation_path = f"{ANNOTATIONS_PREFIX}/{selected_title}/{audio_filename}.json"
+        if not os.path.exists(annotation_path):
+            all_files_processed = False
+            break
+    if all_files_processed:
+        save_title_completion_status(selected_title, True)
+    if st.button("Continuer avec un autre groupe (Terminé)"):
+        st.rerun()
+    st.stop()
+# Initialise the audio index for the selected title (or resume the progress);
+# an existing index is clamped to the last position of the unprocessed list.
+index_key = f"index_{selected_title}"
+if index_key not in st.session_state:
+    st.session_state[index_key] = 0
+else:
+    st.session_state[index_key] = min(st.session_state[index_key], len(unprocessed_audio_paths) - 1)
+current_index = st.session_state[index_key]
+if unprocessed_audio_paths:
+    current_audio = unprocessed_audio_paths[current_index]
+    st.subheader(f"🎧 Audio {current_index + 1} sur {len(unprocessed_audio_paths)} : {current_audio.split('/')[-1]}")
+    st.audio(get_audio_url(current_audio))
+    # One form per audio; the widget keys embed the audio path.
+    with st.form(f"form_{current_audio}"):
+        transcription = st.text_area("Transcription en mooré", key=f"tr_{current_audio}")
+        traduction = st.text_area("Traduction en français", key=f"trad_{current_audio}")
+        submitted = st.form_submit_button("💾 Soumettre")
+        if submitted:
+            save_annotation(
+                audio_path=current_audio,
+                user=username,
+                transcription=transcription,
+                traduction=traduction,
+            )
+            st.success("✅ Contribution enregistrée avec succès !")
+            # NOTE(review): if the processed-files lookup already reflects the
+            # annotation just saved, the unprocessed list shrinks on rerun
+            # while this index grows, which would skip an audio — confirm.
+            st.session_state[index_key] += 1
+            # Check whether all unprocessed audios of this group are now done
+            if st.session_state[index_key] >= len(unprocessed_audio_paths):
+                st.success(f"🎉 Vous avez terminé tous les audios du groupe '{selected_title}'!")
+                st.session_state.completed_titles.add(selected_title)
+                # Check whether every audio of this title now has an annotation
+                # file (same local-path check as in the "already finished" branch)
+                all_files_processed = True
+                for audio_path in audio_paths:
+                    audio_filename = os.path.basename(audio_path)
+                    annotation_path = f"{ANNOTATIONS_PREFIX}/{selected_title}/{audio_filename}.json"
+                    if not os.path.exists(annotation_path):
+                        all_files_processed = False
+                        break
+                if all_files_processed:
+                    save_title_completion_status(selected_title, True)
+            else:
+                st.rerun()
+    # Button to move on after a group has possibly been finished (outside the form)
+    if st.session_state[index_key] >= len(unprocessed_audio_paths) and st.button("Continuer avec un autre groupe"):
+        st.rerun()
+else:
+    # NOTE(review): the script stops earlier when the unprocessed list is
+    # empty, so this branch looks unreachable — confirm.
+    st.info(f"Il ne reste plus d'audios à traiter pour le groupe '{selected_title}'.")
+    if st.button("Choisir un autre groupe"):
+        st.rerun()

pages/2_📊_Statistiques.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import streamlit as st
+import pandas as pd
+import plotly.graph_objects as go
+from utils.utils_stats import (
+    load_all_annotations,
+    calculate_total_duration,
+    calculate_contributor_ranking,
+    create_contributions_histogram,
+    create_contributions_pie_chart,
+    calculate_contributions_over_time,
+    calculate_average_annotation_length
+)
+def display_most_recent_contributions(annotations, n=5):
+    """Affiche les contributions les plus récentes."""
+    if not annotations:
+        st.info("Aucune contribution récente.")
+        return
+    st.subheader(f"⏱️ {n} Contributions les plus récentes (approximatif)")
+    for ann in annotations[-n:]:
+        st.markdown(f"- Utilisateur: **{ann.get('user', 'N/A')}**, Audio: `{(ann.get('audio_path', 'N/A'))}`")
+# Page configuration and header of the statistics page.
+st.set_page_config(page_title="Statistiques des Travaux Audio", layout="wide")
+st.title("📊 Statistiques des Travaux Audio")
+st.markdown("Voici un aperçu des statistiques de contribution pour le projet **MooreFrCollection**.")
+# Load all annotations
+all_annotations = load_all_annotations()
+if all_annotations:
+    # First row: headline metrics
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        total_duration_minutes = calculate_total_duration(all_annotations)
+        st.metric("⏱️ Total d'audios traités", f"{total_duration_minutes:.2f} minutes")
+    with col2:
+        avg_annotation_length = calculate_average_annotation_length(all_annotations)
+        st.metric("📏 Durée moyenne d'une annotation", f"{avg_annotation_length:.2f} minutes")
+    with col3:
+        # Third column intentionally left empty.
+        st.empty()
+    st.markdown("---")
+    # Second row: ranking table and histogram
+    col_ranking, col_histogram = st.columns([1, 2])
+    with col_ranking:
+        st.subheader("🏆 Classement des contributeurs par durée totale")
+        contributor_ranking = calculate_contributor_ranking(all_annotations)
+        if contributor_ranking:
+            # Rows are (contributor, total seconds); converted to minutes for display.
+            ranking_df = pd.DataFrame(contributor_ranking, columns=['Contributeur', 'Durée totale (secondes)'])
+            ranking_df['Durée totale (minutes)'] = ranking_df['Durée totale (secondes)'] / 60.0
+            st.dataframe(ranking_df[['Contributeur', 'Durée totale (minutes)']].set_index('Contributeur'), height=300)
+        else:
+            st.info("Aucune contribution enregistrée pour le moment.")
+    with col_histogram:
+        # Reuses the ranking computed in the previous column.
+        histogram_fig = create_contributions_histogram(contributor_ranking)
+        if histogram_fig:
+            st.plotly_chart(histogram_fig, use_container_width=True)
+    st.markdown("---")
+    # Third row: pie chart and recent contributions
+    col_pie, col_recent = st.columns(2)
+    with col_pie:
+        pie_chart_fig = create_contributions_pie_chart(all_annotations)
+        if pie_chart_fig:
+            st.plotly_chart(pie_chart_fig, use_container_width=True)
+    with col_recent:
+        display_most_recent_contributions(all_annotations)
+    st.markdown("---")
+    # Fourth row: contributions over time
+    st.subheader("📈 Évolution temporelle des contributions")
+    contributions_over_time_df = calculate_contributions_over_time(all_annotations)
+    if contributions_over_time_df is not None and not contributions_over_time_df.empty:
+        fig = go.Figure(data=[go.Scatter(x=contributions_over_time_df['Date'], y=contributions_over_time_df['Nombre de contributions'], mode='lines+markers')])
+        st.plotly_chart(fig, use_container_width=True)
+    elif all_annotations:
+        st.info("Impossible de déterminer l'évolution temporelle des contributions (informations de date manquantes dans les clés S3).")
+    else:
+        # NOTE(review): unreachable — this code already runs inside
+        # `if all_annotations:`, so the `elif` above always matches.
+        st.info("Aucune contribution à afficher pour l'évolution temporelle.")
+else:
+    st.info("Aucune donnée d'annotation disponible pour générer les statistiques.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+numpy>=2.2.3
+pandas>=2.2.3
+plotly>=6.0.0
+pyarrow>=19.0.1
+streamlit>=1.43.1
+datasets
+boto3
+pydub
+python-dotenv
+soundfile
+s3fs

rocket_pipeline/youtuber.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import os
+from loguru import logger
+import boto3
+from tqdm import tqdm
+from pydub import AudioSegment
+from yt_dlp import YoutubeDL
+from dotenv import load_dotenv
+load_dotenv()
+def filter_videos_by_keywords(candidates, keywords):
+    if not candidates:
+        return []
+    filtered_videos = []
+    for candidate in candidates:
+        if not isinstance(candidate, dict):
+            continue
+        title = str(candidate.get("title", "")).lower()
+        description = str(candidate.get("description", "")).lower()
+        if any(keyword.lower() in title or keyword.lower() in description for keyword in keywords):
+            filtered_videos.append(candidate)
+    logger.info(f"Filtrage terminé: {len(filtered_videos)}/{len(candidates)} vidéos correspondent aux mots-clés {keywords}")
+    return filtered_videos
+def get_videos_from_channel(channel_url):
+    logger.info(f"Extraction des vidéos depuis la chaîne: {channel_url}")
+    ydl_opts = {
+        'extract_flat': True,
+        'quiet': True,
+    }
+    with YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(channel_url, download=False)
+        if 'entries' in info:
+            videos = info['entries']
+            videos_urls = [video for video in videos  if not "Shorts" in  video["title"]]
+            videos_urls = sum([videos_url["entries"] for videos_url in videos_urls], [])
+            logger.info(f"Nombre total  de videos trouvées: {len(videos_urls)}")
+            return videos_urls
+        else:
+            logger.warning("Aucune vidéo trouvée sur cette chaîne")
+            return []
+def download_youtube_audios(videos, output_dir):
+    """
+    Télécharge les fichiers audio des vidéos YouTube.
+    Args:
+        videos: Liste des vidéos à télécharger
+        output_dir: Répertoire de sortie (utilise INPUT_DIR par défaut)
+    """
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'wav',
+        }],
+        'quiet': False,
+    }
+    logger.info(f"Début du téléchargement de {len(videos)} vidéos")
+    with YoutubeDL(ydl_opts) as ydl:
+        for video in tqdm(videos, desc="Téléchargement des vidéos"):
+            try:
+                url = f"https://www.youtube.com/watch?v={video['id']}"
+                logger.info(f"Téléchargement de l'audio (WAV) : {video['title']}")
+                ydl.download([url])
+            except Exception as e:
+                logger.error(f"Erreur lors du téléchargement de {video.get('title', video.get('id', 'inconnu'))}: {str(e)}")
+def segment_audio_files(input_dir, output_dir, segment_length):
+    """
+    Découpe les fichiers audio en segments.
+    Args:
+        input_dir: Répertoire des fichiers audio source (utilise INPUT_DIR par défaut)
+        output_dir: Répertoire des segments audio (utilise OUTPUT_DIR par défaut)
+        segment_length: Durée de chaque segment en ms (utilise SEGMENT_LENGTH_MS par défaut)
+    Returns:
+        Nombre total de segments créés
+    """
+    wav_files = [f for f in os.listdir(input_dir) if f.endswith(".wav")]
+    logger.info(f"Nombre de fichiers WAV à traiter: {len(wav_files)}")
+    total_segments = 0
+    processed_segments = []
+    for filename in tqdm(wav_files, desc="Traitement des fichiers audio"):
+        try:
+            filepath = os.path.join(input_dir, filename)
+            audio = AudioSegment.from_wav(filepath)
+            duration = len(audio)
+            base_name = os.path.splitext(filename)[0]
+            video_folder = os.path.join(output_dir, base_name)
+            os.makedirs(video_folder, exist_ok=True)
+            logger.info(f"Découpage de : {filename} → dossier [{video_folder}]")
+            num_segments = (duration + segment_length - 1) // segment_length
+            segments_created = 0
+            for i in tqdm(range(0, duration, segment_length),
+                          desc=f"Segments de {base_name}",
+                          total=num_segments):
+                segment = audio[i:i + segment_length]
+                segment_name = f"part{i // segment_length + 1}.wav"
+                segment_path = os.path.join(video_folder, segment_name)
+                segment.export(segment_path, format="wav")
+                segments_created += 1
+                processed_segments.append(segment_path)
+            logger.info(f"Fichier {filename}: {segments_created} segments créés")
+            total_segments += segments_created
+        except Exception as e:
+            logger.error(f"Erreur lors du traitement de {filename}: {str(e)}")
+    logger.info(f"Traitement terminé. Total des segments créés: {total_segments}")
+    return total_segments, processed_segments
+def setup_s3_client():
+    access_key = os.getenv("AWS_ACCESS_KEY_ID")
+    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
+    endpoint_url = os.getenv("AWS_ENDPOINT_URL_S3")
+    if not all([access_key, secret_key]):
+        logger.warning("Variables d'environnement AWS manquantes (AWS_ACCESS_KEY_ID ou AWS_SECRET_ACCESS_KEY)")
+        return None
+    client_params = {
+        "aws_access_key_id": access_key,
+        "aws_secret_access_key": secret_key,
+    }
+    if endpoint_url:
+        client_params["endpoint_url"] = endpoint_url
+    try:
+        return boto3.client("s3", **client_params)
+    except Exception as e:
+        logger.error(f"Erreur lors de l'initialisation du client S3: {str(e)}")
+        return None
+def upload_file_to_s3(s3_client, local_path, bucket_name, s3_key):
+    """Upload one local file to S3 and log the outcome.
+
+    Args:
+        s3_client: Client object exposing ``upload_file(path, bucket, key)``.
+        local_path: Path of the file to send.
+        bucket_name: Destination bucket.
+        s3_key: Destination object key.
+
+    Returns:
+        True when the upload completed, False when it raised. The error is
+        logged and never propagated, so callers must check the return value
+        to know whether the file was actually sent.
+    """
+    try:
+        s3_client.upload_file(local_path, bucket_name, s3_key)
+    except Exception as e:
+        logger.error(f"Erreur lors de l'upload de {local_path}: {str(e)}")
+        return False
+    logger.info(f"Uploadé {local_path} vers s3://{bucket_name}/{s3_key}")
+    return True
+def upload_segments_to_s3(segments, bucket_name, prefix, segments_folder):
+    """Upload every segment file to S3 under ``prefix``.
+
+    The object key of each segment is ``prefix`` followed by the segment's
+    path relative to ``segments_folder``, with OS separators turned into "/".
+
+    Args:
+        segments: Paths of the segment files to upload.
+        bucket_name: Destination bucket.
+        prefix: Key prefix prepended to every relative path.
+        segments_folder: Folder the relative paths are computed from.
+
+    Returns:
+        Number of files successfully uploaded; 0 when no S3 client could be
+        created.
+    """
+    s3_client = setup_s3_client()
+    if not s3_client:
+        logger.error("Client S3 non disponible. Upload annulé.")
+        return 0
+    uploaded_count = 0
+    logger.info(f"Début de l'upload des segments vers S3 (bucket: {bucket_name}, préfixe: {prefix})")
+    for segment_path in tqdm(segments, desc="Upload des segments vers S3"):
+        try:
+            relative_path = os.path.relpath(segment_path, start=segments_folder)
+            s3_key = f"{prefix}/{relative_path.replace(os.sep, '/')}"
+            # upload_file_to_s3 swallows its own errors, so only its return
+            # value tells us whether this file should be counted.
+            if upload_file_to_s3(s3_client, segment_path, bucket_name, s3_key):
+                uploaded_count += 1
+        except Exception as e:
+            logger.error(f"Erreur lors de l'upload de {segment_path}: {str(e)}")
+    logger.info(f"Upload terminé. {uploaded_count}/{len(segments)} fichiers envoyés vers S3.")
+    return uploaded_count
+def main():
+    """Run the pipeline end to end using the constants configured below.
+
+    Creates the two working directories, then calls, in order:
+    get_videos_from_channel, filter_videos_by_keywords,
+    download_youtube_audios and segment_audio_files. When USE_S3 is true the
+    resulting segments are passed to upload_segments_to_s3.
+    """
+    # ====================== CHANGE ME - CONFIGURATION ======================
+    # Keywords used to filter the channel's videos
+    FILTER_KEYWORDS = ["sid pa"]
+    CHANNEL_URL = "https://www.youtube.com/@livenewsafrica/"
+    RAW_AUDIO_DIR = "audios_sidpa_wav"
+    SEGMENT_AUDIO_DIR = "audios_segments_wav"
+    # Segment length in milliseconds
+    SEGMENT_LENGTH_MS = 30 * 1000  # 30 seconds by default
+    # S3 configuration
+    BUCKET_NAME = "moore-collection"
+    S3_PREFIX = "audios_wav"
+    USE_S3 = True  # Set to True to enable the S3 upload step
+    # ====================== END CHANGE ME ======================
+    os.makedirs(RAW_AUDIO_DIR, exist_ok=True)
+    os.makedirs(SEGMENT_AUDIO_DIR, exist_ok=True)
+    logger.info("Démarrage du traitement des fichiers audio")
+    videos = get_videos_from_channel(CHANNEL_URL)
+    # Use the configured keywords instead of a second hard-coded copy, so
+    # editing FILTER_KEYWORDS above actually changes the filter.
+    filtered_videos = filter_videos_by_keywords(videos, keywords=FILTER_KEYWORDS)
+    download_youtube_audios(filtered_videos, RAW_AUDIO_DIR)
+    total_segments, processed_segments = segment_audio_files(RAW_AUDIO_DIR, SEGMENT_AUDIO_DIR, SEGMENT_LENGTH_MS)
+    if USE_S3:
+        upload_segments_to_s3(processed_segments, BUCKET_NAME, S3_PREFIX, SEGMENT_AUDIO_DIR)
+    logger.info("Traitement terminé avec succès")
+if __name__ == "__main__":
+    main()

utils/utils_stats.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import boto3
+import json
+import os
+from collections import defaultdict
+from datetime import datetime
+import pandas as pd
+import plotly.express as px
+from dotenv import load_dotenv
+# Load variables from a local ".env" file before they are read with os.getenv below.
+load_dotenv(".env")
+# Bucket used by every S3 call in this module (None when S3_BUCKET is unset).
+S3_BUCKET = os.getenv("S3_BUCKET")
+# Key prefix under which annotation objects are listed.
+ANNOTATIONS_PREFIX = "annotations"
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+# Optional custom S3 endpoint; None when AWS_ENDPOINT_URL_S3 is unset.
+ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
+# Module-level S3 client, created at import time and used by load_all_annotations.
+s3 = boto3.client(
+    "s3",
+    aws_access_key_id=AWS_ACCESS_KEY_ID,
+    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    endpoint_url=ENDPOINT_URL
+)
+def load_all_annotations():
+    """Download and parse every ".json" object listed under ANNOTATIONS_PREFIX.
+
+    Objects that cannot be fetched, decoded as UTF-8 or parsed as JSON are
+    reported with print() and skipped.
+
+    Returns:
+        List of the parsed JSON documents, in listing order.
+    """
+    loaded = []
+    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX)
+    for page in pages:
+        for entry in page.get("Contents", []):
+            key = entry["Key"]
+            if not key.endswith(".json"):
+                continue
+            try:
+                raw = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"].read()
+                loaded.append(json.loads(raw.decode('utf-8')))
+            except Exception as e:
+                print(f"Erreur lors de la lecture de {key}: {e}")
+    return loaded
+def calculate_total_duration(annotations):
+    """Return the sum of every annotation's "duration" divided by 60.
+
+    A missing "duration" key counts as 0. The stored value is treated as
+    seconds, so the result is in minutes.
+    """
+    seconds = 0
+    for record in annotations:
+        seconds += float(record.get("duration", 0))
+    return seconds / 60.0
+def calculate_contributor_ranking(annotations):
+    """Rank contributors by the total "duration" of their annotations.
+
+    Annotations without a truthy "user" are ignored; a missing "duration"
+    counts as 0.
+
+    Returns:
+        List of (user, total_duration) tuples, largest total first.
+    """
+    totals = defaultdict(float)
+    for record in annotations:
+        contributor = record.get("user")
+        seconds = float(record.get("duration", 0))
+        if contributor:
+            totals[contributor] += seconds
+    ranking = list(totals.items())
+    ranking.sort(key=lambda pair: pair[1], reverse=True)
+    return ranking
+def create_contributions_histogram(contributor_ranking):
+    """Build a bar chart of total contribution time per contributor.
+
+    Args:
+        contributor_ranking: Sequence of (user, duration) pairs; each
+            duration is divided by 60 before plotting.
+
+    Returns:
+        The plotly figure, or None when the ranking is empty.
+    """
+    if not contributor_ranking:
+        return None
+    names = []
+    minutes = []
+    for entry in contributor_ranking:
+        names.append(entry[0])
+        minutes.append(entry[1] / 60.0)
+    axis_labels = {'x': 'Contributeur', 'y': 'Durée totale (minutes)'}
+    return px.bar(
+        x=names,
+        y=minutes,
+        labels=axis_labels,
+        title='Durée totale des contributions par utilisateur',
+    )
+def create_contributions_pie_chart(annotations):
+    """Build a donut chart of contribution time for the ten largest contributors.
+
+    Durations are summed per truthy "user" (missing "duration" counts as 0)
+    and divided by 60 before plotting.
+
+    Returns:
+        The plotly figure, or None when no annotation carries a user.
+    """
+    per_user = defaultdict(float)
+    for record in annotations:
+        contributor = record.get("user")
+        seconds = float(record.get("duration", 0))
+        if contributor:
+            per_user[contributor] += seconds
+    if not per_user:
+        return None
+    # Keep only the ten largest totals.
+    top_ten = sorted(per_user.items(), key=lambda pair: pair[1], reverse=True)[:10]
+    chart = px.pie(
+        names=[pair[0] for pair in top_ten],
+        values=[pair[1] / 60.0 for pair in top_ten],
+        title='Répartition des contributions (Top 10 des contributeurs)',
+        hole=0.3,
+    )
+    chart.update_traces(textinfo='percent+label')
+    return chart
+def extract_annotation_date(annotation_key):
+    """Return the first "/"-separated component of the key that parses as YYYY-MM-DD.
+
+    Keys with fewer than three components are not inspected.
+
+    Returns:
+        A datetime.date, or None when no component matches.
+    """
+    segments = annotation_key.split('/')
+    if len(segments) < 3:
+        return None
+    for segment in segments:
+        try:
+            parsed = datetime.strptime(segment, '%Y-%m-%d')
+        except ValueError:
+            continue
+        return parsed.date()
+    return None
+def calculate_contributions_over_time(annotations):
+    """Count annotations per calendar day from their "created_at" field.
+
+    Annotations with no "created_at" are ignored; values that
+    datetime.fromisoformat rejects are reported with print() and skipped.
+
+    Returns:
+        A DataFrame with columns 'Date' and 'Nombre de contributions',
+        sorted by date, or None when no annotation could be dated.
+    """
+    per_day = defaultdict(int)
+    for record in annotations:
+        created_at_str = record.get("created_at")
+        if not created_at_str:
+            continue
+        try:
+            day = datetime.fromisoformat(created_at_str).date()
+        except ValueError:
+            print(f"Erreur lors de la conversion de la date: {created_at_str}")
+            continue
+        per_day[day] += 1
+    if not per_day:
+        return None
+    table = pd.DataFrame(per_day.items(), columns=['Date', 'Nombre de contributions'])
+    return table.sort_values(by='Date')
+def create_contributions_time_series(df_contributions):
+    """Build a line chart of 'Nombre de contributions' against 'Date'.
+
+    Args:
+        df_contributions: Table holding the 'Date' and
+            'Nombre de contributions' columns.
+
+    Returns:
+        The plotly figure.
+    """
+    chart = px.line(
+        df_contributions,
+        x='Date',
+        y='Nombre de contributions',
+        title='Nombre de contributions par jour',
+    )
+    return chart
+def calculate_average_annotation_length(annotations):
+    """Return the mean "duration" per annotation, divided by 60.
+
+    A missing "duration" counts as 0. Returns 0.0 for an empty collection.
+    """
+    count = len(annotations)
+    if count <= 0:
+        return 0.0
+    seconds = sum(float(record.get("duration", 0)) for record in annotations)
+    return seconds / count / 60.0  # converted to minutes

utils/utils_trad.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import boto3
+import json
+import os
+from dotenv import load_dotenv
+import pandas as pd
+from io import BytesIO
+import soundfile as sf
+from datetime import datetime
+from dotenv import load_dotenv
+# Load variables from a local ".env" file before they are read with os.getenv below.
+load_dotenv(".env")
+# Bucket used by every S3 call in this module (None when S3_BUCKET is unset).
+S3_BUCKET = os.getenv("S3_BUCKET")
+# Key prefix under which audio objects are listed (None when S3_PREFIX is unset).
+S3_PREFIX = os.getenv("S3_PREFIX")
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+# Optional custom S3 endpoint; None when AWS_ENDPOINT_URL_S3 is unset.
+ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
+# Key prefix under which annotation JSON objects are written and listed.
+ANNOTATIONS_PREFIX = "annotations"
+# Module-level S3 client, created at import time and shared by the functions below.
+s3 = boto3.client(
+    "s3",
+    aws_access_key_id=AWS_ACCESS_KEY_ID,
+    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    endpoint_url=ENDPOINT_URL
+)
+def list_audio_files_by_title():
+    """Group the ".wav" object keys found under S3_PREFIX by title.
+
+    The title is the second "/"-separated component of the key; keys with
+    fewer than three components are ignored.
+
+    The listing goes through the paginator because a single list_objects_v2
+    call returns at most one page of keys, which would silently drop files
+    once the prefix holds more than that.
+
+    Returns:
+        Dict mapping each title to the list of its keys, in listing order;
+        empty when nothing matches.
+    """
+    grouped = {}
+    paginator = s3.get_paginator("list_objects_v2")
+    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            if not key.endswith(".wav"):
+                continue
+            parts = key.split("/")
+            if len(parts) >= 3:
+                grouped.setdefault(parts[1], []).append(key)
+    return grouped
+def get_audio_url(audio_path):
+    """Return a presigned "get_object" URL for the audio key, valid for 3600 seconds.
+
+    Args:
+        audio_path: Object key inside S3_BUCKET.
+    """
+    request_params = {"Bucket": S3_BUCKET, "Key": audio_path}
+    url = s3.generate_presigned_url(
+        ClientMethod="get_object",
+        Params=request_params,
+        ExpiresIn=3600,
+    )
+    return url
+def get_audio_duration_from_s3(bucket, key):
+    """Return the duration, in seconds, of an audio object stored in S3.
+
+    Args:
+        bucket: Bucket holding the object.
+        key: Object key of the audio file.
+
+    Returns:
+        Duration as frames / samplerate, or 0.0 when the object cannot be
+        fetched or read as audio (the error is reported with print()).
+    """
+    try:
+        audio_bytes = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
+        with BytesIO(audio_bytes) as audio_buffer:
+            # sf.info reads the frame count and sample rate from the file
+            # header, so the samples are not decoded into an array just to
+            # be counted.
+            return sf.info(audio_buffer).duration
+    except Exception as e:
+        print(f"Erreur lors de la lecture de la durée de {key}: {e}")
+        return 0.0
+def save_annotation(audio_path, user, transcription, traduction):
+    """Store one user's annotation of an audio file as a JSON object in S3.
+
+    The object key is
+    "<ANNOTATIONS_PREFIX>/<title>/<audio file name without .wav>__<user>.json",
+    where the title is the second-to-last "/"-separated component of
+    audio_path.
+
+    Args:
+        audio_path: Object key of the annotated audio file.
+        user: Name of the annotating user.
+        transcription: Transcription text.
+        traduction: Translation text.
+    """
+    # The duration is fetched first, as it needs a round trip to S3.
+    duration = get_audio_duration_from_s3(S3_BUCKET, audio_path)
+    stem = os.path.basename(audio_path).replace(".wav", "")
+    title = audio_path.split('/')[-2]
+    annotation_key = f"{ANNOTATIONS_PREFIX}/{title}/{stem}__{user}.json"
+    record = {
+        "audio_path": audio_path,
+        "user": user,
+        "transcription": transcription,
+        "traduction": traduction,
+        "duration": duration,
+        # Naive UTC timestamp in ISO 8601 form.
+        "created_at": datetime.utcnow().isoformat(),
+    }
+    body = json.dumps(record, ensure_ascii=False).encode("utf-8")
+    s3.put_object(Bucket=S3_BUCKET, Key=annotation_key, Body=body, ContentType="application/json")
+def get_total_audio_duration_by_user(username: str) -> float:
+    """Return the total duration, in minutes, of the audio annotated by a user.
+
+    Every ".json" key under ANNOTATIONS_PREFIX that contains
+    "__<username>.json" is downloaded and its truthy "duration" added to the
+    total. Objects that cannot be read or parsed are reported with print()
+    and skipped.
+    """
+    user_marker = f"__{username}.json"
+    total_seconds = 0.0
+    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX)
+    for page in pages:
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            if key.endswith(".json") and user_marker in key:
+                try:
+                    raw = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"].read()
+                    duration = json.loads(raw.decode('utf-8')).get("duration")
+                    if duration:
+                        total_seconds += float(duration)
+                except Exception as e:
+                    print(f"Erreur lors de la lecture de {key}: {e}")
+    return total_seconds / 60.0
+def get_processed_audio_files_by_user_and_title(username: str, title: str) -> set:
+    """Return the ".wav" file names a user has already annotated for a title.
+
+    Lists the keys under "<ANNOTATIONS_PREFIX>/<title>/", keeps those ending
+    in "__<username>.json", and turns each key's last path component back
+    into an audio file name by replacing that suffix with ".wav".
+    """
+    user_suffix = f"__{username}.json"
+    listing_prefix = f"{ANNOTATIONS_PREFIX}/{title}/"
+    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=S3_BUCKET, Prefix=listing_prefix)
+    done = set()
+    for page in pages:
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            if not key.endswith(user_suffix):
+                continue
+            done.add(key.rsplit("/", 1)[-1].replace(user_suffix, ".wav"))
+    return done