Spaces:
Running
Running
Commit
·
fbe0b46
1
Parent(s):
e564c3e
end
Browse files- .gitattributes +35 -0
- .gitignore +127 -0
- Dockerfile +3 -6
- Home.py +50 -0
- LICENSE +21 -0
- README.md +1 -1
- assets/css/style.css +169 -0
- pages/1_🎧_Transcriptions.py +210 -0
- pages/2_📊_Statistiques.py +89 -0
- requirements.txt +11 -0
- rocket_pipeline/youtuber.py +232 -0
- utils/utils_stats.py +132 -0
- utils/utils_trad.py +126 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
**/*.json
|
2 |
+
**/*.ipynb
|
3 |
+
**/*.xlsx
|
4 |
+
**/*.sh
|
5 |
+
|
6 |
+
# Byte-compiled / optimized / DLL files
|
7 |
+
__pycache__/
|
8 |
+
*.py[cod]
|
9 |
+
*$py.class
|
10 |
+
|
11 |
+
|
12 |
+
# C extensions
|
13 |
+
*.so
|
14 |
+
|
15 |
+
# Distribution / packaging
|
16 |
+
.Python
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
wheels/
|
29 |
+
share/python-wheels/
|
30 |
+
*.egg-info/
|
31 |
+
.installed.cfg
|
32 |
+
*.egg
|
33 |
+
MANIFEST
|
34 |
+
|
35 |
+
# PyInstaller
|
36 |
+
# Usually these files are written by a python script from a template
|
37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
+
*.manifest
|
39 |
+
*.spec
|
40 |
+
testing.ipynb
|
41 |
+
# Installer logs
|
42 |
+
pip-log.txt
|
43 |
+
pip-delete-this-directory.txt
|
44 |
+
|
45 |
+
# Unit test / coverage reports
|
46 |
+
htmlcov/
|
47 |
+
.tox/
|
48 |
+
.nox/
|
49 |
+
.coverage
|
50 |
+
.coverage.*
|
51 |
+
.cache
|
52 |
+
nosetests.xml
|
53 |
+
coverage.xml
|
54 |
+
*.cover
|
55 |
+
*.py,cover
|
56 |
+
.hypothesis/
|
57 |
+
.pytest_cache/
|
58 |
+
cover/
|
59 |
+
|
60 |
+
# Translations
|
61 |
+
*.mo
|
62 |
+
*.pot
|
63 |
+
|
64 |
+
# Django stuff:
|
65 |
+
*.log
|
66 |
+
local_settings.py
|
67 |
+
db.sqlite3
|
68 |
+
db.sqlite3-journal
|
69 |
+
|
70 |
+
# Flask stuff:
|
71 |
+
instance/
|
72 |
+
.webassets-cache
|
73 |
+
|
74 |
+
# Scrapy stuff:
|
75 |
+
.scrapy
|
76 |
+
|
77 |
+
# Sphinx documentation
|
78 |
+
docs/_build/
|
79 |
+
|
80 |
+
# PyBuilder
|
81 |
+
.pybuilder/
|
82 |
+
target/
|
83 |
+
|
84 |
+
# Jupyter Notebook
|
85 |
+
.ipynb_checkpoints
|
86 |
+
|
87 |
+
# IPython
|
88 |
+
profile_default/
|
89 |
+
ipython_config.py
|
90 |
+
|
91 |
+
|
92 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
93 |
+
__pypackages__/
|
94 |
+
|
95 |
+
# Celery stuff
|
96 |
+
celerybeat-schedule
|
97 |
+
celerybeat.pid
|
98 |
+
|
99 |
+
# SageMath parsed files
|
100 |
+
*.sage.py
|
101 |
+
|
102 |
+
# Environments
|
103 |
+
.env
|
104 |
+
.venv
|
105 |
+
env/
|
106 |
+
venv/
|
107 |
+
ENV/
|
108 |
+
env.bak/
|
109 |
+
venv.bak/
|
110 |
+
|
111 |
+
# Spyder project settings
|
112 |
+
.spyderproject
|
113 |
+
.spyproject
|
114 |
+
|
115 |
+
# Rope project settings
|
116 |
+
.ropeproject
|
117 |
+
|
118 |
+
# mkdocs documentation
|
119 |
+
/site
|
120 |
+
|
121 |
+
# mypy
|
122 |
+
.mypy_cache/
|
123 |
+
.dmypy.json
|
124 |
+
dmypy.json
|
125 |
+
|
126 |
+
*images/
|
127 |
+
|
Dockerfile
CHANGED
@@ -2,7 +2,6 @@ FROM python:3.11-slim
|
|
2 |
|
3 |
ENV DEBIAN_FRONTEND=noninteractive
|
4 |
|
5 |
-
|
6 |
# Combine apt-get update, install, clean, and remove apt lists into a single RUN statement
|
7 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
8 |
git \
|
@@ -10,18 +9,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
10 |
&& apt-get clean \
|
11 |
&& rm -rf /var/lib/apt/lists/*
|
12 |
|
13 |
-
# Create a non-root user and set up the environment
|
14 |
RUN useradd -m -u 1000 user
|
15 |
USER user
|
16 |
ENV HOME="/home/user"
|
17 |
ENV PATH="${HOME}/.local/bin:$PATH"
|
18 |
|
|
|
19 |
WORKDIR $HOME/app
|
20 |
|
21 |
-
#
|
22 |
-
|
23 |
-
&& pip install --no-cache-dir -r requirements.txt \
|
24 |
-
&& pip install s3fs
|
25 |
|
26 |
EXPOSE 7860
|
27 |
|
|
|
2 |
|
3 |
ENV DEBIAN_FRONTEND=noninteractive
|
4 |
|
|
|
5 |
# Combine apt-get update, install, clean, and remove apt lists into a single RUN statement
|
6 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
7 |
git \
|
|
|
9 |
&& apt-get clean \
|
10 |
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
|
|
12 |
RUN useradd -m -u 1000 user
|
13 |
USER user
|
14 |
ENV HOME="/home/user"
|
15 |
ENV PATH="${HOME}/.local/bin:$PATH"
|
16 |
|
17 |
+
# Set the working directory
|
18 |
WORKDIR $HOME/app
|
19 |
|
20 |
+
# Copy the project files into the container
|
21 |
+
COPY --chown=user:user . .
|
|
|
|
|
22 |
|
23 |
EXPOSE 7860
|
24 |
|
Home.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
st.set_page_config(
|
5 |
+
page_title="MooreFrCollection",
|
6 |
+
page_icon="📊",
|
7 |
+
layout="wide",
|
8 |
+
initial_sidebar_state="expanded",
|
9 |
+
)
|
10 |
+
|
11 |
+
with open("assets/css/style.css") as f:
|
12 |
+
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
|
13 |
+
|
14 |
+
st.title("🚀 Outil de Traduction et Transcription pour MooreFrCollection")
|
15 |
+
st.markdown("""
|
16 |
+
### Bienvenue sur MooreFrCollection
|
17 |
+
Aidez-nous à casser la barrière de la langue et à améliorer l'accès aux ressources en mooré.
|
18 |
+
|
19 |
+
|
20 |
+
MooreFrCollection a pour but de collecter des ressources en mooré pour la mise en place de plusieurs IA locaux.
|
21 |
+
|
22 |
+
Votre participation est essentielle pour enrichir la base de données et faciliter la traduction de la langue mooré.
|
23 |
+
|
24 |
+
|
25 |
+
### Points importants à connaître:
|
26 |
+
|
27 |
+
Pour la transcription et traduction des fichiers audio, gardez en tête les consignes suivantes:
|
28 |
+
|
29 |
+
1. **Simplicité d'abord**: Pas besoin de faire une traduction parfaite, restituez simplement le contenu de manière claire et compréhensible.
|
30 |
+
|
31 |
+
2. **Éléments spéciaux**: N'hésitez pas à mentionner les éléments non-verbaux dans la transcription:
|
32 |
+
- `#rires` - Pour indiquer des rires
|
33 |
+
- `#pleurs` - Pour indiquer des pleurs
|
34 |
+
- `#MUSIQUE` - Pour indiquer de la musique
|
35 |
+
- `#BRUIT` - Pour indiquer des bruits de fond significatifs
|
36 |
+
- `#silence` - Pour indiquer un silence prolongé
|
37 |
+
|
38 |
+
3. Exemple:
|
39 |
+
- **transcription**: `#rires` Gɛɛla karẽn-biisa naan maana wags-taaba rasem a yiib pʋgẽ
|
40 |
+
- **traduction** : `#rires` Les étudiants en mathématiques feront un examen dans deux jours
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
### L'Alphabet Mooré
|
45 |
+
Voici l'alphabet mooré attendu :
|
46 |
+
""")
|
47 |
+
|
48 |
+
alphabet = ["a", "ã", "b", "d", "e", "ẽ", "ɛ", "f", "g", "h", "i", "ĩ", "ɩ", "k", "l", "m", "n", "o", "õ", "p", "r", "s", "t", "u", "ũ", "ʋ", "v", "w", "y", "z"]
|
49 |
+
st.write(", ".join(alphabet))
|
50 |
+
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 Gaël Penessot
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -3,7 +3,7 @@ title: Transliterate
|
|
3 |
emoji: 👁
|
4 |
colorFrom: green
|
5 |
colorTo: yellow
|
6 |
-
sdk:
|
7 |
sdk_version: 1.44.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
3 |
emoji: 👁
|
4 |
colorFrom: green
|
5 |
colorTo: yellow
|
6 |
+
sdk: streamlit
|
7 |
sdk_version: 1.44.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
assets/css/style.css
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Import des polices Google Fonts */
|
2 |
+
@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;500;600;700&family=Lato:wght@300;400;700&display=swap');
|
3 |
+
|
4 |
+
/* Styles globaux */
|
5 |
+
html, body, [class*="css"] {
|
6 |
+
font-family: 'Lato', sans-serif;
|
7 |
+
font-weight: 400;
|
8 |
+
color: #333333;
|
9 |
+
}
|
10 |
+
|
11 |
+
/* Entêtes */
|
12 |
+
h1, h2, h3, h4, h5, h6, .stTitle {
|
13 |
+
font-family: 'Poppins', sans-serif !important;
|
14 |
+
font-weight: 600 !important;
|
15 |
+
color: #1E1E1E !important;
|
16 |
+
letter-spacing: -0.01em;
|
17 |
+
}
|
18 |
+
|
19 |
+
/* Titre principal spécifique */
|
20 |
+
h1, .stTitle > h1 {
|
21 |
+
font-weight: 700 !important;
|
22 |
+
font-size: 2.2rem !important;
|
23 |
+
margin-bottom: 0.5rem;
|
24 |
+
}
|
25 |
+
|
26 |
+
h2 {
|
27 |
+
font-size: 1.8rem !important;
|
28 |
+
margin-top: 1.5rem !important;
|
29 |
+
}
|
30 |
+
|
31 |
+
h3 {
|
32 |
+
font-size: 1.5rem !important;
|
33 |
+
}
|
34 |
+
|
35 |
+
/* Texte normal */
|
36 |
+
p, span, li, div:not(.stTitle):not(.stAlert) {
|
37 |
+
font-family: 'Lato', sans-serif !important;
|
38 |
+
font-size: 1rem;
|
39 |
+
line-height: 1.6;
|
40 |
+
}
|
41 |
+
|
42 |
+
/* Boutons et widgets */
|
43 |
+
button, .stButton>button, .stSelectbox, .stMultiselect, .stSlider {
|
44 |
+
font-family: 'Lato', sans-serif !important;
|
45 |
+
}
|
46 |
+
|
47 |
+
/* Métriques */
|
48 |
+
.css-1wivap2, [data-testid="stMetricValue"] {
|
49 |
+
font-family: 'Poppins', sans-serif !important;
|
50 |
+
font-weight: 600 !important;
|
51 |
+
font-size: 1.5rem !important;
|
52 |
+
background-color: rgba(79, 139, 249, 0.1);
|
53 |
+
border-radius: 10px;
|
54 |
+
padding: 10px !important;
|
55 |
+
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
|
56 |
+
}
|
57 |
+
|
58 |
+
/* Label des métriques */
|
59 |
+
[data-testid="stMetricLabel"] {
|
60 |
+
font-family: 'Lato', sans-serif !important;
|
61 |
+
font-weight: 700 !important;
|
62 |
+
font-size: 0.9rem !important;
|
63 |
+
}
|
64 |
+
|
65 |
+
/* Sidebar */
|
66 |
+
.sidebar .sidebar-content {
|
67 |
+
font-family: 'Lato', sans-serif !important;
|
68 |
+
}
|
69 |
+
|
70 |
+
/* En-têtes de la sidebar */
|
71 |
+
.sidebar .sidebar-content h1,
|
72 |
+
.sidebar .sidebar-content h2,
|
73 |
+
.sidebar .sidebar-content h3 {
|
74 |
+
font-family: 'Poppins', sans-serif !important;
|
75 |
+
font-weight: 600 !important;
|
76 |
+
}
|
77 |
+
|
78 |
+
/* Accordéons */
|
79 |
+
.streamlit-expanderHeader {
|
80 |
+
font-family: 'Poppins', sans-serif !important;
|
81 |
+
font-weight: 600 !important;
|
82 |
+
color: #4F8BF9 !important;
|
83 |
+
}
|
84 |
+
|
85 |
+
/* Badges */
|
86 |
+
.stAlert {
|
87 |
+
border-radius: 8px;
|
88 |
+
font-family: 'Lato', sans-serif !important;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Cartes d'information */
|
92 |
+
div[data-testid="stDecoration"] {
|
93 |
+
background-image: linear-gradient(90deg, #4F8BF9, #1EAEDB);
|
94 |
+
}
|
95 |
+
|
96 |
+
/* Personnalisation des widgets de la sidebar */
|
97 |
+
.css-1adrfps {
|
98 |
+
padding-top: 2rem;
|
99 |
+
}
|
100 |
+
|
101 |
+
/* Labels des widgets */
|
102 |
+
label, .stRadio label, .stCheckbox label {
|
103 |
+
font-family: 'Lato', sans-serif !important;
|
104 |
+
font-weight: 700 !important;
|
105 |
+
}
|
106 |
+
|
107 |
+
/* Tableaux - styles améliorés et spécifiques */
|
108 |
+
.stDataFrame {
|
109 |
+
border-radius: 8px;
|
110 |
+
overflow: hidden;
|
111 |
+
}
|
112 |
+
|
113 |
+
/* Sélecteurs spécifiques pour les tableaux et DataFrames */
|
114 |
+
.stDataFrame table,
|
115 |
+
div[data-testid="stTable"] table,
|
116 |
+
[data-testid="stDataFrame"] table,
|
117 |
+
.dataframe {
|
118 |
+
font-family: 'Lato', sans-serif !important;
|
119 |
+
}
|
120 |
+
|
121 |
+
/* En-têtes de tableaux */
|
122 |
+
.stDataFrame th,
|
123 |
+
div[data-testid="stTable"] th,
|
124 |
+
[data-testid="stDataFrame"] th,
|
125 |
+
.dataframe th,
|
126 |
+
thead tr th,
|
127 |
+
table thead th,
|
128 |
+
table tr th {
|
129 |
+
font-family: 'Poppins', sans-serif !important;
|
130 |
+
font-weight: 600 !important;
|
131 |
+
background-color: #f0f2f6 !important;
|
132 |
+
font-size: 0.9rem !important;
|
133 |
+
}
|
134 |
+
|
135 |
+
/* Cellules de données de tableaux */
|
136 |
+
.stDataFrame td,
|
137 |
+
div[data-testid="stTable"] td,
|
138 |
+
[data-testid="stDataFrame"] td,
|
139 |
+
.dataframe td,
|
140 |
+
table tbody td,
|
141 |
+
table tr td {
|
142 |
+
font-family: 'Lato', sans-serif !important;
|
143 |
+
font-size: 0.9rem !important;
|
144 |
+
}
|
145 |
+
|
146 |
+
/* Style spécifique pour le contenu des cellules */
|
147 |
+
.stDataFrame td div,
|
148 |
+
div[data-testid="stTable"] td div,
|
149 |
+
[data-testid="stDataFrame"] td div,
|
150 |
+
.dataframe td div {
|
151 |
+
font-family: 'Lato', sans-serif !important;
|
152 |
+
}
|
153 |
+
|
154 |
+
/* Bloc de code */
|
155 |
+
code {
|
156 |
+
font-family: 'Courier New', monospace !important;
|
157 |
+
}
|
158 |
+
|
159 |
+
/* Sélecteurs pour les tableaux dans les sections de widgets (multiselect, etc.) */
|
160 |
+
.stMultiSelect td, .stMultiSelect th,
|
161 |
+
[data-baseweb="table"] td, [data-baseweb="table"] th {
|
162 |
+
font-family: 'Lato', sans-serif !important;
|
163 |
+
}
|
164 |
+
|
165 |
+
/* Style spécifique pour les valeurs dans les cellules */
|
166 |
+
td [data-testid*="StyledDataFrameDataCell"],
|
167 |
+
div[data-testid*="column-header"] {
|
168 |
+
font-family: 'Lato', sans-serif !important;
|
169 |
+
}
|
pages/1_🎧_Transcriptions.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from urllib.parse import unquote
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
from utils.utils_trad import get_total_audio_duration_by_user, list_audio_files_by_title, get_processed_audio_files_by_user_and_title, get_audio_url, save_annotation
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
|
8 |
+
load_dotenv(".env")
|
9 |
+
S3_BUCKET = os.getenv("S3_BUCKET")
|
10 |
+
S3_PREFIX = os.getenv("S3_PREFIX")
|
11 |
+
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
|
12 |
+
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
|
13 |
+
ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
|
14 |
+
ANNOTATIONS_PREFIX = "annotations"
|
15 |
+
|
16 |
+
import s3fs
|
17 |
+
|
18 |
+
access_key = os.getenv("AWS_ACCESS_KEY_ID")
|
19 |
+
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
|
20 |
+
endpoint_url = os.getenv("AWS_ENDPOINT_URL_S3")
|
21 |
+
|
22 |
+
fs = s3fs.S3FileSystem(
|
23 |
+
key=AWS_ACCESS_KEY_ID,
|
24 |
+
secret=AWS_SECRET_ACCESS_KEY,
|
25 |
+
endpoint_url=ENDPOINT_URL)
|
26 |
+
|
27 |
+
|
28 |
+
if not all([S3_BUCKET, S3_PREFIX, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, ENDPOINT_URL]):
|
29 |
+
st.error("Veuillez configurer correctement les variables d'environnement S3.")
|
30 |
+
st.stop()
|
31 |
+
|
32 |
+
# Fonction pour vérifier les titres complètement traités
|
33 |
+
def get_completed_titles():
|
34 |
+
"""Renvoie la liste des titres qui n'ont plus d'audios à traiter."""
|
35 |
+
status_file = "title_completion_status.json"
|
36 |
+
|
37 |
+
if os.path.exists(status_file):
|
38 |
+
with open(status_file, 'r') as f:
|
39 |
+
status = json.load(f)
|
40 |
+
return [title for title, is_completed in status.items() if is_completed]
|
41 |
+
else:
|
42 |
+
return []
|
43 |
+
|
44 |
+
def save_title_completion_status(title, is_completed):
|
45 |
+
"""Sauvegarde l'état de traitement d'un titre dans un fichier JSON."""
|
46 |
+
status_file = "title_completion_status.json"
|
47 |
+
|
48 |
+
with fs.open(status_file, 'r') as f:
|
49 |
+
status = json.load(f)
|
50 |
+
|
51 |
+
status[title] = is_completed
|
52 |
+
|
53 |
+
with fs.open(status_file, 'w') as f:
|
54 |
+
json.dump(status, f)
|
55 |
+
|
56 |
+
st.set_page_config(page_title="Travaux Audio", layout="wide")
|
57 |
+
st.title("🗣️ Travaux Audio - Transcription & Traduction")
|
58 |
+
|
59 |
+
st.markdown("""
|
60 |
+
Bienvenue sur la page des **Travaux Audio** du projet **MooreFrCollection**.
|
61 |
+
|
62 |
+
> 📝 Votre mission : écouter les audios mooré, écrire leur **transcription** (en mooré) et leur **traduction** (en français).
|
63 |
+
""")
|
64 |
+
|
65 |
+
if "user_logged_in" not in st.session_state:
|
66 |
+
st.session_state.user_logged_in = False
|
67 |
+
if "current_username" not in st.session_state:
|
68 |
+
st.session_state.current_username = ""
|
69 |
+
if "completed_titles" not in st.session_state:
|
70 |
+
st.session_state.completed_titles = set()
|
71 |
+
|
72 |
+
if not st.session_state.user_logged_in:
|
73 |
+
with st.form("login_form"):
|
74 |
+
input_username = st.text_input("Entrez votre nom ou pseudo pour contribuer :", key="input_username")
|
75 |
+
submit_button = st.form_submit_button("✅ Commencer à contribuer")
|
76 |
+
|
77 |
+
if submit_button:
|
78 |
+
if not input_username:
|
79 |
+
st.error("Merci d'entrer un nom avant de continuer.")
|
80 |
+
else:
|
81 |
+
st.session_state.user_logged_in = True
|
82 |
+
st.session_state.current_username = input_username
|
83 |
+
st.rerun()
|
84 |
+
st.stop()
|
85 |
+
|
86 |
+
username = st.session_state.current_username
|
87 |
+
st.success(f"👤 Connecté en tant que: **{username}**")
|
88 |
+
|
89 |
+
user_duration_minutes = get_total_audio_duration_by_user(username)
|
90 |
+
st.info(f"🎯 Vous avez déjà traité environ **{user_duration_minutes:.1f} minutes** d'audio.")
|
91 |
+
|
92 |
+
if st.button("👋 Changer d'utilisateur"):
|
93 |
+
st.session_state.user_logged_in = False
|
94 |
+
st.session_state.current_username = ""
|
95 |
+
st.rerun()
|
96 |
+
|
97 |
+
# Charger les titres disponibles
|
98 |
+
audio_titles = list_audio_files_by_title()
|
99 |
+
if not audio_titles:
|
100 |
+
st.warning("Aucun audio disponible pour l'instant.")
|
101 |
+
st.stop()
|
102 |
+
|
103 |
+
# Obtenir les titres globalement terminés
|
104 |
+
globally_completed_titles = get_completed_titles()
|
105 |
+
|
106 |
+
# Filtrer les titres pour exclure ceux qui sont déjà terminés
|
107 |
+
available_titles = [title for title in audio_titles.keys()
|
108 |
+
if title not in st.session_state.completed_titles
|
109 |
+
and title not in globally_completed_titles]
|
110 |
+
|
111 |
+
if not available_titles:
|
112 |
+
st.success("🎉 Félicitations ! Tous les groupes d'audio disponibles sont terminés.")
|
113 |
+
st.stop()
|
114 |
+
|
115 |
+
# Sélection du titre audio
|
116 |
+
default_index = 0
|
117 |
+
if "selected_title" in st.session_state and st.session_state["selected_title"] in available_titles:
|
118 |
+
default_index = available_titles.index(st.session_state["selected_title"])
|
119 |
+
|
120 |
+
selected_title = st.selectbox(
|
121 |
+
"Choisissez un groupe audio :",
|
122 |
+
available_titles,
|
123 |
+
key="audio_group",
|
124 |
+
index=default_index
|
125 |
+
)
|
126 |
+
st.session_state["selected_title"] = selected_title
|
127 |
+
audio_paths = audio_titles[selected_title]
|
128 |
+
|
129 |
+
# Récupérer les fichiers déjà traités pour ce titre et cet utilisateur
|
130 |
+
processed_files = get_processed_audio_files_by_user_and_title(username, selected_title)
|
131 |
+
|
132 |
+
# Filtrer la liste des audios pour ne garder que ceux non traités
|
133 |
+
unprocessed_audio_paths = [path for path in audio_paths if os.path.basename(path) not in processed_files]
|
134 |
+
|
135 |
+
if not unprocessed_audio_paths:
|
136 |
+
st.success(f"🎉 Vous avez déjà terminé tous les audios du groupe '{selected_title}'!")
|
137 |
+
st.session_state.completed_titles.add(selected_title)
|
138 |
+
|
139 |
+
# Vérifier si ce titre est complètement traité par tous les utilisateurs
|
140 |
+
# Cela nécessite une fonction qui vérifie si tous les audios de ce titre ont des annotations
|
141 |
+
all_files_processed = True
|
142 |
+
for audio_path in audio_paths:
|
143 |
+
audio_filename = os.path.basename(audio_path)
|
144 |
+
annotation_path = f"{ANNOTATIONS_PREFIX}/{selected_title}/{audio_filename}.json"
|
145 |
+
if not os.path.exists(annotation_path):
|
146 |
+
all_files_processed = False
|
147 |
+
break
|
148 |
+
|
149 |
+
if all_files_processed:
|
150 |
+
save_title_completion_status(selected_title, True)
|
151 |
+
|
152 |
+
if st.button("Continuer avec un autre groupe (Terminé)"):
|
153 |
+
st.rerun()
|
154 |
+
st.stop()
|
155 |
+
|
156 |
+
# Initialiser l'index de l'audio pour le titre sélectionné (ou reprendre la progression)
|
157 |
+
index_key = f"index_{selected_title}"
|
158 |
+
if index_key not in st.session_state:
|
159 |
+
st.session_state[index_key] = 0
|
160 |
+
else:
|
161 |
+
st.session_state[index_key] = min(st.session_state[index_key], len(unprocessed_audio_paths) - 1)
|
162 |
+
|
163 |
+
current_index = st.session_state[index_key]
|
164 |
+
|
165 |
+
if unprocessed_audio_paths:
|
166 |
+
current_audio = unprocessed_audio_paths[current_index]
|
167 |
+
st.subheader(f"🎧 Audio {current_index + 1} sur {len(unprocessed_audio_paths)} : {current_audio.split('/')[-1]}")
|
168 |
+
st.audio(get_audio_url(current_audio))
|
169 |
+
|
170 |
+
with st.form(f"form_{current_audio}"):
|
171 |
+
transcription = st.text_area("Transcription en mooré", key=f"tr_{current_audio}")
|
172 |
+
traduction = st.text_area("Traduction en français", key=f"trad_{current_audio}")
|
173 |
+
submitted = st.form_submit_button("💾 Soumettre")
|
174 |
+
|
175 |
+
if submitted:
|
176 |
+
save_annotation(
|
177 |
+
audio_path=current_audio,
|
178 |
+
user=username,
|
179 |
+
transcription=transcription,
|
180 |
+
traduction=traduction,
|
181 |
+
)
|
182 |
+
st.success("✅ Contribution enregistrée avec succès !")
|
183 |
+
st.session_state[index_key] += 1
|
184 |
+
|
185 |
+
# Vérifier si tous les audios non traités de ce groupe sont maintenant terminés
|
186 |
+
if st.session_state[index_key] >= len(unprocessed_audio_paths):
|
187 |
+
st.success(f"🎉 Vous avez terminé tous les audios du groupe '{selected_title}'!")
|
188 |
+
st.session_state.completed_titles.add(selected_title)
|
189 |
+
|
190 |
+
# Vérifier si ce titre est maintenant complètement traité par tous
|
191 |
+
all_files_processed = True
|
192 |
+
for audio_path in audio_paths:
|
193 |
+
audio_filename = os.path.basename(audio_path)
|
194 |
+
annotation_path = f"{ANNOTATIONS_PREFIX}/{selected_title}/{audio_filename}.json"
|
195 |
+
if not os.path.exists(annotation_path):
|
196 |
+
all_files_processed = False
|
197 |
+
break
|
198 |
+
|
199 |
+
if all_files_processed:
|
200 |
+
save_title_completion_status(selected_title, True)
|
201 |
+
else:
|
202 |
+
st.rerun()
|
203 |
+
# Bouton pour continuer après avoir potentiellement terminé un groupe (hors du formulaire)
|
204 |
+
if st.session_state[index_key] >= len(unprocessed_audio_paths) and st.button("Continuer avec un autre groupe"):
|
205 |
+
st.rerun()
|
206 |
+
|
207 |
+
else:
|
208 |
+
st.info(f"Il ne reste plus d'audios à traiter pour le groupe '{selected_title}'.")
|
209 |
+
if st.button("Choisir un autre groupe"):
|
210 |
+
st.rerun()
|
pages/2_📊_Statistiques.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
from utils.utils_stats import (
|
5 |
+
load_all_annotations,
|
6 |
+
calculate_total_duration,
|
7 |
+
calculate_contributor_ranking,
|
8 |
+
create_contributions_histogram,
|
9 |
+
create_contributions_pie_chart,
|
10 |
+
calculate_contributions_over_time,
|
11 |
+
calculate_average_annotation_length
|
12 |
+
)
|
13 |
+
|
14 |
+
def display_most_recent_contributions(annotations, n=5):
|
15 |
+
"""Affiche les contributions les plus récentes."""
|
16 |
+
if not annotations:
|
17 |
+
st.info("Aucune contribution récente.")
|
18 |
+
return
|
19 |
+
|
20 |
+
st.subheader(f"⏱️ {n} Contributions les plus récentes (approximatif)")
|
21 |
+
for ann in annotations[-n:]:
|
22 |
+
st.markdown(f"- Utilisateur: **{ann.get('user', 'N/A')}**, Audio: `{(ann.get('audio_path', 'N/A'))}`")
|
23 |
+
|
24 |
+
st.set_page_config(page_title="Statistiques des Travaux Audio", layout="wide")
|
25 |
+
st.title("📊 Statistiques des Travaux Audio")
|
26 |
+
|
27 |
+
st.markdown("Voici un aperçu des statistiques de contribution pour le projet **MooreFrCollection**.")
|
28 |
+
|
29 |
+
# Charger toutes les annotations
|
30 |
+
all_annotations = load_all_annotations()
|
31 |
+
|
32 |
+
if all_annotations:
|
33 |
+
# Première ligne : Métriques principales
|
34 |
+
col1, col2, col3 = st.columns(3)
|
35 |
+
with col1:
|
36 |
+
total_duration_minutes = calculate_total_duration(all_annotations)
|
37 |
+
st.metric("⏱️ Total d'audios traités", f"{total_duration_minutes:.2f} minutes")
|
38 |
+
with col2:
|
39 |
+
avg_annotation_length = calculate_average_annotation_length(all_annotations)
|
40 |
+
st.metric("📏 Durée moyenne d'une annotation", f"{avg_annotation_length:.2f} minutes")
|
41 |
+
with col3:
|
42 |
+
st.empty()
|
43 |
+
|
44 |
+
st.markdown("---")
|
45 |
+
|
46 |
+
# Deuxième ligne : Classement et histogramme
|
47 |
+
col_ranking, col_histogram = st.columns([1, 2])
|
48 |
+
with col_ranking:
|
49 |
+
st.subheader("🏆 Classement des contributeurs par durée totale")
|
50 |
+
contributor_ranking = calculate_contributor_ranking(all_annotations)
|
51 |
+
if contributor_ranking:
|
52 |
+
ranking_df = pd.DataFrame(contributor_ranking, columns=['Contributeur', 'Durée totale (secondes)'])
|
53 |
+
ranking_df['Durée totale (minutes)'] = ranking_df['Durée totale (secondes)'] / 60.0
|
54 |
+
st.dataframe(ranking_df[['Contributeur', 'Durée totale (minutes)']].set_index('Contributeur'), height=300)
|
55 |
+
else:
|
56 |
+
st.info("Aucune contribution enregistrée pour le moment.")
|
57 |
+
|
58 |
+
with col_histogram:
|
59 |
+
histogram_fig = create_contributions_histogram(contributor_ranking)
|
60 |
+
if histogram_fig:
|
61 |
+
st.plotly_chart(histogram_fig, use_container_width=True)
|
62 |
+
|
63 |
+
st.markdown("---")
|
64 |
+
|
65 |
+
# Troisième ligne : Diagramme circulaire et contributions récentes
|
66 |
+
col_pie, col_recent = st.columns(2)
|
67 |
+
with col_pie:
|
68 |
+
pie_chart_fig = create_contributions_pie_chart(all_annotations)
|
69 |
+
if pie_chart_fig:
|
70 |
+
st.plotly_chart(pie_chart_fig, use_container_width=True)
|
71 |
+
|
72 |
+
with col_recent:
|
73 |
+
display_most_recent_contributions(all_annotations)
|
74 |
+
|
75 |
+
st.markdown("---")
|
76 |
+
|
77 |
+
# Quatrième ligne : Évolution temporelle
|
78 |
+
st.subheader("📈 Évolution temporelle des contributions")
|
79 |
+
contributions_over_time_df = calculate_contributions_over_time(all_annotations)
|
80 |
+
if contributions_over_time_df is not None and not contributions_over_time_df.empty:
|
81 |
+
fig = go.Figure(data=[go.Scatter(x=contributions_over_time_df['Date'], y=contributions_over_time_df['Nombre de contributions'], mode='lines+markers')])
|
82 |
+
st.plotly_chart(fig, use_container_width=True)
|
83 |
+
elif all_annotations:
|
84 |
+
st.info("Impossible de déterminer l'évolution temporelle des contributions (informations de date manquantes dans les clés S3).")
|
85 |
+
else:
|
86 |
+
st.info("Aucune contribution à afficher pour l'évolution temporelle.")
|
87 |
+
|
88 |
+
else:
|
89 |
+
st.info("Aucune donnée d'annotation disponible pour générer les statistiques.")
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy>=2.2.3
|
2 |
+
pandas>=2.2.3
|
3 |
+
plotly>=6.0.0
|
4 |
+
pyarrow>=19.0.1
|
5 |
+
streamlit>=1.43.1
|
6 |
+
datasets
|
7 |
+
boto3
|
8 |
+
pydub
|
9 |
+
python-dotenv
|
10 |
+
soundfile
|
11 |
+
s3fs
|
rocket_pipeline/youtuber.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from loguru import logger
|
3 |
+
import boto3
|
4 |
+
from tqdm import tqdm
|
5 |
+
from pydub import AudioSegment
|
6 |
+
from yt_dlp import YoutubeDL
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
def filter_videos_by_keywords(candidates, keywords):
    """Keep only the videos whose title or description contains a keyword.

    Args:
        candidates: list of video metadata dicts (as produced by yt-dlp);
            non-dict entries are silently skipped.
        keywords: iterable of keywords; matching is case-insensitive
            substring search.

    Returns:
        List of matching video dicts (possibly empty).
    """
    if not candidates:
        return []

    # Hoist the lowercasing out of the per-candidate loop instead of
    # re-lowering every keyword for every video.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    filtered_videos = []
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue

        title = str(candidate.get("title", "")).lower()
        description = str(candidate.get("description", "")).lower()

        if any(kw in title or kw in description for kw in lowered_keywords):
            filtered_videos.append(candidate)

    logger.info(f"Filtrage terminé: {len(filtered_videos)}/{len(candidates)} vidéos correspondent aux mots-clés {keywords}")
    return filtered_videos
def get_videos_from_channel(channel_url):
    """List all video entries of a YouTube channel via yt-dlp flat extraction.

    Args:
        channel_url: channel URL (e.g. "https://www.youtube.com/@name/").

    Returns:
        Flat list of video-entry dicts, or [] when no entries are found.
    """
    logger.info(f"Extraction des vidéos depuis la chaîne: {channel_url}")

    # 'extract_flat' fetches only metadata entries, no media download.
    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
    }

    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(channel_url, download=False)
        if 'entries' in info:
            videos = info['entries']
            # NOTE(review): each top-level entry appears to be a channel tab
            # (e.g. "Videos", "Shorts") whose own "entries" hold the actual
            # videos -- confirm against yt-dlp's channel output shape.
            # Tabs whose title contains "Shorts" are excluded here.
            videos_urls = [video for video in videos if not "Shorts" in video["title"]]
            # Flatten the per-tab "entries" lists into one list of videos.
            videos_urls = sum([videos_url["entries"] for videos_url in videos_urls], [])
            logger.info(f"Nombre total de videos trouvées: {len(videos_urls)}")
            return videos_urls
        else:
            logger.warning("Aucune vidéo trouvée sur cette chaîne")
            return []
def download_youtube_audios(videos, output_dir):
    """Download the audio track of each YouTube video as a WAV file.

    Args:
        videos: video metadata dicts; each must expose an 'id' ('title' is
            used for logging and the output file name).
        output_dir: directory where the extracted WAV files are written.

    Failures are logged per video and do not stop the batch.
    """
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'quiet': False,
    }

    logger.info(f"Début du téléchargement de {len(videos)} vidéos")
    with YoutubeDL(download_options) as downloader:
        for entry in tqdm(videos, desc="Téléchargement des vidéos"):
            try:
                video_url = f"https://www.youtube.com/watch?v={entry['id']}"
                logger.info(f"Téléchargement de l'audio (WAV) : {entry['title']}")
                downloader.download([video_url])
            except Exception as e:
                logger.error(f"Erreur lors du téléchargement de {entry.get('title', entry.get('id', 'inconnu'))}: {str(e)}")
def segment_audio_files(input_dir, output_dir, segment_length):
    """Split every WAV file in *input_dir* into fixed-length segments.

    Args:
        input_dir: directory containing the source .wav files.
        output_dir: root directory for the segments; one sub-folder is
            created per source file, named after it.
        segment_length: length of each segment in milliseconds.

    Returns:
        Tuple (total number of segments created, list of segment file paths).
    """

    wav_files = [f for f in os.listdir(input_dir) if f.endswith(".wav")]
    logger.info(f"Nombre de fichiers WAV à traiter: {len(wav_files)}")

    total_segments = 0
    processed_segments = []

    for filename in tqdm(wav_files, desc="Traitement des fichiers audio"):
        try:
            filepath = os.path.join(input_dir, filename)
            audio = AudioSegment.from_wav(filepath)
            # pydub reports audio length in milliseconds.
            duration = len(audio)

            base_name = os.path.splitext(filename)[0]

            # One output folder per source file.
            video_folder = os.path.join(output_dir, base_name)
            os.makedirs(video_folder, exist_ok=True)

            # NOTE(review): the literal "(unknown)" in the log messages below
            # looks like a lost placeholder (probably {filename}) -- confirm
            # against the original source before changing it.
            logger.info(f"Découpage de : (unknown) → dossier [{video_folder}]")

            # Ceiling division: a trailing partial segment still counts.
            num_segments = (duration + segment_length - 1) // segment_length
            segments_created = 0

            for i in tqdm(range(0, duration, segment_length),
                          desc=f"Segments de {base_name}",
                          total=num_segments):
                # Slicing past the end is safe: the last segment is shorter.
                segment = audio[i:i + segment_length]
                segment_name = f"part{i // segment_length + 1}.wav"
                segment_path = os.path.join(video_folder, segment_name)
                segment.export(segment_path, format="wav")
                segments_created += 1
                processed_segments.append(segment_path)

            logger.info(f"Fichier (unknown): {segments_created} segments créés")
            total_segments += segments_created
        except Exception as e:
            # Best-effort: a corrupt file is logged and skipped.
            logger.error(f"Erreur lors du traitement de (unknown): {str(e)}")

    logger.info(f"Traitement terminé. Total des segments créés: {total_segments}")
    return total_segments, processed_segments
def setup_s3_client():
    """Create a boto3 S3 client from the AWS_* environment variables.

    Returns:
        The configured client, or None when mandatory credentials are
        missing or client construction fails (the error is logged).
    """
    access_key = os.getenv("AWS_ACCESS_KEY_ID")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    endpoint_url = os.getenv("AWS_ENDPOINT_URL_S3")

    if not (access_key and secret_key):
        logger.warning("Variables d'environnement AWS manquantes (AWS_ACCESS_KEY_ID ou AWS_SECRET_ACCESS_KEY)")
        return None

    client_params = {
        "aws_access_key_id": access_key,
        "aws_secret_access_key": secret_key,
    }
    # A custom endpoint (e.g. an S3-compatible store) is optional.
    if endpoint_url:
        client_params["endpoint_url"] = endpoint_url

    try:
        return boto3.client("s3", **client_params)
    except Exception as e:
        logger.error(f"Erreur lors de l'initialisation du client S3: {str(e)}")
        return None
def upload_file_to_s3(s3_client, local_path, bucket_name, s3_key):
    """Upload one local file to s3://<bucket_name>/<s3_key>.

    Errors are logged and swallowed so that a batch upload can continue.
    """
    try:
        s3_client.upload_file(local_path, bucket_name, s3_key)
        logger.info(f"Uploadé {local_path} vers s3://{bucket_name}/{s3_key}")
    except Exception as upload_error:
        logger.error(f"Erreur lors de l'upload de {local_path}: {str(upload_error)}")
def upload_segments_to_s3(segments, bucket_name, prefix, segments_folder):
    """Upload a list of local segment files to S3 under *prefix*.

    S3 keys mirror each segment's path relative to *segments_folder*.

    Returns:
        Number of segments for which an upload was attempted without a path
        error (per-file S3 failures are logged inside upload_file_to_s3).
        Returns 0 when no S3 client could be created.
    """
    s3_client = setup_s3_client()
    if s3_client is None:
        logger.error("Client S3 non disponible. Upload annulé.")
        return 0

    logger.info(f"Début de l'upload des segments vers S3 (bucket: {bucket_name}, préfixe: {prefix})")

    uploaded_count = 0
    for segment_path in tqdm(segments, desc="Upload des segments vers S3"):
        try:
            # Build a forward-slash key from the OS-specific relative path.
            relative_path = os.path.relpath(segment_path, start=segments_folder)
            s3_key = f"{prefix}/{relative_path.replace(os.sep, '/')}"
            upload_file_to_s3(s3_client, segment_path, bucket_name, s3_key)
            uploaded_count += 1
        except Exception as e:
            logger.error(f"Erreur lors de l'upload de {segment_path}: {str(e)}")

    logger.info(f"Upload terminé. {uploaded_count}/{len(segments)} fichiers envoyés vers S3.")
    return uploaded_count
def main():
    """Pipeline entry point: fetch, filter, download, segment, upload.

    Steps:
      1. list the channel's videos,
      2. keep those matching FILTER_KEYWORDS,
      3. download their audio as WAV,
      4. split the audio into SEGMENT_LENGTH_MS segments,
      5. optionally upload the segments to S3.
    """
    # ====================== CHANGE ME - CONFIGURATION ======================
    # Keywords used to filter the channel's videos.
    FILTER_KEYWORDS = ["sid pa"]

    CHANNEL_URL = "https://www.youtube.com/@livenewsafrica/"

    RAW_AUDIO_DIR = "audios_sidpa_wav"
    SEGMENT_AUDIO_DIR = "audios_segments_wav"

    # Segment length in milliseconds.
    SEGMENT_LENGTH_MS = 30 * 1000  # 30 seconds by default

    # S3 configuration.
    BUCKET_NAME = "moore-collection"
    S3_PREFIX = "audios_wav"
    USE_S3 = True  # set to True to enable the S3 operations
    # ====================== FIN CHANGE ME ======================

    os.makedirs(RAW_AUDIO_DIR, exist_ok=True)
    os.makedirs(SEGMENT_AUDIO_DIR, exist_ok=True)

    logger.info("Démarrage du traitement des fichiers audio")

    videos = get_videos_from_channel(CHANNEL_URL)
    # Fix: pass the FILTER_KEYWORDS constant instead of re-hardcoding the
    # literal, so editing the configuration block actually changes the filter.
    filtered_videos = filter_videos_by_keywords(videos, keywords=FILTER_KEYWORDS)
    download_youtube_audios(filtered_videos, RAW_AUDIO_DIR)

    total_segments, processed_segments = segment_audio_files(RAW_AUDIO_DIR, SEGMENT_AUDIO_DIR, SEGMENT_LENGTH_MS)

    if USE_S3:
        upload_segments_to_s3(processed_segments, BUCKET_NAME, S3_PREFIX, SEGMENT_AUDIO_DIR)

    logger.info("Traitement terminé avec succès")

if __name__ == "__main__":
    main()
|
utils/utils_stats.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from collections import defaultdict
|
5 |
+
from datetime import datetime
|
6 |
+
import pandas as pd
|
7 |
+
import plotly.express as px
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
|
10 |
+
load_dotenv(".env")
|
11 |
+
S3_BUCKET = os.getenv("S3_BUCKET")
|
12 |
+
ANNOTATIONS_PREFIX = "annotations"
|
13 |
+
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
|
14 |
+
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
|
15 |
+
ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
|
16 |
+
|
17 |
+
s3 = boto3.client(
|
18 |
+
"s3",
|
19 |
+
aws_access_key_id=AWS_ACCESS_KEY_ID,
|
20 |
+
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
|
21 |
+
endpoint_url=ENDPOINT_URL
|
22 |
+
)
|
23 |
+
|
24 |
+
def load_all_annotations():
    """Read every JSON annotation object under ANNOTATIONS_PREFIX on S3.

    Unreadable or unparseable objects are reported and skipped.

    Returns:
        List of parsed annotation dicts.
    """
    annotations = []
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX)
    for page in pages:
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".json"):
                continue
            try:
                body = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"]
                annotations.append(json.loads(body.read().decode('utf-8')))
            except Exception as e:
                print(f"Erreur lors de la lecture de {key}: {e}")
    return annotations
def calculate_total_duration(annotations):
    """Return the cumulative annotated audio duration, in minutes.

    Annotations without a "duration" field count as zero.
    """
    seconds = 0.0
    for ann in annotations:
        seconds += float(ann.get("duration", 0))
    return seconds / 60.0
def calculate_contributor_ranking(annotations):
    """Sum the annotated duration (seconds) per user.

    Annotations without a truthy "user" field are ignored.

    Returns:
        List of (user, total_seconds) tuples, largest totals first.
    """
    totals = defaultdict(float)
    for ann in annotations:
        contributor = ann.get("user")
        if not contributor:
            continue
        totals[contributor] += float(ann.get("duration", 0))
    return sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
def create_contributions_histogram(contributor_ranking):
    """Bar chart of total contributed minutes per user.

    Args:
        contributor_ranking: list of (user, total_seconds) tuples.

    Returns:
        A plotly figure, or None when the ranking is empty.
    """
    if not contributor_ranking:
        return None
    users = [user for user, _ in contributor_ranking]
    durations_minutes = [seconds / 60.0 for _, seconds in contributor_ranking]
    return px.bar(
        x=users,
        y=durations_minutes,
        labels={'x': 'Contributeur', 'y': 'Durée totale (minutes)'},
        title='Durée totale des contributions par utilisateur',
    )
def create_contributions_pie_chart(annotations):
    """Donut chart of annotated minutes for the top 10 contributors.

    Args:
        annotations: annotation dicts with "user" and "duration" fields.

    Returns:
        A plotly figure, or None when no annotation carries a user.
    """
    # Reuse the shared ranking helper instead of duplicating its
    # per-user aggregation and sorting logic.
    sorted_contributors = calculate_contributor_ranking(annotations)
    if not sorted_contributors:
        return None

    top_n = sorted_contributors[:10]  # keep the 10 largest contributors

    labels = [user for user, _ in top_n]
    values = [seconds / 60.0 for _, seconds in top_n]

    fig = px.pie(names=labels, values=values,
                 title='Répartition des contributions (Top 10 des contributeurs)',
                 hole=0.3)
    fig.update_traces(textinfo='percent+label')
    return fig
def extract_annotation_date(annotation_key):
    """Best-effort extraction of a YYYY-MM-DD date from an S3 key.

    Only keys with at least three path components are inspected; the first
    component parseable as a date wins.

    Returns:
        A datetime.date, or None when no component parses.
    """
    components = annotation_key.split('/')
    if len(components) < 3:
        return None
    for component in components:
        try:
            return datetime.strptime(component, '%Y-%m-%d').date()
        except ValueError:
            continue
    return None
def calculate_contributions_over_time(annotations):
    """Count contributions per calendar day from 'created_at' timestamps.

    Annotations lacking 'created_at' are skipped; unparseable values are
    reported and skipped.

    Returns:
        DataFrame with columns 'Date' and 'Nombre de contributions', sorted
        by date, or None when no annotation had a usable timestamp.
    """
    per_day = defaultdict(int)
    for ann in annotations:
        stamp = ann.get("created_at")
        if not stamp:
            continue
        try:
            per_day[datetime.fromisoformat(stamp).date()] += 1
        except ValueError:
            print(f"Erreur lors de la conversion de la date: {stamp}")

    if not per_day:
        return None

    frame = pd.DataFrame(per_day.items(), columns=['Date', 'Nombre de contributions'])
    return frame.sort_values(by='Date')
def create_contributions_time_series(df_contributions):
    """Line chart of the daily contribution counts over time."""
    return px.line(
        df_contributions,
        x='Date',
        y='Nombre de contributions',
        title='Nombre de contributions par jour',
    )
def calculate_average_annotation_length(annotations):
    """Return the mean annotation duration in minutes (0.0 when empty)."""
    if not annotations:
        return 0.0
    total_seconds = sum(float(ann.get("duration", 0)) for ann in annotations)
    return total_seconds / len(annotations) / 60.0
utils/utils_trad.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import boto3
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
import pandas as pd
|
7 |
+
from io import BytesIO
|
8 |
+
import soundfile as sf
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
load_dotenv(".env")
|
14 |
+
S3_BUCKET = os.getenv("S3_BUCKET")
|
15 |
+
S3_PREFIX = os.getenv("S3_PREFIX")
|
16 |
+
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
|
17 |
+
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
|
18 |
+
ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL_S3")
|
19 |
+
ANNOTATIONS_PREFIX = "annotations"
|
20 |
+
|
21 |
+
|
22 |
+
s3 = boto3.client(
|
23 |
+
"s3",
|
24 |
+
aws_access_key_id=AWS_ACCESS_KEY_ID,
|
25 |
+
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
|
26 |
+
endpoint_url=ENDPOINT_URL
|
27 |
+
)
|
28 |
+
|
29 |
+
def list_audio_files_by_title():
    """Group the .wav keys under S3_PREFIX by their title folder.

    Keys are expected to look like "<prefix>/<title>/<part>.wav"; keys with
    fewer than three path components are ignored.

    Returns:
        Dict mapping title -> list of S3 keys (empty dict when none found).
    """
    grouped = {}
    # Fix: paginate like the other S3 helpers in this module; a single
    # list_objects_v2 call is truncated at 1000 keys and would silently
    # drop audio files on larger buckets.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".wav"):
                continue
            parts = key.split("/")
            if len(parts) >= 3:
                title = parts[1]
                grouped.setdefault(title, []).append(key)
    return grouped
def get_audio_url(audio_path):
    """Return a presigned S3 URL (valid 1 hour) for streaming the audio."""
    request_params = {"Bucket": S3_BUCKET, "Key": audio_path}
    return s3.generate_presigned_url(
        ClientMethod="get_object",
        Params=request_params,
        ExpiresIn=3600,
    )
def get_audio_duration_from_s3(bucket, key):
    """Return the duration (seconds) of an audio object stored on S3.

    Returns 0.0 and prints a message when the object cannot be read or
    decoded.
    """
    try:
        payload = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        with BytesIO(payload) as buffer:
            samples, sample_rate = sf.read(buffer)
        return len(samples) / sample_rate
    except Exception as e:
        print(f"Erreur lors de la lecture de la durée de {key}: {e}")
        return 0.0
def save_annotation(audio_path, user, transcription, traduction):
    """Persist a user's annotation of one audio segment as JSON on S3.

    The object is written to "annotations/<title>/<segment>__<user>.json"
    and records the audio path, the texts, the measured audio duration and
    a UTC creation timestamp.

    Args:
        audio_path: S3 key of the annotated audio ("prefix/title/part.wav").
        user: annotator identifier (becomes part of the object key).
        transcription: transcription text.
        traduction: translation text.
    """
    from datetime import timezone  # local: module top only imports datetime

    duration = get_audio_duration_from_s3(S3_BUCKET, audio_path)
    base_filename = os.path.basename(audio_path).replace(".wav", "")
    path_parts = audio_path.split('/')
    title = path_parts[-2]
    annotation_key = f"{ANNOTATIONS_PREFIX}/{title}/{base_filename}__{user}.json"

    payload = {
        "audio_path": audio_path,
        "user": user,
        "transcription": transcription,
        "traduction": traduction,
        "duration": duration,
        # Fix: datetime.utcnow() is deprecated (Python 3.12) and yields a
        # naive timestamp; use an aware UTC timestamp instead -- the stats
        # consumer parses it with datetime.fromisoformat, which accepts the
        # "+00:00" offset.
        "created_at": datetime.now(timezone.utc).isoformat(),
    }

    s3.put_object(
        Bucket=S3_BUCKET,
        Key=annotation_key,
        Body=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
        ContentType="application/json",
    )
def get_total_audio_duration_by_user(username: str) -> float:
    """Sum the annotated audio duration (minutes) for one user.

    Scans every ".json" annotation whose key contains "__<username>.json";
    unreadable objects are reported and skipped.
    """
    user_marker = f"__{username}.json"
    total_seconds = 0.0

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".json") or user_marker not in key:
                continue
            try:
                raw = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"].read()
                duration = json.loads(raw.decode('utf-8')).get("duration")
                if duration:
                    total_seconds += float(duration)
            except Exception as e:
                print(f"Erreur lors de la lecture de {key}: {e}")
                continue

    return total_seconds / 60.0
def get_processed_audio_files_by_user_and_title(username: str, title: str) -> set:
    """Return the .wav file names already annotated by a user for a title.

    An audio file counts as processed when an annotation object named
    "<file>__<username>.json" exists under "annotations/<title>/".
    """
    suffix = f"__{username}.json"
    processed_files = set()
    prefix = f"{ANNOTATIONS_PREFIX}/{title}/"
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if key.endswith(suffix):
                # Map the annotation key back to its source .wav file name.
                wav_name = key.split("/")[-1].replace(suffix, ".wav")
                processed_files.add(wav_name)
    return processed_files
|