Transliterate / utils /utils_stats.py
sawadogosalif's picture
end
fbe0b46
import boto3
import json
import os
from collections import defaultdict
from datetime import datetime
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
# Pull settings from a local ".env" file into the process environment
# before any of them is read below.
load_dotenv(".env")

# Bucket to read from and the key prefix used when listing annotation objects.
S3_BUCKET = os.environ.get("S3_BUCKET")
ANNOTATIONS_PREFIX = "annotations"

# Credentials and endpoint for the S3 client; each is None when the
# corresponding environment variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL_S3")

# Module-level client shared by the loading helpers in this file.
s3 = boto3.client(
    "s3",
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
def load_all_annotations():
    """Load every JSON annotation found under the annotations prefix in S3.

    Lists the bucket page by page, downloads each object whose key ends in
    ".json", and decodes it as UTF-8 JSON.

    Returns:
        A list with one decoded JSON value per object that was read
        successfully. Objects that fail to download, decode, or parse are
        reported on stdout and skipped.
    """
    annotations = []
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=S3_BUCKET, Prefix=ANNOTATIONS_PREFIX
    )
    for page in pages:
        # A page without matching objects has no "Contents" entry.
        for entry in page.get("Contents", []):
            key = entry["Key"]
            if not key.endswith(".json"):
                continue
            try:
                raw = s3.get_object(Bucket=S3_BUCKET, Key=key)["Body"].read()
                annotations.append(json.loads(raw.decode('utf-8')))
            except Exception as e:
                # Best effort: one unreadable object must not abort the load.
                print(f"Erreur lors de la lecture de {key}: {e}")
    return annotations
def calculate_total_duration(annotations):
    """Return the total duration of the annotated audio, in minutes.

    Each annotation's "duration" entry (treated as 0 when the key is absent)
    is converted to float, read as seconds, and summed.
    """
    seconds_per_annotation = [
        float(entry.get("duration", 0)) for entry in annotations
    ]
    return sum(seconds_per_annotation) / 60.0
def calculate_contributor_ranking(annotations):
    """Rank contributors by the summed duration of their annotations.

    Annotations whose "user" entry is missing or falsy are left out of the
    ranking. A missing "duration" counts as 0.

    Returns:
        A list of (user, total_duration) tuples, largest total first; the
        totals are in the same unit as the "duration" entries.
    """
    totals = {}
    for entry in annotations:
        contributor = entry.get("user")
        # Converted before the user check, so a malformed duration raises
        # even for anonymous annotations.
        seconds = float(entry.get("duration", 0))
        if not contributor:
            continue
        totals[contributor] = totals.get(contributor, 0.0) + seconds
    ranking = list(totals.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking
def create_contributions_histogram(contributor_ranking):
    """Build a bar chart of total contribution time per user.

    Args:
        contributor_ranking: Sequence of (user, duration) pairs, as returned
            by calculate_contributor_ranking. Durations are divided by 60
            before plotting.

    Returns:
        A plotly figure, or None when the ranking is empty.
    """
    if not contributor_ranking:
        return None
    names = [entry[0] for entry in contributor_ranking]
    minutes = [entry[1] / 60.0 for entry in contributor_ranking]
    axis_labels = {'x': 'Contributeur', 'y': 'Durée totale (minutes)'}
    return px.bar(
        x=names,
        y=minutes,
        labels=axis_labels,
        title='Durée totale des contributions par utilisateur',
    )
def create_contributions_pie_chart(annotations):
    """Build a donut chart of contribution time for the ten largest contributors.

    Durations are summed per user (annotations without a truthy "user" are
    ignored) and divided by 60 before plotting.

    Returns:
        A plotly figure, or None when no annotation carries a user.
    """
    totals = {}
    for entry in annotations:
        contributor = entry.get("user")
        # Converted before the user check, matching the ranking helper.
        seconds = float(entry.get("duration", 0))
        if not contributor:
            continue
        totals[contributor] = totals.get(contributor, 0.0) + seconds

    if not totals:
        return None

    # Keep only the ten largest contributors.
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    leaders = ranked[:10]
    names = [pair[0] for pair in leaders]
    minutes = [pair[1] / 60.0 for pair in leaders]

    chart = px.pie(
        names=names,
        values=minutes,
        title='Répartition des contributions (Top 10 des contributeurs)',
        hole=0.3,
    )
    chart.update_traces(textinfo='percent+label')
    return chart
def extract_annotation_date(annotation_key):
    """Extract a date from an S3 annotation key, if one of its segments is a date.

    The key is split on "/". Keys with fewer than three segments are not
    inspected; otherwise the first segment matching YYYY-MM-DD is returned.

    Returns:
        A datetime.date, or None when no segment parses as a date.
    """
    segments = annotation_key.split('/')
    if len(segments) < 3:
        return None
    for segment in segments:
        try:
            parsed = datetime.strptime(segment, '%Y-%m-%d')
        except ValueError:
            # Not a date segment; try the next one.
            continue
        return parsed.date()
    return None
def calculate_contributions_over_time(annotations):
    """Count contributions per calendar day using each annotation's 'created_at'.

    Args:
        annotations: Iterable of annotation dicts. Entries whose "created_at"
            is missing or falsy are ignored.

    Returns:
        A pandas DataFrame with columns 'Date' and 'Nombre de contributions',
        sorted by date, or None when no annotation has a usable timestamp.

    Timestamps that cannot be parsed (malformed strings or non-string values)
    are reported on stdout and skipped instead of aborting the computation.
    """
    daily_contributions_count = defaultdict(int)
    for ann in annotations:
        created_at_str = ann.get("created_at")
        if not created_at_str:
            continue
        try:
            iso_text = created_at_str
            # datetime.fromisoformat only accepts a trailing "Z" from
            # Python 3.11 on; rewrite it as an explicit UTC offset so the
            # result does not depend on the interpreter version.
            if isinstance(iso_text, str) and iso_text.endswith("Z"):
                iso_text = iso_text[:-1] + "+00:00"
            annotation_date = datetime.fromisoformat(iso_text).date()
        except (ValueError, TypeError):
            # TypeError covers non-string values such as numeric timestamps.
            print(f"Erreur lors de la conversion de la date: {created_at_str}")
            continue
        daily_contributions_count[annotation_date] += 1

    if not daily_contributions_count:
        return None

    df = pd.DataFrame(
        list(daily_contributions_count.items()),
        columns=['Date', 'Nombre de contributions'],
    )
    return df.sort_values(by='Date')
def create_contributions_time_series(df_contributions):
    """Build a line chart of the number of contributions per day.

    Args:
        df_contributions: Data with 'Date' and 'Nombre de contributions'
            columns, as produced by calculate_contributions_over_time, or
            None when there is nothing to plot.

    Returns:
        A plotly figure, or None when df_contributions is None.
    """
    # calculate_contributions_over_time returns None when there is no data;
    # mirror the other chart builders and return None instead of failing
    # inside plotly.
    if df_contributions is None:
        return None
    fig = px.line(
        df_contributions,
        x='Date',
        y='Nombre de contributions',
        title='Nombre de contributions par jour',
    )
    return fig
def calculate_average_annotation_length(annotations):
    """Return the mean annotation duration, in minutes.

    The summed "duration" entries (0 when the key is absent) are divided by
    the number of annotations and then by 60. An empty collection gives 0.0.
    """
    summed_seconds = sum(float(entry.get("duration", 0)) for entry in annotations)
    count = len(annotations)
    if count <= 0:
        return 0.0
    mean_seconds = summed_seconds / count
    return mean_seconds / 60.0  # seconds -> minutes