import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

def analyze_demographics(file):
    # gr.File may pass a tempfile-like object or a plain path depending on the Gradio version
    path = file.name if hasattr(file, "name") else file
    df = pd.read_excel(path)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
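        # NPS: promoters score 9-10, detractors score 0-6; NPS = (promoters - detractors) / respondents * 100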
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]['HF NPS'] = round(recommend_nps, 2)
        results["Overall Metrics"]['HF NPS (Average)'] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]['Support NPS'] = round(support_nps, 2)
        results["Overall Metrics"]['Support NPS (Average)'] = round(support_avg, 2)

    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]
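    # Break both NPS questions down by each demographic column, preserving the tenure order defined above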
    
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_nps_by_demo = {k: nps_by_demo.get(k, None) for k in tenure_order if k in nps_by_demo}
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_averages_demo = {k: averages_demo.get(k, None) for k in tenure_order if k in averages_demo}
                        results[demo_category][f'{prefix} (Average)'] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f'{prefix} (Average)'] = averages_demo.round(2).to_dict()

    return results

def analyze_why_columns(file):
    # Same handling as analyze_demographics: accept either a tempfile-like object or a plain path
    path = file.name if hasattr(file, "name") else file
    df = pd.read_excel(path)
    
    # Map column names to new labels
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional.1)": "Support Team NPS Why?",
        "Why? (optional.2)": "Productivity Why?"
    }
    
    # Rename columns in the DataFrame
    df = df.rename(columns=column_label_map)

    # Get the renamed columns that start with "Why"
    why_columns = [col for col in df.columns if col in column_label_map.values()]

    results = {}

    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
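    # Both models are fetched from the Hugging Face Hub on first use, so the first run can be slow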

    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()
        if not column_data:
            continue

        # Sentiment Analysis with Confidence Scores
        sentiments = sentiment_analyzer(column_data, truncation=True)  # truncate long responses to the model's max input length
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
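        # Note: the SST-2 model only emits POSITIVE/NEGATIVE labels, so the NEUTRAL bucket stays empty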

        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic Modeling
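        # Fit a 3-topic LDA model over bag-of-words counts and keep the 5 highest-weighted words per topic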
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword Extraction
        combined_text = " ".join(column_data)
        word_list = re.findall(r"\b\w+\b", combined_text.lower())  # unigram tokens (currently unused)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization
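        # BART accepts a limited number of input tokens, so split the combined text into word chunks and summarize each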
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)

        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }

    return results

def process_file(file):
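    # Run the quantitative (NPS) and qualitative (free-text) analyses on the same uploaded file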
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)

    return quantitative_results, qualitative_results

def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface

if __name__ == "__main__":
    app().launch(share=True)