import gradio as gr
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

def analyze_demographics(file):
    # gr.File may pass a tempfile-like object or a plain path depending on the Gradio version
    path = file.name if hasattr(file, "name") else file
    df = pd.read_excel(path)

    results = {
        "Overall Metrics": {},
        "Underrepresented Group Metrics": {},
        "Tenure Metrics": {},
        "Team Metrics": {},
        "Nationality Metrics": {},
        "Legal Entity Metrics": {},
        "Work Location Metrics": {}
    }

    tenure_order = ["< 1 year", "1 year - 2 years", "2 years - 3 years", "3 years - 4 years", "> 4 years"]

    recommend_col = "On a scale of 0 to 10, how likely are you to recommend working at Hugging Face to a friend or colleague?"
    if recommend_col in df.columns:
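        # NPS: promoters score 9-10, detractors score 0-6; NPS = (promoters - detractors) / respondents * 100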
        promoters = df[recommend_col].apply(lambda x: x >= 9).sum()
        detractors = df[recommend_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[recommend_col].notna().sum()
        recommend_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        recommend_avg = df[recommend_col].mean()
        results["Overall Metrics"]['HF NPS'] = round(recommend_nps, 2)
        results["Overall Metrics"]['HF NPS (Average)'] = round(recommend_avg, 2)

    support_col = "On a scale of 0 to 10, how likely are you to recommend the support functions at HF (diversity, finance, hr, legal, security, talent) to a friend or colleague?"
    if support_col in df.columns:
        promoters = df[support_col].apply(lambda x: x >= 9).sum()
        detractors = df[support_col].apply(lambda x: x <= 6).sum()
        total_respondents = df[support_col].notna().sum()
        support_nps = ((promoters - detractors) / total_respondents) * 100 if total_respondents > 0 else None
        support_avg = df[support_col].mean()
        results["Overall Metrics"]['Support NPS'] = round(support_nps, 2)
        results["Overall Metrics"]['Support NPS (Average)'] = round(support_avg, 2)

    demographic_columns = [
        ("I identify as a member of an underrepresented group in tech. (e.g. including but not limited to gender, age, disability, sexuality, etc.)", "Underrepresented Group Metrics"),
        ("How long have you been at Hugging Face? (optional)", "Tenure Metrics"),
        ("Which team are you on here at Hugging Face? (optional)", "Team Metrics"),
        ("What is your primary nationality? (optional -- we only listed the largest groups to ensure anonymity.)", "Nationality Metrics"),
        ("Which legal entity are you employed by at HF? (optional)", "Legal Entity Metrics"),
        ("Are you fully remote or work mostly from a Hugging Face office? (optional)", "Work Location Metrics")
    ]
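    # Break both NPS questions down by each demographic column, preserving the tenure order defined above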
    
    for demo_col, demo_category in demographic_columns:
        if demo_col in df.columns:
            for col, prefix in [(recommend_col, "HF NPS"), (support_col, "Support NPS")]:
                if col in df.columns:
                    grouped_demo = df.groupby(demo_col)[col]
                    nps_by_demo = {}
                    for group, scores in grouped_demo:
                        promoters = scores.apply(lambda x: x >= 9).sum()
                        detractors = scores.apply(lambda x: x <= 6).sum()
                        total = scores.notna().sum()
                        nps_by_demo[group] = ((promoters - detractors) / total) * 100 if total > 0 else None
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_nps_by_demo = {k: nps_by_demo.get(k, None) for k in tenure_order if k in nps_by_demo}
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in sorted_nps_by_demo.items()}
                    else:
                        results[demo_category][f'{prefix}'] = {k: round(v, 2) if v is not None else None for k, v in nps_by_demo.items()}
                    averages_demo = grouped_demo.mean()
                    if demo_category == "Tenure Metrics" and demo_col == "How long have you been at Hugging Face? (optional)":
                        sorted_averages_demo = {k: averages_demo.get(k, None) for k in tenure_order if k in averages_demo}
                        results[demo_category][f'{prefix} (Average)'] = {k: round(v, 2) if v is not None else None for k, v in sorted_averages_demo.items()}
                    else:
                        results[demo_category][f'{prefix} (Average)'] = averages_demo.round(2).to_dict()

    return results

def analyze_why_columns(file):
    # Same handling as analyze_demographics: accept either a tempfile-like object or a plain path
    path = file.name if hasattr(file, "name") else file
    df = pd.read_excel(path)
    
    # Map column names to new labels
    column_label_map = {
        "Why? (optional)": "HF NPS Why?",
        "Why? (optional.1)": "Support Team NPS Why?",
        "Why? (optional.2)": "Productivity Why?"
    }
    
    # Rename columns in the DataFrame
    df = df.rename(columns=column_label_map)

    # Get the renamed columns that start with "Why"
    why_columns = [col for col in df.columns if col in column_label_map.values()]

    results = {}

    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
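    # Both models are fetched from the Hugging Face Hub on first use, so the first run can be slow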

    for col in why_columns:
        column_data = df[col].dropna().astype(str).tolist()
        if not column_data:
            continue

        # Sentiment Analysis with Confidence Scores
        sentiments = sentiment_analyzer(column_data, truncation=True)  # truncate long responses to the model's max input length
        sentiment_summary = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
        detailed_sentiments = {"POSITIVE": [], "NEGATIVE": [], "NEUTRAL": []}
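        # Note: the SST-2 model only emits POSITIVE/NEGATIVE labels, so the NEUTRAL bucket stays empty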

        for response, sentiment in zip(column_data, sentiments):
            label = sentiment["label"]
            score = sentiment["score"]
            sentiment_summary[label] += 1
            detailed_sentiments[label].append({"response": response, "score": round(score, 2)})

        # Topic Modeling
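        # Fit a 3-topic LDA model over bag-of-words counts and keep the 5 highest-weighted words per topic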
        vectorizer = CountVectorizer(stop_words='english')
        X = vectorizer.fit_transform(column_data)
        lda = LatentDirichletAllocation(n_components=3, random_state=0)
        lda.fit(X)
        topics = []
        for idx, topic in enumerate(lda.components_):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
            topics.append(f"Topic {idx + 1}: " + ", ".join(top_words))

        # Keyword Extraction
        combined_text = " ".join(column_data)
        word_list = re.findall(r"\b\w+\b", combined_text.lower())  # unigram tokens (currently unused)
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english')
        bigram_counts = bigram_vectorizer.fit_transform([combined_text])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_counts_sum = bigram_counts.toarray().sum(axis=0)
        bigram_frequency = Counter(dict(zip(bigram_features, bigram_counts_sum))).most_common(10)
        keywords = [f"{phrase} ({count} mentions)" for phrase, count in bigram_frequency]

        # Summarization
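        # BART accepts a limited number of input tokens, so split the combined text into word chunks and summarize each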
        def split_text(text, max_length=1000):
            words = text.split()
            for i in range(0, len(words), max_length):
                yield " ".join(words[i:i + max_length])

        summaries = []
        for chunk in split_text(combined_text, max_length=500):
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)

        final_summary = " ".join(summaries)

        # Store results
        results[col] = {
            "Sentiment Analysis Summary": sentiment_summary,
            "Detailed Sentiments": detailed_sentiments,
            "Topics": topics,
            "Keywords": keywords,
            "Summary": final_summary
        }

    return results

def process_file(file):
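    # Run the quantitative (NPS) and qualitative (free-text) analyses on the same uploaded file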
    quantitative_results = analyze_demographics(file)
    qualitative_results = analyze_why_columns(file)

    return quantitative_results, qualitative_results

def app():
    file_input = gr.File(label="Upload Survey Data (Excel format)")
    text_output = gr.JSON(label="Quantitative Analysis Results")
    qualitative_output = gr.JSON(label="Qualitative Analysis Results")

    iface = gr.Interface(
        fn=process_file,
        inputs=file_input,
        outputs=[text_output, qualitative_output],
        title="Survey Data Analyzer",
        description="Analyze both quantitative and qualitative survey data. Upload an Excel file to generate insights."
    )
    return iface

if __name__ == "__main__":
    app().launch(share=True)