Update app.py
app.py
CHANGED
@@ -1,270 +1,74 @@
 # -*- coding: utf-8 -*-
-"""Emotion Detection NLP Mental Health
+"""Emotion Detection NLP Mental Health"""
 
-Automatically generated by Colab.
-
-Original file is located at
-    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
-"""
-
-'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
-# THEN FEEL FREE TO DELETE THIS CELL.
-# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
-# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
-# NOTEBOOK.
-import kagglehub
-thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')
-
-print('Data source import complete.')'''
-
-"""# Introduction
-
-In recent years, mental health awareness has grown, leading to a greater emphasis on making support more accessible to everyone. Artificial Intelligence (AI) is playing a pivotal role in bridging the gap between those in need of mental health advice and the limited number of qualified professionals available. The dataset provided in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.
-
-The dataset used in this project consists of anonymized conversations between patients and experienced psychologists, where we will concentrate on detecting the emotional context of the dialogue. By understanding the emotions present in these exchanges, the NLP model will be able to respond more appropriately and offer tailored advice based on the patient's emotional state.
-
-## Purpose
-
-The notebook will explore, preprocess, and model the data with the goal of improving emotion detection in patient conversations. This will allow us to understand the emotional landscape of mental health discussions and create AI systems capable of providing emotionally aware responses.
-
-# Libraries
-"""
-
-'''#Download and Extracting Data from Kaggle
-import os
-import zipfile'''
-
-
-# Data Preprcessing
 import string
 import re
-from warnings import filterwarnings
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from PIL import Image
-
-
 import nltk
-
+import pandas as pd
 from nltk.tokenize import word_tokenize
-from nltk.
+from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-
-
-
-
-
-
-
-from sklearn.
+from transformers import pipeline
+import gradio as gr
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 from sklearn.feature_extraction.text import TfidfVectorizer
+from warnings import filterwarnings
 
-
-filterwarnings('ignore')
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_colwidth', None)
-pd.set_option('display.width', 200)
-pd.set_option('display.float_format', lambda x: '%.2f' % x)
-
-# Download necessary NLTK resources
-import nltk
+# NLTK Downloads
 nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('punkt')
 nltk.download('wordnet')
-
-"""# Data
-
-## Download and Extracting
-"""
-
-'''# Downlaod the dataset using kaggle API
-os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")
-
-#Extract the download zip files
-dataset_zip='nlp-mental-health-conversations.zip'
-extracted_folder='nlp_mental_health_conversations'
-
-#Extract the dataset
-with zipfile.ZipFile(dataset_zip,'r') as zip_ref:
-    zip_ref.extractall(extracted_folder)
-
-print("Dataset downloaded and extracted successfully.")'''
-
-"""## Explore Data"""
-
-data = pd.read_csv("train.csv")
-
-
-data.head()
-
-reponse=data.loc[0,"Response"]
-print("Length Before text preprocessing : ",len(reponse))
-
-"""## Text Preprocessing
-- Normlaize
-- Punctuation
-- Numbers
-- StopWords
-- Lemmezation
-- Removing Words
-"""
 
-#
+# Text Preprocessing
 lemmatizer = WordNetLemmatizer()
 
 def clean_text(text):
-
-    text = str(text)
-    #
-    text =
-    # Remove punctuation
-    text = text.translate(str.maketrans('', '', string.punctuation))
-    # Remove numbers
-    text = re.sub(r'\d+', '', text)
-    # Tokenize text
+    """Cleans and preprocesses the input text."""
+    text = str(text).lower()
+    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
+    text = re.sub(r'\d+', '', text)  # Remove numbers
     tokens = word_tokenize(text)
-    # Remove stop words
     stop_words = set(stopwords.words('english'))
-    tokens = [word for word in tokens if word not in stop_words]
-    # Lemmatize
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-
-    # Join tokens back into a string
+    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize words
     return ' '.join(tokens)
 
-#
+# Load data
+data = pd.read_csv("train.csv")
 data['Context'] = data['Context'].apply(clean_text)
 
-#
-
-# Let's remove words used less than 1
-temp_Context = pd.Series(' '.join(data['Context']).split()).value_counts()
-drops = temp_Context[temp_Context <= 1]
-data['Context'] = data['Context'].apply(lambda x: " ".join(x for x in x.split() if x not in drops))
-
-"""## Text visualization"""
-
-tf_Context = data["Context"].apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0).reset_index()
-tf_Context.columns = ["words", "tf"]
-tf_Context.sort_values("tf", ascending=False)
-
-# Barplot for Context
-
-tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
-plt.show()
-
-"""# Emotions Anaylsis"""
-
-from transformers import pipeline
-
-# Extract and clean 'Context' column
-contexts = data['Context']
-
-# Load pre-trained emotion detection model
+# Emotion Detection Model
 emotion_model = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')
-
-# Analyze emotions in 'Context'
+contexts = data['Context']
 emotions = contexts.apply(lambda x: emotion_model(x)[0]['label'])
-
-# Add detected emotions as a new column
 data['Detected_Emotion'] = emotions
 
-
-
-data['Detected_Emotion'].value_counts()
-
-"""# Feature Extraction"""
-
-# Initialize TF-IDF Vectorizer
+# Feature Extraction
 vectorizer = TfidfVectorizer()
+tfidf_matrix = vectorizer.fit_transform(data['Context'])
+X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix.toarray(), data['Detected_Emotion'], test_size=0.3, random_state=42)
 
-
-#
-tfidf_matrix = vectorizer.fit_transform(contexts)
-
-# Convert to array (if needed)
-tfidf_array = tfidf_matrix.toarray()
-
-"""# Model
-
-## Data Spilting
-"""
-
-from sklearn.model_selection import train_test_split
-
-# Split the data
-X_train, X_test, y_train, y_test = train_test_split(tfidf_array, data['Detected_Emotion'], test_size=0.3, random_state=42)
-
-from sklearn.ensemble import RandomForestClassifier
-
-# Initialize the model
+# Train a Random Forest Classifier
 model = RandomForestClassifier()
-
-"""## Fine Tuning"""
-
-from sklearn.model_selection import GridSearchCV
-
-# Define the parameter grid
-param_grid = {
-    'n_estimators': [100, 200, 300],
-    'max_depth': [None, 10, 20, 30]
-}
-
-# Perform grid search
-grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
-grid_search.fit(X_train, y_train)
-
-# Best parameters
-print(f'Best parameters: {grid_search.best_params_}')
-
-"""# Train and Evaluation
-
-## Train
-"""
-
-model = RandomForestClassifier()
-# Train the model
 model.fit(X_train, y_train)
 
-
-
-
-
-
-
-
-
-# Calculate accuracy
-accuracy = accuracy_score(y_test, y_pred)
-print(f'Accuracy: {accuracy}')
-
-# Print classification report
-print("Classification Report:")
-print(classification_report(y_test, y_pred))
-
-# Generate confusion matrix
-conf_matrix = confusion_matrix(y_test, y_pred)
-print("Confusion Matrix:")
-print(conf_matrix)
-
-# Plot confusion matrix
-plt.figure(figsize=(10, 7))
-sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_pred), yticklabels=np.unique(y_test))
-plt.xlabel('Predicted')
-plt.ylabel('Actual')
-plt.title('Confusion Matrix')
-plt.show()
-
-"""## Test Unseen Data"""
-
-# Example new text
-new_text = ["let's leave i am scared"]
+# Function to predict emotion of new text
+def predict_emotion(text):
+    """Predicts the emotion for the given text."""
+    cleaned_text = clean_text(text)
+    tfidf_text = vectorizer.transform([cleaned_text])
+    predicted_emotion = model.predict(tfidf_text)
+    return predicted_emotion[0]
 
-#
-
-new_text_tfidf = vectorizer.transform(new_text_cleaned)
+# Gradio Interface
+iface = gr.Interface(fn=predict_emotion, inputs="text", outputs="text", live=True)
 
-#
-
-
+# Launch the Gradio Interface
+if __name__ == "__main__":
+    iface.launch()