leygit committed
Commit ba5291d · verified · 1 Parent(s): 8ef42f4

Upload 2 files

Files changed (2)
  1. spam_ham_dataset.csv +0 -0
  2. untitled3.py +419 -0
spam_ham_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
untitled3.py ADDED
@@ -0,0 +1,419 @@
+ # -*- coding: utf-8 -*-
+ """Untitled3.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1BTaF9lue6oXAqEx5zFRq1cnWWQ9YKCiQ
+ """
+
+ import pandas as pd
+ import numpy as np
+ import torch
+ from transformers import BertTokenizer
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ # Load dataset
+ file_path = 'spam_ham_dataset.csv'
+ df = pd.read_csv(file_path)
+ df.head()
+
+ # Preprocessing:
+ # .str.lower() converts the text column to lower case;
+ # .str.replace(r'[^\w\s]', '', regex=True) removes everything except letters, numbers, and spaces
+ df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
+ df['text'].head()
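+
+ # Illustrative sketch (editor addition, not part of the committed script): the
+ # made-up string below shows what the cleaning step produces on one example.
+ sample = pd.Series(["Subject: WIN $1,000 now!!!"])
+ print(sample.str.lower().str.replace(r'[^\w\s]', '', regex=True)[0])
+ # -> "subject win 1000 now"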
+
+ sns.countplot(x=df['label'])
+ plt.title("Spam vs Ham Distribution")
+ plt.show()
+
+ # Calculate text length metrics
+ df['char_count'] = df['text'].apply(len)
+ df['word_count'] = df['text'].apply(lambda x: len(x.split()))
+
+ # Plot word count distribution for spam and ham
+ plt.figure(figsize=(12, 5))
+ sns.histplot(data=df, x='word_count', hue='label', bins=30, kde=True)
+ plt.xlim(0, 1000)
+ plt.title("Word Count Distribution by Label")
+ plt.xlabel("Number of Words")
+ plt.ylabel("Frequency")
+ plt.show()
+
+ def get_top_words(corpus, n=None):
+     vec = CountVectorizer(stop_words='english').fit(corpus)
+     bag_of_words = vec.transform(corpus)
+     sum_words = bag_of_words.sum(axis=0)
+     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
+     words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
+     return words_freq[:n]
+
+ # Top 10 words for spam
+ top_spam_words = get_top_words(df[df['label'] == "spam"]['text'], n=10)
+ print("Top spam words:", top_spam_words)
+
+ # Top 10 words for ham
+ top_ham_words = get_top_words(df[df['label'] == "ham"]['text'], n=10)
+ print("Top ham words:", top_ham_words)
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.metrics import classification_report
+
+ # TF-IDF Vectorization
+ vectorizer = TfidfVectorizer()
+ X = vectorizer.fit_transform(df['text'])
+ y = df['label_num']  # numeric label column shipped with the dataset (0 = ham, 1 = spam)
+
+ # Train-Test Split
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # Train Naïve Bayes Model
+ nb_model = MultinomialNB()
+ nb_model.fit(X_train, y_train)
+
+ # Predictions
+ y_pred = nb_model.predict(X_test)
+ print(classification_report(y_test, y_pred))
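+
+ # Usage sketch (editor addition): scoring an unseen, made-up message with the
+ # fitted TF-IDF vectorizer and Naive Bayes model.
+ sample_msg = ["free entry claim your prize now"]
+ print(nb_model.predict(vectorizer.transform(sample_msg)))        # 1 = spam, 0 = ham
+ print(nb_model.predict_proba(vectorizer.transform(sample_msg)))  # columns ordered [P(ham), P(spam)]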
+
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from transformers import BertTokenizer, BertForSequenceClassification
+ from torch.utils.data import Dataset, DataLoader
+
+ # Load dataset
+ file_path = 'spam_ham_dataset.csv'
+ df = pd.read_csv(file_path)
+
+ # Convert label column to numeric (0 for ham, 1 for spam)
+ df['label_num'] = df['label'].astype('category').cat.codes
+
+ # Load tokenizer
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+ # Tokenize dataset
+ encodings = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
+ labels = torch.tensor(df['label_num'].values)
+
+ # Custom Dataset
+ class SpamDataset(Dataset):
+     def __init__(self, encodings, labels):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         item = {key: val[idx] for key, val in self.encodings.items()}  # keep as PyTorch tensors
+         item['labels'] = self.labels[idx].long()  # ensure `long` dtype without re-wrapping an existing tensor
+         return item
+
+ # Create dataset
+ dataset = SpamDataset(encodings, labels)
+
+ # Split dataset (80% train, 20% validation)
+ train_size = int(0.8 * len(dataset))
+ val_size = len(dataset) - train_size
+ train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
+
+ # Collate function: stack per-example tensors into batch tensors
+ def collate_fn(batch):
+     keys = batch[0].keys()
+     collated = {key: torch.stack([b[key] for b in batch]) for key in keys}
+     return collated
+
+ # Create DataLoaders
+ train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
+ val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
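+
+ # Sanity check (editor addition): pull one batch and confirm the collate function
+ # stacks tensors as expected; sequence length is at most the max_length=128 set above.
+ batch = next(iter(train_loader))
+ print({key: val.shape for key, val in batch.items()})
+ # e.g. {'input_ids': torch.Size([8, 128]), 'attention_mask': torch.Size([8, 128]), ..., 'labels': torch.Size([8])}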
+
+ # Load BERT model
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+ model.to(device)
+
+ # Define optimizer and loss function
+ optimizer = optim.AdamW(model.parameters(), lr=5e-5)
+ loss_fn = nn.CrossEntropyLoss()
+
+ # Training Loop
+ EPOCHS = 10
+
+ for epoch in range(EPOCHS):
+     model.train()
+     total_loss = 0
+
+     for batch in train_loader:
+         optimizer.zero_grad()
+
+         # Move batch to device and separate out the labels
+         inputs = {key: val.to(device) for key, val in batch.items()}
+         labels = inputs.pop("labels")
+
+         # Forward pass
+         outputs = model(**inputs)
+         loss = loss_fn(outputs.logits, labels)
+
+         # Backward pass
+         loss.backward()
+         optimizer.step()
+
+         total_loss += loss.item()
+
+     avg_loss = total_loss / len(train_loader)
+     print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
+
+ print("Training complete!")
+
+ from sklearn.metrics import classification_report
+ from transformers import BertTokenizer
+ import torch
+ import torch.nn.functional as F
+
+ # Classification function
+ def classify_email(email_text):
+     model.eval()  # set model to evaluation mode
+
+     with torch.no_grad():
+         # Tokenize and convert input text to tensors
+         # (note: inference truncates at max_length=256, while training used 128)
+         inputs = tokenizer(email_text, padding=True, truncation=True, max_length=256, return_tensors="pt")
+
+         # Move inputs to the appropriate device
+         inputs = {key: val.to(device) for key, val in inputs.items()}
+
+         # Get model predictions
+         outputs = model(**inputs)
+         logits = outputs.logits
+
+         # Convert logits to predicted class
+         predictions = torch.argmax(logits, dim=1)
+
+         # Convert logits to probabilities using softmax
+         probs = F.softmax(logits, dim=1)
+         confidence = torch.max(probs).item() * 100  # as a percentage
+
+     # Convert numeric prediction to label
+     result = "Spam" if predictions.item() == 1 else "Ham"
+
+     return {
+         "result": result,
+         "confidence": f"{confidence:.2f}%",
+     }
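+
+ # Example call (editor addition); the email text is invented for illustration and
+ # the exact output depends on how training went.
+ print(classify_email("Congratulations! You have been selected to receive a free gift card."))
+ # e.g. {'result': 'Spam', 'confidence': '97.42%'}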
+
+ # Evaluation function with detailed classification report
+ def evaluate_model_with_report(val_loader):
+     model.eval()  # set model to evaluation mode
+     y_true = []
+     y_pred = []
+     correct = 0
+     total = 0
+
+     with torch.no_grad():
+         for batch in val_loader:
+             inputs = {key: val.to(device) for key, val in batch.items()}
+             labels = inputs.pop("labels")
+
+             outputs = model(**inputs)
+             predictions = torch.argmax(outputs.logits, dim=1)
+
+             # Collect labels and predictions
+             y_true.extend(labels.cpu().numpy())
+             y_pred.extend(predictions.cpu().numpy())
+
+             # Track running accuracy counts
+             correct += (predictions == labels).sum().item()
+             total += labels.size(0)
+
+     # Calculate accuracy
+     accuracy = correct / total if total > 0 else 0
+     print(f"Validation Accuracy: {accuracy:.4f}")
+
+     # Print classification report
+     print("\nClassification Report:")
+     print(classification_report(y_true, y_pred, target_names=["Ham", "Spam"]))
+
+     return accuracy
+
+ # Run evaluation with classification report
+ accuracy = evaluate_model_with_report(val_loader)
+ print(f"Model Validation Accuracy: {accuracy:.4f}")
+
+ ## App Deployment Functions
+
+ import io
+ import base64
+ from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+
+ def generate_performance_metrics():
+     # Collect validation predictions once, then derive every reported metric from them
+     # (a BERT model has no sklearn-style .predict, so run it over val_loader directly)
+     model.eval()
+     y_true, y_pred = [], []
+     with torch.no_grad():
+         for batch in val_loader:
+             inputs = {key: val.to(device) for key, val in batch.items()}
+             labels = inputs.pop("labels")
+             preds = torch.argmax(model(**inputs).logits, dim=1)
+             y_true.extend(labels.cpu().numpy())
+             y_pred.extend(preds.cpu().numpy())
+
+     # output_dict=True yields a nested dict; the default string cannot be indexed
+     report = classification_report(y_true, y_pred, target_names=["Ham", "Spam"], output_dict=True)
+
+     # Render the confusion matrix as a base64 PNG for the <img> tag in the interface
+     fig, ax = plt.subplots()
+     ConfusionMatrixDisplay(confusion_matrix(y_true, y_pred), display_labels=["Ham", "Spam"]).plot(ax=ax)
+     buf = io.BytesIO()
+     fig.savefig(buf, format="png", bbox_inches="tight")
+     plt.close(fig)
+
+     return {
+         "accuracy": f"{report['accuracy']:.2%}",
+         "precision": f"{report['Spam']['precision']:.2%}",
+         "recall": f"{report['Spam']['recall']:.2%}",
+         "f1_score": f"{report['Spam']['f1-score']:.2%}",
+         "confusion_matrix_plot": base64.b64encode(buf.getvalue()).decode(),
+     }
+
+ def email_analysis_pipeline(email_text):
+     # Gradio maps one returned value to each output component, in order,
+     # so return a tuple rather than a set (sets are unordered)
+     results = classify_email(email_text)
+     accuracy = evaluate_model_with_report(val_loader)
+     return results["result"], results["confidence"], f"{accuracy:.2%}"
+
+ ## Gradio Interface
+
+ # In Colab: !pip install gradio  (shell command; invalid syntax in a plain .py file)
+ import gradio as gr
+
+ # Create Gradio Interface
+ def create_interface():
+     performance_metrics = generate_performance_metrics()
+
+     # Introduction - title + brief description
+     with gr.Blocks(css=custom_css) as interface:
+         gr.Markdown("# Spam Email Classification")
+         gr.Markdown(
+             """
+ Brief description of the project here
+             """
+         )
+
+         # Email text input
+         with gr.Row():
+             email_input = gr.Textbox(
+                 lines=8, placeholder="Type or paste your email content here...", label="Email Content"
+             )
+
+         # Email text results and analysis
+         with gr.Row():
+             result_output = gr.HTML(label="Classification Result")
+             confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
+             accuracy_output = gr.Textbox(label="Accuracy", interactive=False)
+
+         analyze_button = gr.Button("Analyze Email 🕵️‍♂️")
+
+         analyze_button.click(
+             fn=email_analysis_pipeline,
+             inputs=email_input,
+             outputs=[result_output, confidence_output, accuracy_output]
+         )
+
+         # Analysis
+         gr.Markdown("## 📊 Model Performance Analytics")
+         with gr.Row():
+             with gr.Column():
+                 gr.Textbox(value=performance_metrics["accuracy"], label="Accuracy", interactive=False, elem_classes=["metric"])
+                 gr.Textbox(value=performance_metrics["precision"], label="Precision", interactive=False, elem_classes=["metric"])
+                 gr.Textbox(value=performance_metrics["recall"], label="Recall", interactive=False, elem_classes=["metric"])
+                 gr.Textbox(value=performance_metrics["f1_score"], label="F1 Score", interactive=False, elem_classes=["metric"])
+             with gr.Column():
+                 gr.Markdown("### Confusion Matrix")
+                 gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")
+
+         gr.Markdown("## 📘 Glossary and Explanation of Labels")
+         gr.Markdown(
+             """
+ ### Labels:
+ - **Spam:** Unwanted or harmful emails flagged by the system.
+ - **Ham:** Legitimate, safe emails.
+
+ ### Metrics:
+ - **Accuracy:** The percentage of correct classifications.
+ - **Precision:** Out of predicted Spam, how many are actually Spam.
+ - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
+ - **F1 Score:** Harmonic mean of Precision and Recall.
+             """
+         )
+
+     return interface
+
+ ## CSS
+
+ # Updated CSS
+ custom_css = """
+ body {
+     font-family: 'Arial', sans-serif;
+     background-image: url('https://cdn.pixabay.com/photo/2016/11/19/15/26/email-1839873_1280.jpg');
+     background-size: cover;
+     background-position: center;
+     background-attachment: fixed;
+     color: #333;
+ }
+ h1, h2, h3 {
+     text-align: center;
+     color: #ffffff;
+     text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
+ }
+ .gradio-container {
+     background-color: rgba(255, 255, 255, 0.8);
+     border-radius: 10px;
+     padding: 20px;
+     box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
+ }
+ button {
+     background-color: #1e90ff;
+     color: white;
+     padding: 10px 20px;
+     border: none;
+     border-radius: 5px;
+     cursor: pointer;
+     font-size: 1.2em;
+     transition: transform 0.2s, background-color 0.3s;
+ }
+ button:hover {
+     background-color: #1c86ee;
+     transform: scale(1.05);
+ }
+ .highlight {
+     background-color: #ffeb3b;
+     font-weight: bold;
+     padding: 0 3px;
+     border-radius: 3px;
+ }
+ .metric {
+     font-size: 1.2em;
+     text-align: center;
+     color: #ffffff;
+     background-color: #4CAF50;
+     border-radius: 8px;
+     padding: 10px;
+     margin: 10px 0;
+     box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
+ }
+ """
+
+ # Launch the interface (custom_css must be defined before create_interface() is called)
+ interface = create_interface()
+ interface.launch(share=True)
+
+ ## Original
+
+ from sklearn.metrics import classification_report
+
+ # Collect predictions and true labels
+ y_true = []
+ y_pred = []
+
+ model.eval()
+ with torch.no_grad():
+     for batch in val_loader:
+         inputs = {key: val.to(device) for key, val in batch.items()}
+         labels = inputs.pop("labels")
+
+         outputs = model(**inputs)
+         predictions = torch.argmax(outputs.logits, dim=1)
+
+         y_true.extend(labels.cpu().numpy())
+         y_pred.extend(predictions.cpu().numpy())
+
+ # Print detailed classification report
+ print(classification_report(y_true, y_pred, target_names=["Ham", "Spam"]))
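+
+ # Compact alternative view (editor addition): the raw confusion matrix over the
+ # same y_true/y_pred collected above.
+ from sklearn.metrics import confusion_matrix
+ print(confusion_matrix(y_true, y_pred))  # rows = true [Ham, Spam], cols = predicted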