Spaces:

leygit
/

ITI110_Spam_Classification_Project

Sleeping

App Files Files Community

leygit committed on Feb 26

Commit

563702e

verified ·

1 Parent(s): 116aa9b

Create app.py

Browse files

Files changed (1) hide show

app.py +219 -0

app.py ADDED Viewed

	@@ -0,0 +1,219 @@

+# DistilBERT run 3, added weight_decay=0.01
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from transformers import BertTokenizer
+# Load dataset
+file_path = 'spam_ham_dataset.csv'
+df = pd.read_csv(file_path)
+# Convert labels to numeric
+df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
+# Load tokenizer
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+# Tokenize dataset
+encodings = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
+labels = torch.tensor(df['label_num'].values)
+# Custom Dataset
+class SpamDataset(Dataset):
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+    def __len__(self):
+        return len(self.labels)
+    def __getitem__(self, idx):
+        item = {key: val[idx] for key, val in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
+        return item
+# Create dataset
+dataset = SpamDataset(encodings, labels)
+# Split dataset (80% train, 20% validation)
+train_size = int(0.8 * len(dataset))
+val_size = len(dataset) - train_size
+train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
+# DataLoader with batch size
+def collate_fn(batch):
+    keys = batch[0].keys()
+    return {key: torch.stack([b[key] for b in batch]) for key in keys}
+train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
+val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
+# Load the trained model
+def load_model(model_path="distilbert_spam_model.pt"):
+    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+    model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))  # Load model weights
+    model.eval()  # Set model to evaluation mode
+    return model
+# Evaluation
+model.eval()
+correct = 0
+total = 0
+with torch.no_grad():
+    for batch in val_loader:
+        inputs = {key: val.to(device) for key, val in batch.items()}
+        labels = inputs.pop("labels").to(device)
+        outputs = model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=1)
+        correct += (predictions == labels).sum().item()
+        total += labels.size(0)
+accuracy = correct / total
+print(f"Validation Accuracy: {accuracy:.4f}")
+# Classification function
+def classify_email(email_text):
+    model.eval()  # Set model to evaluation mode
+    with torch.no_grad():
+        # Tokenize and convert input text to tensor
+        inputs = tokenizer(email_text, padding=True, truncation=True, max_length=256, return_tensors="pt")
+        # Move inputs to the appropriate device
+        inputs = {key: val.to(device) for key, val in inputs.items()}
+        # Get model predictions
+        outputs = model(**inputs)
+        logits = outputs.logits
+        # Convert logits to predicted class
+        predictions = torch.argmax(logits, dim=1)
+        # Convert logits to probabilities using softmax
+        probs = F.softmax(logits, dim=1)
+        confidence = torch.max(probs).item() * 100  # Convert to percentage
+    # Convert numeric prediction to label
+    result = "Spam" if predictions.item() == 1 else "Ham"
+    return {
+        "result": result,
+        "confidence": f"{confidence:.2f}%",
+    }
+# Evaluation function with detailed classification report
+def evaluate_model_with_report(val_loader):
+    model.eval()  # Set model to evaluation mode
+    y_true = []
+    y_pred = []
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for batch in val_loader:
+            inputs = {key: val.to(device) for key, val in batch.items()}
+            labels = inputs.pop("labels").to(device)
+            outputs = model(**inputs)
+            predictions = torch.argmax(outputs.logits, dim=1)
+            # Collect labels and predictions
+            y_true.extend(labels.cpu().numpy())
+            y_pred.extend(predictions.cpu().numpy())
+            # Calculate accuracy
+            correct += (predictions == labels).sum().item()
+            total += labels.size(0)
+    # Calculate accuracy
+    accuracy = correct / total if total > 0 else 0
+    print(f"Validation Accuracy: {accuracy:.4f}")
+    # Print classification report
+    print("\nClassification Report:")
+    print(classification_report(y_true, y_pred, target_names=["Ham", "Spam"]))
+    return accuracy
+# Run evaluation with classification report
+# The call prints the accuracy and the per-class report itself; the returned
+# accuracy is then printed once more under a different heading.
+accuracy = evaluate_model_with_report(val_loader)
+print(f"Model Validation Accuracy: {accuracy:.4f}")
+## Gradio Interface
+import gradio as gr
+# Create Gradio Interface
+# Builds the gr.Blocks UI (input box, result row, metrics panel, glossary) and
+# returns it without launching it.
+# NOTE(review): `generate_performance_metrics`, `custom_css` and
+# `email_analysis_pipeline` are not defined in the visible source -- confirm
+# they exist elsewhere, otherwise this function raises NameError.
+def create_interface():
+    # Read below with the keys "accuracy", "precision", "recall", "f1_score"
+    # and "confusion_matrix_plot".
+    performance_metrics = generate_performance_metrics()
+    # Introduction - Title + Brief Description
+    with gr.Blocks(css=custom_css) as interface:
+        gr.Markdown("Spam Email Classification")
+        gr.Markdown(
+            """
+            Brief description of the project here
+            """
+        )
+        # Email Text Input
+        with gr.Row():
+            email_input = gr.Textbox(
+                lines=8, placeholder="Type or paste your email content here...", label="Email Content"
+            )
+        # Email Text Results and Analysis
+        with gr.Row():
+            result_output = gr.HTML(label="Classification Result") # label = [function that prints classification result]
+            confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
+            accuracy_output = gr.Textbox(label="Accuracy", interactive=False)
+        analyze_button = gr.Button("Analyze Email 🕵️‍♂️")
+        # Clicking the button passes the e-mail text to the handler; three
+        # output components are wired up, so the handler presumably has to
+        # return three values in this order -- verify against its definition.
+        analyze_button.click(
+            fn=email_analysis_pipeline,
+            inputs=email_input,
+            outputs=[result_output, confidence_output, accuracy_output]
+        )
+        # Analysis
+        gr.Markdown("## 📊 Model Performance Analytics")
+        with gr.Row():
+            # Left column: one read-only textbox per scalar metric.
+            with gr.Column():
+                gr.Textbox(value=performance_metrics["accuracy"], label="Accuracy", interactive=False, elem_classes=["metric"])
+                gr.Textbox(value=performance_metrics["precision"], label="Precision", interactive=False, elem_classes=["metric"])
+                gr.Textbox(value=performance_metrics["recall"], label="Recall", interactive=False, elem_classes=["metric"])
+                gr.Textbox(value=performance_metrics["f1_score"], label="F1 Score", interactive=False, elem_classes=["metric"])
+            # Right column: the confusion matrix, embedded as a data URI
+            # (the value is interpolated as base64 PNG data).
+            with gr.Column():
+                gr.Markdown("### Confusion Matrix")
+                gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")
+        gr.Markdown("## 📘 Glossary and Explanation of Labels")
+        gr.Markdown(
+            """
+            ### Labels:
+            - **Spam:** Unwanted or harmful emails flagged by the system.
+            - **Ham:** Legitimate, safe emails.
+            ### Metrics:
+            - **Accuracy:** The percentage of correct classifications.
+            - **Precision:** Out of predicted Spam, how many are actually Spam.
+            - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
+            - **F1 Score:** Harmonic mean of Precision and Recall.
+            """
+        )
+    return interface
+# Launch the interface
+# Runs at module level, so the UI is built and served as soon as this file is
+# executed.
+interface = create_interface()
+# NOTE(review): share=True requests a public share link from Gradio; this is
+# presumably unnecessary when the app is hosted as a Space -- confirm.
+interface.launch(share=True)