import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import os
import tempfile
import time
import traceback
# Load model and tokenizer once at startup
model_name = "yekaii/ytb-comment-judol-bulk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Set device and move model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
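
# Optional sketch (an assumption, not part of the original setup): on a CUDA
# device, half precision roughly halves memory use and can speed up inference,
# at a possible small accuracy cost. Uncomment only if the model tolerates fp16.
# if device.type == "cuda":
#     model.half()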

def predict_single_comment(comment):
    """
    Run a single comment through the judol detection model.

    Args:
        comment: Text string containing the comment.

    Returns:
        Float probability (0-1) that the comment is judol.
    """
    # Tokenize the input and move the tensors to the model's device
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        judol_prob = probs[0][1].item()
    return judol_prob
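
# Usage sketch (the comment string below is hypothetical, for illustration only):
#     prob = predict_single_comment("menang besar, daftar sekarang!")
#     print(f"judol probability: {prob:.4f}")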

def predict_batch_comments(comments, batch_size=16):
    """
    Process multiple comments in batches.

    Args:
        comments: List of comment strings.
        batch_size: Number of comments to process at once.

    Returns:
        List of judol probabilities, one per comment.
    """
    results = []
    # Process in fixed-size batches to bound memory use
    for i in range(0, len(comments), batch_size):
        batch = comments[i:i + batch_size]
        # Tokenize the batch; padding aligns sequences to the longest in the batch
        inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                           padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        # Predict without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            batch_probs = probs[:, 1].cpu().numpy().tolist()
        results.extend(batch_probs)
    return results
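
# Usage sketch (hypothetical inputs): the batched path returns one probability
# per comment, in input order, so it can stand in for repeated single calls:
#     probs = predict_batch_comments(["nice video!", "menang besar, daftar sekarang!"])
#     assert len(probs) == 2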

def process_single_input(comment_text):
    """
    Handle a single comment submitted through the Gradio interface.

    Args:
        comment_text: Text string containing the comment.

    Returns:
        Formatted result string for display.
    """
    if not comment_text.strip():
        return "Please enter a comment to analyze."
    start_time = time.time()
    probability = predict_single_comment(comment_text)
    inference_time = time.time() - start_time
    # Format the result
    result = f"Probability of being judol: {probability:.4f} ({probability*100:.2f}%)\n"
    result += f"Processing time: {inference_time:.4f} seconds"
    if probability > 0.8:
        risk = "HIGH"
    elif probability > 0.5:
        risk = "MEDIUM"
    else:
        risk = "LOW"
    result += f"\nJudol Risk Level: {risk}"
    return result

def process_file_input(file):
    """
    Process a CSV or TXT file containing multiple comments.

    Args:
        file: File object uploaded through Gradio.

    Returns:
        Summary string, a preview DataFrame (first 100 rows), and the path
        to a downloadable CSV with the full results.
    """
    start_time = time.time()
    try:
        if file is None:
            return "No file uploaded. Please upload a CSV or TXT file.", None, None
        # Debugging: inspect the uploaded object to understand its structure
        print(f"File object type: {type(file)}")
        print(f"File object contents: {file}")
        # In Hugging Face Spaces, the file may arrive as a tuple (temp_path, original_name)
        file_path = file
        original_filename = "uploaded_file"  # Default name
        if isinstance(file, tuple) and len(file) >= 2:
            file_path = file[0]          # The temporary path
            original_filename = file[1]  # The original filename
        elif hasattr(file, 'name'):
            file_path = file.name
            if hasattr(file, 'orig_name'):
                original_filename = file.orig_name
            else:
                # Extract the filename from the path
                original_filename = os.path.basename(file_path)
        print(f"Using file_path: {file_path}")
        print(f"Original filename: {original_filename}")
        # Determine the file type and read accordingly
        if original_filename.lower().endswith('.csv'):
            # For CSV files, read directly with pandas
            df = pd.read_csv(file_path)
            # Find columns that might contain comments
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            if not text_columns:
                print("No text columns found in CSV file")
                return "No text columns found in CSV file.", None, None
            # Prefer a 'text' or 'comment' column; otherwise use the first text column
            if 'text' in text_columns:
                comment_col = 'text'
            elif 'comment' in text_columns:
                comment_col = 'comment'
            else:
                comment_col = text_columns[0]
            comments = df[comment_col].fillna('').astype(str).tolist()
        elif original_filename.lower().endswith('.txt'):
            # For TXT files, read one comment per line, skipping blank lines
            with open(file_path, 'r', encoding='utf-8') as f:
                comments = [line.strip() for line in f if line.strip()]
            # Create a DataFrame for consistent output handling
            df = pd.DataFrame({'text': comments})
        else:
            print("Unsupported file format")
            return "Unsupported file format. Please upload a CSV or TXT file.", None, None
        if not comments:
            return "No comments found in the uploaded file.", None, None
        print(f"Loaded {len(comments)} comments")
        # Score the comments in batches
        probabilities = predict_batch_comments(comments)
        # Build the output dataframe; 0.5 is the default decision threshold
        df['judol_probability'] = probabilities
        df['is_judol'] = df['judol_probability'] > 0.5
        # Write the results to a temporary directory, keeping a recognizable
        # filename so the download is named after the uploaded file
        base_filename = os.path.splitext(original_filename)[0]
        output_filename = f"{base_filename}_analyzed.csv"
        output_path = os.path.join(tempfile.mkdtemp(), output_filename)
        df.to_csv(output_path, index=False)
        print(f"Wrote results to: {output_path}")
        # Summary stats
        total_comments = len(comments)
        judol_comments = sum(1 for p in probabilities if p > 0.5)
        processing_time = time.time() - start_time
        summary = f"Processed {total_comments} comments in {processing_time:.2f} seconds\n"
        summary += f"Found {judol_comments} potential judol comments ({judol_comments/total_comments*100:.1f}%)\n"
        summary += f"Average processing time per comment: {processing_time/total_comments*1000:.2f} ms"
        return summary, df.head(100), output_path
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"Error details: {error_details}")
        return f"Error processing file: {str(e)}\n\nDetails: {error_details}", None, None
# Define Gradio interface
with gr.Blocks(title="YouTube Judol Comment Detector") as demo:
    gr.Markdown("# YouTube Judol Comment Detector")
    gr.Markdown("Detect potential judi online (judol) spam comments on YouTube videos.")
    with gr.Tab("Single Comment"):
        with gr.Row():
            with gr.Column():
                comment_input = gr.Textbox(
                    label="Enter YouTube comment",
                    placeholder="Type or paste a YouTube comment here",
                    lines=4
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")
            with gr.Column():
                result_output = gr.Textbox(
                    label="Analysis Result",
                    lines=5
                )
        analyze_btn.click(
            fn=process_single_input,
            inputs=comment_input,
            outputs=result_output
        )
    with gr.Tab("Bulk Analysis"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload CSV or TXT file with comments",
                    file_types=[".csv", ".txt"]
                )
                analyze_file_btn = gr.Button("Analyze File", variant="primary")
                gr.Markdown("""
                ## File Format Requirements:
                - CSV files should have a column named 'text' or 'comment'
                - TXT files should have one comment per line
                - Maximum recommended file size: 10MB (approx. 50,000 comments)
                """)
            with gr.Column():
                summary_output = gr.Textbox(
                    label="Analysis Summary",
                    lines=5
                )
                results_table = gr.DataFrame(label="Results Preview (First 100 rows)")
                download_output = gr.File(label="Download Complete Results")
        analyze_file_btn.click(
            fn=process_file_input,
            inputs=file_input,
            outputs=[summary_output, results_table, download_output]
        )
gr.Markdown("## About")
gr.Markdown("""
This model detects Indonesian judi online (judol) spam comments on YouTube.
- Built with IndoBERT base model
- Fine-tuned on annotated YouTube comments dataset
- Detects common judol spam patterns in Indonesian language
Created by yekaii.
""")
# Launch the app
demo.launch()