import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import os
import tempfile
import time

# Load model and tokenizer once at startup
model_name = "yekaii/ytb-comment-judol-bulk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Set device, move the model to it, and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
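# An equivalent, simpler setup would be the transformers pipeline API
# (a sketch, assuming the checkpoint is a standard sequence-classification model):
#   from transformers import pipeline
#   judol_pipe = pipeline("text-classification", model=model_name,
#                         device=0 if torch.cuda.is_available() else -1)
# The manual tokenizer/model setup here is kept for explicit batching control.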
def predict_single_comment(comment):
    """
    Process a single comment through the judol detection model.

    Args:
        comment: Text string containing the comment.

    Returns:
        Float probability of being judol (0-1).
    """
    # Tokenize input
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict without calculating gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        judol_prob = probs[0][1].item()

    return judol_prob
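# Illustrative behavior (exact scores depend on the fine-tuned weights, and
# index 1 is assumed to be the judol class in this checkpoint's label order):
#   predict_single_comment("GACOR BANGET, langsung daftar di situs ini")  # -> high
#   predict_single_comment("Videonya bagus banget, makasih!")             # -> low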
def predict_batch_comments(comments, batch_size=16):
    """
    Process multiple comments in batches.

    Args:
        comments: List of comment strings.
        batch_size: Number of comments to process at once.

    Returns:
        List of judol probabilities for each comment.
    """
    results = []

    # Process in batches
    for i in range(0, len(comments), batch_size):
        batch = comments[i:i+batch_size]

        # Tokenize
        inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                           padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            batch_probs = probs[:, 1].cpu().numpy().tolist()

        results.extend(batch_probs)

    return results
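# batch_size=16 is a conservative default: on a GPU, larger batches (e.g. 64)
# usually raise throughput, while on CPU the gains are small. Because
# padding=True pads each batch to its longest member, grouping comments of
# similar length would also reduce wasted computation.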
def process_single_input(comment_text):
    """
    Process a single comment from the Gradio interface.

    Args:
        comment_text: Text string containing the comment.

    Returns:
        Formatted result string for display.
    """
    if not comment_text.strip():
        return "Please enter a comment to analyze."

    start_time = time.time()
    probability = predict_single_comment(comment_text)
    inference_time = time.time() - start_time

    # Format result
    result = f"Probability of being judol: {probability:.4f} ({probability*100:.2f}%)\n"
    result += f"Processing time: {inference_time:.4f} seconds"

    if probability > 0.8:
        risk = "HIGH"
    elif probability > 0.5:
        risk = "MEDIUM"
    else:
        risk = "LOW"
    result += f"\nJudol Risk Level: {risk}"

    return result
def process_file_input(file):
    """
    Process a CSV or TXT file containing multiple comments.

    Args:
        file: File object uploaded through Gradio.

    Returns:
        Tuple of (summary string, preview DataFrame, path to downloadable CSV).
    """
    start_time = time.time()
    try:
        if file is None:
            return "No file uploaded. Please upload a CSV or TXT file.", None, None

        # Depending on the Gradio version, the upload may arrive as a plain
        # path string, a (temp_path, original_name) tuple, or a tempfile-like
        # object with .name / .orig_name attributes.
        file_path = file
        original_filename = "uploaded_file"  # Default name
        if isinstance(file, tuple) and len(file) >= 2:
            file_path = file[0]          # The temporary path
            original_filename = file[1]  # The original filename
        elif hasattr(file, 'name'):
            file_path = file.name
            if hasattr(file, 'orig_name'):
                original_filename = file.orig_name
            else:
                # Extract the filename from the temporary path
                original_filename = os.path.basename(file_path)
        elif isinstance(file, str):
            original_filename = os.path.basename(file)

        print(f"Using file_path: {file_path}")
        print(f"Original filename: {original_filename}")
        # Determine file type and read accordingly
        if original_filename.lower().endswith('.csv'):
            # For CSV files, use pandas to read directly
            df = pd.read_csv(file_path)

            # Find columns that might contain comments
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            if not text_columns:
                return "No text columns found in CSV file", None, None

            # Prefer a 'text' or 'comment' column; otherwise fall back to the
            # first text column
            if 'text' in text_columns:
                comment_col = 'text'
            elif 'comment' in text_columns:
                comment_col = 'comment'
            else:
                comment_col = text_columns[0]

            comments = df[comment_col].fillna('').astype(str).tolist()
        elif original_filename.lower().endswith('.txt'):
            # For TXT files, read one comment per line, skipping blanks
            with open(file_path, 'r', encoding='utf-8') as f:
                comments = [line.strip() for line in f if line.strip()]
            # Create a DataFrame for consistent output handling
            df = pd.DataFrame({'text': comments})
        else:
            return "Unsupported file format. Please upload a CSV or TXT file.", None, None

        # Guard against empty files (also avoids division by zero below)
        if not comments:
            return "No comments found in the uploaded file.", None, None

        # Process comments in batches
        probabilities = predict_batch_comments(comments)
        # Create output dataframe
        df['judol_probability'] = probabilities
        df['is_judol'] = df['judol_probability'] > 0.5

        # Write results to a CSV named after the input file so the download
        # keeps a meaningful name
        base_filename = os.path.splitext(original_filename)[0]
        output_filename = f"{base_filename}_analyzed.csv"
        output_path = os.path.join(tempfile.mkdtemp(), output_filename)
        df.to_csv(output_path, index=False)

        # Summary stats
        total_comments = len(comments)
        judol_comments = sum(1 for p in probabilities if p > 0.5)
        processing_time = time.time() - start_time

        summary = f"Processed {total_comments} comments in {processing_time:.2f} seconds\n"
        summary += f"Found {judol_comments} potential judol comments ({judol_comments/total_comments*100:.1f}%)\n"
        summary += f"Average processing time per comment: {processing_time/total_comments*1000:.2f} ms"

        return summary, df.head(100), output_path
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error details: {error_details}")
        return f"Error processing file: {str(e)}\n\nDetails: {error_details}", None, None
# Define the Gradio interface
with gr.Blocks(title="YouTube Judol Comment Detector") as demo:
    gr.Markdown("# YouTube Judol Comment Detector")
    gr.Markdown("Detect potential judi online (judol) spam comments on YouTube videos.")

    with gr.Tab("Single Comment"):
        with gr.Row():
            with gr.Column():
                comment_input = gr.Textbox(
                    label="Enter YouTube comment",
                    placeholder="Type or paste a YouTube comment here",
                    lines=4
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")
            with gr.Column():
                result_output = gr.Textbox(
                    label="Analysis Result",
                    lines=5
                )
        analyze_btn.click(
            fn=process_single_input,
            inputs=comment_input,
            outputs=result_output
        )

    with gr.Tab("Bulk Analysis"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload CSV or TXT file with comments",
                    file_types=[".csv", ".txt"]
                )
                analyze_file_btn = gr.Button("Analyze File", variant="primary")
                gr.Markdown("""
                ## File Format Requirements:
                - CSV files should have a column named 'text' or 'comment'
                - TXT files should have one comment per line
                - Maximum recommended file size: 10MB (approx. 50,000 comments)
                """)
            with gr.Column():
                summary_output = gr.Textbox(
                    label="Analysis Summary",
                    lines=5
                )
                results_table = gr.DataFrame(label="Results Preview (First 100 rows)")
                download_output = gr.File(label="Download Complete Results")
        analyze_file_btn.click(
            fn=process_file_input,
            inputs=file_input,
            outputs=[summary_output, results_table, download_output]
        )
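    # Each Button.click call binds a handler: the `inputs` components are
    # passed positionally as the function's arguments, and its return values
    # are matched positionally to the `outputs` components.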
gr.Markdown("## About") | |
gr.Markdown(""" | |
This model detects Indonesian judi online (judol) spam comments on YouTube. | |
- Built with IndoBERT base model | |
- Fine-tuned on annotated YouTube comments dataset | |
- Detects common judol spam patterns in Indonesian language | |
Created by yekaii. | |
""") | |
# Launch the app | |
demo.launch() |