import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import os
import tempfile
import time
import io

# Load model and tokenizer once at startup
model_name = "yekaii/ytb-comment-judol-bulk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Set device and move model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def predict_single_comment(comment):
    """
    Process a single comment through the judol detection model

    Args:
        comment: Text string containing the comment

    Returns:
        Float probability of being judol (0-1)
    """
    # Tokenize input
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict without calculating gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        # Class index 1 is taken as the judol (spam) label
        judol_prob = probs[0][1].item()

    return judol_prob


def predict_batch_comments(comments, batch_size=16):
    """
    Process multiple comments in batches

    Args:
        comments: List of comment strings
        batch_size: Number of comments to process at once

    Returns:
        List of judol probabilities for each comment
    """
    results = []

    # Process in batches
    for i in range(0, len(comments), batch_size):
        batch = comments[i:i + batch_size]

        # Tokenize
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            batch_probs = probs[:, 1].cpu().numpy().tolist()

        results.extend(batch_probs)

    return results


def process_single_input(comment_text):
    """
    Process a single comment from the Gradio interface

    Args:
        comment_text: Text string containing the comment

    Returns:
        Formatted result for display
    """
    if not comment_text.strip():
        return "Please enter a comment to analyze."

    start_time = time.time()
    probability = predict_single_comment(comment_text)
    inference_time = time.time() - start_time

    # Format result
    result = f"Probability of being judol: {probability:.4f} ({probability*100:.2f}%)\n"
    result += f"Processing time: {inference_time:.4f} seconds"

    if probability > 0.8:
        risk = "HIGH"
    elif probability > 0.5:
        risk = "MEDIUM"
    else:
        risk = "LOW"

    result += f"\nJudol Risk Level: {risk}"

    return result


def process_file_input(file):
    """
    Process a CSV or TXT file containing multiple comments

    Args:
        file: File object uploaded through Gradio

    Returns:
        DataFrame with results and downloadable CSV
    """
    start_time = time.time()

    try:
        # Check if file is None
        if file is None:
            return "No file uploaded. Please upload a CSV or TXT file.", None, None

        # Debugging: Print out the file object to understand its structure
        print(f"File object type: {type(file)}")
        print(f"File object contents: {file}")

        # In Hugging Face Spaces, file might be a tuple (temp_path, original_name)
        file_path = file
        original_filename = "uploaded_file"  # Default name

        if isinstance(file, tuple) and len(file) >= 2:
            file_path = file[0]  # The temporary path
            original_filename = file[1]  # The original filename
        elif hasattr(file, 'name'):
            file_path = file.name
            if hasattr(file, 'orig_name'):
                original_filename = file.orig_name
            else:
                # Extract filename from path
                original_filename = os.path.basename(file_path)

        print(f"Using file_path: {file_path}")
        print(f"Original filename: {original_filename}")

        # Determine file type and read accordingly
        if original_filename.lower().endswith('.csv'):
            # For CSV files, use pandas to read directly
            df = pd.read_csv(file_path)

            # Check if there's a column that might contain comments
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            if not text_columns:
                print("No text columns found in CSV file")
                return "No text columns found in CSV file", None, None

            # Use the first text column if 'text' or 'comment' not found
            if 'text' in text_columns:
                comment_col = 'text'
            elif 'comment' in text_columns:
                comment_col = 'comment'
            else:
                comment_col = text_columns[0]

            comments = df[comment_col].fillna('').astype(str).tolist()
            print(comments)

        elif original_filename.lower().endswith('.txt'):
            # For TXT files, read lines
            with open(file_path, 'r', encoding='utf-8') as f:
                comments = [line.strip() for line in f if line.strip()]

            # Create a DataFrame for consistent output handling
            df = pd.DataFrame({'text': comments})

        else:
            print("Unsupported file format")
            return "Unsupported file format. Please upload a CSV or TXT file.", None, None

        # Guard against empty input so the summary stats below don't divide by zero
        if not comments:
            return "No comments found in the uploaded file.", None, None

        # Process comments in batches
        probabilities = predict_batch_comments(comments)
        print(probabilities)

        # Create output dataframe
        df['judol_probability'] = probabilities
        df['is_judol'] = df['judol_probability'] > 0.5

        # Extract base filename without extension for output
        base_filename = os.path.splitext(original_filename)[0]
        output_filename = f"{base_filename}_analyzed.csv"

        # Create a temporary file to write to
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
        df.to_csv(temp_file.name, index=False)
        temp_file.close()
        print("Create CSV done")
        print(f"output filename: {output_filename}")

        # Summary stats
        total_comments = len(comments)
        judol_comments = sum(1 for p in probabilities if p > 0.5)
        processing_time = time.time() - start_time

        summary = f"Processed {total_comments} comments in {processing_time:.2f} seconds\n"
        summary += f"Found {judol_comments} potential judol comments ({judol_comments/total_comments*100:.1f}%)\n"
        summary += f"Average processing time per comment: {processing_time/total_comments*1000:.2f} ms"

        # Return the temporary file path with the desired filename
        return summary, df.head(100), temp_file.name

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error details: {error_details}")
        return f"Error processing file: {str(e)}\n\nDetails: {error_details}", None, None


# Define Gradio interface
with gr.Blocks(title="YouTube Judol Comment Detector") as demo:
    gr.Markdown("# YouTube Judol Comment Detector")
    gr.Markdown("Detect potential judi online (judol) spam comments on YouTube videos.")

    with gr.Tab("Single Comment"):
        with gr.Row():
            with gr.Column():
                comment_input = gr.Textbox(
                    label="Enter YouTube comment",
                    placeholder="Type or paste a YouTube comment here",
                    lines=4
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")
            with gr.Column():
                result_output = gr.Textbox(
                    label="Analysis Result",
                    lines=5
                )

        analyze_btn.click(
            fn=process_single_input,
            inputs=comment_input,
            outputs=result_output
        )

    with gr.Tab("Bulk Analysis"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload CSV or TXT file with comments",
                    file_types=[".csv", ".txt"]
                )
                analyze_file_btn = gr.Button("Analyze File", variant="primary")
                gr.Markdown("""
                ## File Format Requirements:
                - CSV files should have a column named 'text' or 'comment'
                - TXT files should have one comment per line
                - Maximum recommended file size: 10MB (approx. 50,000 comments)
                """)
            with gr.Column():
                summary_output = gr.Textbox(
                    label="Analysis Summary",
                    lines=5
                )
                results_table = gr.DataFrame(label="Results Preview (First 100 rows)")
                download_output = gr.File(label="Download Complete Results")

        analyze_file_btn.click(
            fn=process_file_input,
            inputs=file_input,
            outputs=[summary_output, results_table, download_output]
        )

    gr.Markdown("## About")
    gr.Markdown("""
    This model detects Indonesian judi online (judol) spam comments on YouTube.
    - Built with IndoBERT base model
    - Fine-tuned on annotated YouTube comments dataset
    - Detects common judol spam patterns in Indonesian language

    Created by yekaii.
    """)

# Launch the app
demo.launch()
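
# Usage sketch (assumptions: this file is run directly as app.py, e.g. in a
# Hugging Face Space; demo.launch() above blocks until the server stops, and
# Gradio serves on its default local port unless configured otherwise):
#
#   python app.py
#
# Expected bulk-analysis inputs, matching the File Format Requirements shown in the UI:
#   CSV -> a 'text' or 'comment' column (falls back to the first object-dtype column)
#   TXT -> one comment per line
#
# The downloadable results CSV appends 'judol_probability' (float, 0-1) and
# 'is_judol' (bool, probability > 0.5) for each input row.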