import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import os
import tempfile
import time
import traceback
# Load model and tokenizer once at startup
model_name = "yekaii/ytb-comment-judol-bulk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Set device and move model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
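
# Optional sketch (an assumption, not part of the original setup): on a CUDA
# device, half precision roughly halves memory use and can speed up inference,
# at a possible small accuracy cost. Uncomment only if the model tolerates fp16.
# if device.type == "cuda":
#     model.half()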

def predict_single_comment(comment):
    """
    Run a single comment through the judol detection model.

    Args:
        comment: Text string containing the comment.

    Returns:
        Float probability (0-1) that the comment is judol.
    """
    # Tokenize the input and move the tensors to the model's device
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        judol_prob = probs[0][1].item()
    return judol_prob
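
# Usage sketch (the comment string below is hypothetical, for illustration only):
#     prob = predict_single_comment("menang besar, daftar sekarang!")
#     print(f"judol probability: {prob:.4f}")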

def predict_batch_comments(comments, batch_size=16):
    """
    Process multiple comments in batches.

    Args:
        comments: List of comment strings.
        batch_size: Number of comments to process at once.

    Returns:
        List of judol probabilities, one per comment.
    """
    results = []
    # Process in fixed-size batches to bound memory use
    for i in range(0, len(comments), batch_size):
        batch = comments[i:i + batch_size]
        # Tokenize the batch; padding aligns sequences to the longest in the batch
        inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                           padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        # Predict without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            batch_probs = probs[:, 1].cpu().numpy().tolist()
        results.extend(batch_probs)
    return results
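
# Usage sketch (hypothetical inputs): the batched path returns one probability
# per comment, in input order, so it can stand in for repeated single calls:
#     probs = predict_batch_comments(["nice video!", "menang besar, daftar sekarang!"])
#     assert len(probs) == 2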

def process_single_input(comment_text):
    """
    Handle a single comment submitted through the Gradio interface.

    Args:
        comment_text: Text string containing the comment.

    Returns:
        Formatted result string for display.
    """
    if not comment_text.strip():
        return "Please enter a comment to analyze."
    start_time = time.time()
    probability = predict_single_comment(comment_text)
    inference_time = time.time() - start_time
    # Format the result
    result = f"Probability of being judol: {probability:.4f} ({probability*100:.2f}%)\n"
    result += f"Processing time: {inference_time:.4f} seconds"
    if probability > 0.8:
        risk = "HIGH"
    elif probability > 0.5:
        risk = "MEDIUM"
    else:
        risk = "LOW"
    result += f"\nJudol Risk Level: {risk}"
    return result

def process_file_input(file):
    """
    Process a CSV or TXT file containing multiple comments.

    Args:
        file: File object uploaded through Gradio.

    Returns:
        Summary string, a preview DataFrame (first 100 rows), and the path
        to a downloadable CSV with the full results.
    """
    start_time = time.time()
    try:
        if file is None:
            return "No file uploaded. Please upload a CSV or TXT file.", None, None
        # Debugging: inspect the uploaded object to understand its structure
        print(f"File object type: {type(file)}")
        print(f"File object contents: {file}")
        # In Hugging Face Spaces, the file may arrive as a tuple (temp_path, original_name)
        file_path = file
        original_filename = "uploaded_file"  # Default name
        if isinstance(file, tuple) and len(file) >= 2:
            file_path = file[0]          # The temporary path
            original_filename = file[1]  # The original filename
        elif hasattr(file, 'name'):
            file_path = file.name
            if hasattr(file, 'orig_name'):
                original_filename = file.orig_name
            else:
                # Extract the filename from the path
                original_filename = os.path.basename(file_path)
        print(f"Using file_path: {file_path}")
        print(f"Original filename: {original_filename}")
        # Determine the file type and read accordingly
        if original_filename.lower().endswith('.csv'):
            # For CSV files, read directly with pandas
            df = pd.read_csv(file_path)
            # Find columns that might contain comments
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            if not text_columns:
                print("No text columns found in CSV file")
                return "No text columns found in CSV file.", None, None
            # Prefer a 'text' or 'comment' column; otherwise use the first text column
            if 'text' in text_columns:
                comment_col = 'text'
            elif 'comment' in text_columns:
                comment_col = 'comment'
            else:
                comment_col = text_columns[0]
            comments = df[comment_col].fillna('').astype(str).tolist()
        elif original_filename.lower().endswith('.txt'):
            # For TXT files, read one comment per line, skipping blank lines
            with open(file_path, 'r', encoding='utf-8') as f:
                comments = [line.strip() for line in f if line.strip()]
            # Create a DataFrame for consistent output handling
            df = pd.DataFrame({'text': comments})
        else:
            print("Unsupported file format")
            return "Unsupported file format. Please upload a CSV or TXT file.", None, None
        if not comments:
            return "No comments found in the uploaded file.", None, None
        print(f"Loaded {len(comments)} comments")
        # Score the comments in batches
        probabilities = predict_batch_comments(comments)
        # Build the output dataframe; 0.5 is the default decision threshold
        df['judol_probability'] = probabilities
        df['is_judol'] = df['judol_probability'] > 0.5
        # Write the results to a temporary directory, keeping a recognizable
        # filename so the download is named after the uploaded file
        base_filename = os.path.splitext(original_filename)[0]
        output_filename = f"{base_filename}_analyzed.csv"
        output_path = os.path.join(tempfile.mkdtemp(), output_filename)
        df.to_csv(output_path, index=False)
        print(f"Wrote results to: {output_path}")
        # Summary stats
        total_comments = len(comments)
        judol_comments = sum(1 for p in probabilities if p > 0.5)
        processing_time = time.time() - start_time
        summary = f"Processed {total_comments} comments in {processing_time:.2f} seconds\n"
        summary += f"Found {judol_comments} potential judol comments ({judol_comments/total_comments*100:.1f}%)\n"
        summary += f"Average processing time per comment: {processing_time/total_comments*1000:.2f} ms"
        return summary, df.head(100), output_path
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"Error details: {error_details}")
        return f"Error processing file: {str(e)}\n\nDetails: {error_details}", None, None
# Define Gradio interface
with gr.Blocks(title="YouTube Judol Comment Detector") as demo:
    gr.Markdown("# YouTube Judol Comment Detector")
    gr.Markdown("Detect potential judi online (judol) spam comments on YouTube videos.")
    with gr.Tab("Single Comment"):
        with gr.Row():
            with gr.Column():
                comment_input = gr.Textbox(
                    label="Enter YouTube comment",
                    placeholder="Type or paste a YouTube comment here",
                    lines=4
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")
            with gr.Column():
                result_output = gr.Textbox(
                    label="Analysis Result",
                    lines=5
                )
        analyze_btn.click(
            fn=process_single_input,
            inputs=comment_input,
            outputs=result_output
        )
    with gr.Tab("Bulk Analysis"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload CSV or TXT file with comments",
                    file_types=[".csv", ".txt"]
                )
                analyze_file_btn = gr.Button("Analyze File", variant="primary")
                gr.Markdown("""
                ## File Format Requirements:
                - CSV files should have a column named 'text' or 'comment'
                - TXT files should have one comment per line
                - Maximum recommended file size: 10MB (approx. 50,000 comments)
                """)
            with gr.Column():
                summary_output = gr.Textbox(
                    label="Analysis Summary",
                    lines=5
                )
                results_table = gr.DataFrame(label="Results Preview (First 100 rows)")
                download_output = gr.File(label="Download Complete Results")
        analyze_file_btn.click(
            fn=process_file_input,
            inputs=file_input,
            outputs=[summary_output, results_table, download_output]
        )
gr.Markdown("## About")
gr.Markdown("""
This model detects Indonesian judi online (judol) spam comments on YouTube.
- Built with IndoBERT base model
- Fine-tuned on annotated YouTube comments dataset
- Detects common judol spam patterns in Indonesian language
Created by yekaii.
""")
# Launch the app
demo.launch()