import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import os
import tempfile
import time
import io

# Load model and tokenizer once at startup
model_name = "yekaii/ytb-comment-judol-bulk"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Set device and move model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def predict_single_comment(comment):
    """
    Process a single comment through the judol detection model

    Args:
        comment: Text string containing the comment

    Returns:
        Float probability of being judol (0-1)
    """
    # Tokenize input
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict without calculating gradients
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        # Class index 1 is taken as the judol (spam) label
        judol_prob = probs[0][1].item()

    return judol_prob


def predict_batch_comments(comments, batch_size=16):
    """
    Process multiple comments in batches

    Args:
        comments: List of comment strings
        batch_size: Number of comments to process at once

    Returns:
        List of judol probabilities for each comment
    """
    results = []

    # Process in batches
    for i in range(0, len(comments), batch_size):
        batch = comments[i:i + batch_size]

        # Tokenize
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            batch_probs = probs[:, 1].cpu().numpy().tolist()

        results.extend(batch_probs)

    return results


def process_single_input(comment_text):
    """
    Process a single comment from the Gradio interface

    Args:
        comment_text: Text string containing the comment

    Returns:
        Formatted result for display
    """
    if not comment_text.strip():
        return "Please enter a comment to analyze."

    start_time = time.time()
    probability = predict_single_comment(comment_text)
    inference_time = time.time() - start_time

    # Format result
    result = f"Probability of being judol: {probability:.4f} ({probability*100:.2f}%)\n"
    result += f"Processing time: {inference_time:.4f} seconds"

    if probability > 0.8:
        risk = "HIGH"
    elif probability > 0.5:
        risk = "MEDIUM"
    else:
        risk = "LOW"

    result += f"\nJudol Risk Level: {risk}"

    return result


def process_file_input(file):
    """
    Process a CSV or TXT file containing multiple comments

    Args:
        file: File object uploaded through Gradio

    Returns:
        DataFrame with results and downloadable CSV
    """
    start_time = time.time()

    try:
        # Check if file is None
        if file is None:
            return "No file uploaded. Please upload a CSV or TXT file.", None, None

        # Debugging: Print out the file object to understand its structure
        print(f"File object type: {type(file)}")
        print(f"File object contents: {file}")

        # In Hugging Face Spaces, file might be a tuple (temp_path, original_name)
        file_path = file
        original_filename = "uploaded_file"  # Default name

        if isinstance(file, tuple) and len(file) >= 2:
            file_path = file[0]  # The temporary path
            original_filename = file[1]  # The original filename
        elif hasattr(file, 'name'):
            file_path = file.name
            if hasattr(file, 'orig_name'):
                original_filename = file.orig_name
            else:
                # Extract filename from path
                original_filename = os.path.basename(file_path)

        print(f"Using file_path: {file_path}")
        print(f"Original filename: {original_filename}")

        # Determine file type and read accordingly
        if original_filename.lower().endswith('.csv'):
            # For CSV files, use pandas to read directly
            df = pd.read_csv(file_path)

            # Check if there's a column that might contain comments
            text_columns = [col for col in df.columns if df[col].dtype == 'object']
            if not text_columns:
                print("No text columns found in CSV file")
                return "No text columns found in CSV file", None, None

            # Use the first text column if 'text' or 'comment' not found
            if 'text' in text_columns:
                comment_col = 'text'
            elif 'comment' in text_columns:
                comment_col = 'comment'
            else:
                comment_col = text_columns[0]

            comments = df[comment_col].fillna('').astype(str).tolist()
            print(comments)

        elif original_filename.lower().endswith('.txt'):
            # For TXT files, read lines
            with open(file_path, 'r', encoding='utf-8') as f:
                comments = [line.strip() for line in f if line.strip()]

            # Create a DataFrame for consistent output handling
            df = pd.DataFrame({'text': comments})

        else:
            print("Unsupported file format")
            return "Unsupported file format. Please upload a CSV or TXT file.", None, None

        # Guard against empty input so the summary stats below don't divide by zero
        if not comments:
            return "No comments found in the uploaded file.", None, None

        # Process comments in batches
        probabilities = predict_batch_comments(comments)
        print(probabilities)

        # Create output dataframe
        df['judol_probability'] = probabilities
        df['is_judol'] = df['judol_probability'] > 0.5

        # Extract base filename without extension for output
        base_filename = os.path.splitext(original_filename)[0]
        output_filename = f"{base_filename}_analyzed.csv"

        # Create a temporary file to write to
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
        df.to_csv(temp_file.name, index=False)
        temp_file.close()
        print("Create CSV done")
        print(f"output filename: {output_filename}")

        # Summary stats
        total_comments = len(comments)
        judol_comments = sum(1 for p in probabilities if p > 0.5)
        processing_time = time.time() - start_time

        summary = f"Processed {total_comments} comments in {processing_time:.2f} seconds\n"
        summary += f"Found {judol_comments} potential judol comments ({judol_comments/total_comments*100:.1f}%)\n"
        summary += f"Average processing time per comment: {processing_time/total_comments*1000:.2f} ms"

        # Return the temporary file path with the desired filename
        return summary, df.head(100), temp_file.name

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error details: {error_details}")
        return f"Error processing file: {str(e)}\n\nDetails: {error_details}", None, None


# Define Gradio interface
with gr.Blocks(title="YouTube Judol Comment Detector") as demo:
    gr.Markdown("# YouTube Judol Comment Detector")
    gr.Markdown("Detect potential judi online (judol) spam comments on YouTube videos.")

    with gr.Tab("Single Comment"):
        with gr.Row():
            with gr.Column():
                comment_input = gr.Textbox(
                    label="Enter YouTube comment",
                    placeholder="Type or paste a YouTube comment here",
                    lines=4
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")
            with gr.Column():
                result_output = gr.Textbox(
                    label="Analysis Result",
                    lines=5
                )

        analyze_btn.click(
            fn=process_single_input,
            inputs=comment_input,
            outputs=result_output
        )

    with gr.Tab("Bulk Analysis"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload CSV or TXT file with comments",
                    file_types=[".csv", ".txt"]
                )
                analyze_file_btn = gr.Button("Analyze File", variant="primary")
                gr.Markdown("""
                ## File Format Requirements:
                - CSV files should have a column named 'text' or 'comment'
                - TXT files should have one comment per line
                - Maximum recommended file size: 10MB (approx. 50,000 comments)
                """)
            with gr.Column():
                summary_output = gr.Textbox(
                    label="Analysis Summary",
                    lines=5
                )
                results_table = gr.DataFrame(label="Results Preview (First 100 rows)")
                download_output = gr.File(label="Download Complete Results")

        analyze_file_btn.click(
            fn=process_file_input,
            inputs=file_input,
            outputs=[summary_output, results_table, download_output]
        )

    gr.Markdown("## About")
    gr.Markdown("""
    This model detects Indonesian judi online (judol) spam comments on YouTube.
    - Built with IndoBERT base model
    - Fine-tuned on annotated YouTube comments dataset
    - Detects common judol spam patterns in Indonesian language

    Created by yekaii.
    """)

# Launch the app
demo.launch()
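
# Usage sketch (assumptions: this file is run directly as app.py, e.g. in a
# Hugging Face Space; demo.launch() above blocks until the server stops, and
# Gradio serves on its default local port unless configured otherwise):
#
#   python app.py
#
# Expected bulk-analysis inputs, matching the File Format Requirements shown in the UI:
#   CSV -> a 'text' or 'comment' column (falls back to the first object-dtype column)
#   TXT -> one comment per line
#
# The downloadable results CSV appends 'judol_probability' (float, 0-1) and
# 'is_judol' (bool, probability > 0.5) for each input row.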