MHamdan committed
Commit 8cc50db · verified · 1 Parent(s): 625d455

Update files

To simplify the agent.
Files changed (4)
  1. app.py +90 -204
  2. requirements.txt +4 -8
  3. smart_web_analyzer.py +184 -0
  4. space.yml +3 -0
app.py CHANGED
@@ -1,228 +1,114 @@
- import gradio as gr
- import json
- from smolagents import load_tool
- import time
- from datetime import datetime
- import plotly.graph_objects as go
- from fpdf import FPDF
- import tempfile
- import os
-
- # Load the analyzer with caching
- analyzer = load_tool("MHamdan/smart-web-analyzer-plus", trust_remote_code=True)
- analysis_cache = {}
-
- def create_sentiment_chart(sentiment_data):
-     """Creates an interactive bar chart for sentiment analysis."""
-     sections = []
-     scores = []
-
-     for item in sentiment_data['sections']:
-         sections.append(f"Section {item['section']}")
-         scores.append(item['score'])
-
-     fig = go.Figure(data=[
-         go.Bar(
-             x=sections,
-             y=scores,
-             marker_color='rgb(55, 83, 109)',
-             text=scores,
-             textposition='auto'
-         )
-     ])
-
-     fig.update_layout(
-         title='Sentiment Analysis by Section',
-         xaxis_title='Content Sections',
-         yaxis_title='Sentiment Score (1-5)',
-         yaxis_range=[0, 5]
-     )
-
-     return fig
-
- def generate_pdf_report(analysis_result):
-     """Generates a PDF report from analysis results."""
-     pdf = FPDF()
-     pdf.add_page()
-
-     # Header
-     pdf.set_font('Arial', 'B', 16)
-     pdf.cell(0, 10, 'Content Analysis Report', 0, 1, 'C')
-     pdf.line(10, 30, 200, 30)
-
-     # Date
-     pdf.set_font('Arial', '', 10)
-     pdf.cell(0, 10, f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', 0, 1)
-
-     # Content
-     pdf.set_font('Arial', 'B', 12)
-     pdf.cell(0, 10, 'Basic Statistics:', 0, 1)
-     pdf.set_font('Arial', '', 10)
-
-     stats = analysis_result.get('stats', {})
-     for key, value in stats.items():
-         pdf.cell(0, 10, f'{key.title()}: {value}', 0, 1)
-
-     if 'summary' in analysis_result:
-         pdf.set_font('Arial', 'B', 12)
-         pdf.cell(0, 10, 'Summary:', 0, 1)
-         pdf.set_font('Arial', '', 10)
-         pdf.multi_cell(0, 10, analysis_result['summary'])
-
-     # Save to temporary file
-     temp_dir = tempfile.gettempdir()
-     pdf_path = os.path.join(temp_dir, 'analysis_report.pdf')
-     pdf.output(pdf_path)
-
-     return pdf_path
-
- def process_content(input_text, mode, theme, progress=gr.Progress()):
-     """Main processing function with progress updates."""
-     try:
-         # Check cache
-         cache_key = f"{input_text}_{mode}"
-         if cache_key in analysis_cache:
-             return (
-                 analysis_cache[cache_key],
-                 "Content preview unavailable for cached results",
-                 "Using cached results",
-                 None
-             )
-
-         # Process in steps
-         progress(0, desc="Initializing analysis...")
-         time.sleep(0.5)  # Simulate processing
-
-         progress(0.3, desc="Fetching content...")
-         result = analyzer(input_text, mode)
-         analysis_result = json.loads(result)
-
-         progress(0.6, desc="Analyzing content...")
-
-         # Create visualization if sentiment mode
-         chart = None
-         if mode == "sentiment" and analysis_result.get('status') == 'success':
-             progress(0.8, desc="Generating visualizations...")
-             chart = create_sentiment_chart(analysis_result['sentiment_analysis'])
-
-         # Cache results
-         analysis_cache[cache_key] = analysis_result
-
-         # Generate preview text
-         preview = analysis_result.get('stats', {}).get('title', '')
-         if 'summary' in analysis_result:
-             preview += f"\n\nSummary:\n{analysis_result['summary']}"
-
-         progress(1.0, desc="Complete!")
-         return analysis_result, preview, "Analysis complete!", chart
-
-     except Exception as e:
-         return (
-             {"status": "error", "message": str(e)},
-             "Error occurred",
-             f"Error: {str(e)}",
-             None
          )
-
- def create_interface():
-     with gr.Blocks(title="Smart Web Analyzer Plus", theme=gr.themes.Base()) as iface:
-         # Header
-         gr.Markdown("# 🚀 Smart Web Analyzer Plus")
-         gr.Markdown("""
-         Advanced content analysis with AI-powered insights:
-         * 📊 Comprehensive Analysis
-         * 😊 Detailed Sentiment Analysis
-         * 📝 Smart Summarization
-         * 🎯 Topic Detection
-         """)
-
-         # Theme toggle
          with gr.Row():
-             theme = gr.Radio(
-                 choices=["light", "dark"],
-                 value="light",
-                 label="Theme",
-                 interactive=True
              )
-
-         # Main content
-         with gr.Tabs():
-             # Analysis Tab
-             with gr.Tab("Analysis"):
-                 with gr.Row():
-                     with gr.Column():
-                         input_text = gr.Textbox(
-                             label="URL or Text to Analyze",
-                             placeholder="Enter URL or paste text",
-                             lines=5
-                         )
-                         mode = gr.Radio(
-                             choices=["analyze", "summarize", "sentiment", "topics"],
-                             value="analyze",
-                             label="Analysis Mode"
-                         )
-                         analyze_btn = gr.Button("🔍 Analyze", variant="primary")
-                         status = gr.Markdown("Status: Ready")
-
-                     with gr.Column():
-                         results = gr.JSON(label="Analysis Results")
-                         chart = gr.Plot(label="Visualization", visible=False)
-
-                 # Show/hide chart based on mode
-                 mode.change(
-                     lambda m: gr.update(visible=(m == "sentiment")),
-                     inputs=[mode],
-                     outputs=[chart]
-                 )
-
-             # Preview Tab
-             with gr.Tab("Preview"):
-                 preview = gr.Textbox(
-                     label="Content Preview",
-                     lines=10,
-                     interactive=False
-                 )
-
-             # Report Tab
-             with gr.Tab("Report"):
-                 download_btn = gr.Button("📥 Download PDF Report")
-                 pdf_output = gr.File(label="Generated Report")
-
-         # Examples
-         gr.Examples(
-             examples=[
-                 ["https://www.artificialintelligence-news.com/2024/02/14/openai-anthropic-google-white-house-red-teaming/", "analyze", "light"],
-                 ["https://www.artificialintelligence-news.com/2024/02/13/ai-21-labs-wordtune-chatgpt-plugin/", "sentiment", "light"]
-             ],
-             inputs=[input_text, mode, theme],
-             outputs=[results, preview, status, chart],
-             fn=process_content,
-             cache_examples=True
-         )
-
-         # Handle theme changes
-         theme.change(
-             lambda t: gr.update(theme=gr.themes.Base() if t == "light" else gr.themes.Soft()),
-             inputs=[theme],
-             outputs=[iface]
-         )
-
-         # Wire up the analysis button
-         analyze_btn.click(
-             fn=process_content,
-             inputs=[input_text, mode, theme],
-             outputs=[results, preview, status, chart]
          )
-
-         # Wire up PDF download
-         download_btn.click(
-             fn=lambda: generate_pdf_report(json.loads(results.value)),
-             inputs=[],
-             outputs=[pdf_output]
          )
-
-     return iface
-
- demo = create_interface()
- demo.launch()
 
+ # app.py
+ """
+ Gradio App for Smart Web Analyzer Plus
+
+ Key Features:
+ - Accepts a URL
+ - Lets users select analysis modes (Clean Text, Summarization, Sentiment, Topic)
+ - Fetches and processes content
+ - Displays JSON output with results
+ - Includes example URLs
+ """
+
+ import gradio as gr
+ from smart_web_analyzer import (
+     fetch_web_content,
+     clean_text,
+     summarize_text,
+     analyze_sentiment,
+     detect_topic,
+     preview_clean_text,
+ )
+
+ def analyze_url(url, modes):
+     """
+     Fetches web content and performs the selected analyses (modes).
+
+     Parameters:
+         url (str): URL to analyze
+         modes (list): list of selected modes
+
+     Returns:
+         dict: a dictionary of results or an error message
+     """
+     results = {}
+
+     # Attempt to fetch the web content
+     try:
+         html_content = fetch_web_content(url)
+     except Exception as e:
+         return {"error": str(e)}  # show error in JSON output
+
+     # Clean the content
+     cleaned = clean_text(html_content)
+
+     # Perform selected analyses
+     if "Clean Text Preview" in modes:
+         results["Clean Text Preview"] = preview_clean_text(cleaned, max_chars=500)
+
+     if "Summarization" in modes:
+         results["Summarization"] = summarize_text(cleaned)
+
+     if "Sentiment Analysis" in modes:
+         results["Sentiment Analysis"] = analyze_sentiment(cleaned)
+
+     if "Topic Detection" in modes:
+         topics = detect_topic(cleaned)
+         if isinstance(topics, dict) and "error" in topics:
+             results["Topic Detection"] = topics["error"]
+         else:
+             # Format detected topics into a readable string for the output
+             topics_formatted = "\n".join([f"{t}: {s:.2f}" for t, s in topics.items()])
+             results["Topic Detection"] = topics_formatted
+
+     return results
+
+ # Build Gradio Interface
+ def build_app():
+     with gr.Blocks(title="Smart Web Analyzer Plus") as demo:
+         gr.Markdown("# Smart Web Analyzer Plus")
+         gr.Markdown(
+             "Analyze web content for summarization, sentiment, and topics. "
+             "Choose your analysis modes and enter a URL below."
          )

          with gr.Row():
+             url_input = gr.Textbox(
+                 label="Enter URL",
+                 placeholder="https://example.com",
+                 lines=1
+             )
+             mode_selector = gr.CheckboxGroup(
+                 label="Select Analysis Modes",
+                 choices=["Clean Text Preview", "Summarization", "Sentiment Analysis", "Topic Detection"],
+                 value=["Clean Text Preview", "Summarization", "Sentiment Analysis", "Topic Detection"]
              )

+         output_box = gr.JSON(label="Analysis Results")
+
+         # Button to run analysis
+         analyze_button = gr.Button("Analyze")
+
+         # On click, run the analysis function
+         analyze_button.click(
+             fn=analyze_url,
+             inputs=[url_input, mode_selector],
+             outputs=output_box
          )

+         # Example URLs
+         gr.Markdown("### Example URLs")
+         gr.Examples(
+             examples=[
+                 ["https://www.artificialintelligence-news.com/2024/02/14/openai-anthropic-google-white-house-red-teaming/"],
+                 ["https://www.artificialintelligence-news.com/2024/02/13/ai-21-labs-wordtune-chatgpt-plugin/"]
+             ],
+             inputs=url_input,
+             label="Click an example to analyze"
          )
+     return demo
+
+ if __name__ == "__main__":
+     demo_app = build_app()
+     demo_app.launch()
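
Because the refactor exposes `analyze_url` as a plain function, it can be smoke-tested without launching the Gradio UI. A minimal sketch (hypothetical script, not part of this commit; it assumes network access, and note that importing `app` also imports `smart_web_analyzer`, which loads all three Hugging Face pipelines up front):

```python
# smoke_test.py (hypothetical, not part of this commit)
from app import analyze_url

# Run only the cheapest mode; the heavy models still load at import time.
result = analyze_url("https://example.com", ["Clean Text Preview"])

if "error" in result:
    # analyze_url reports fetch failures as {"error": ...}
    print("Fetch failed:", result["error"])
else:
    print(result["Clean Text Preview"])
```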
requirements.txt CHANGED
@@ -1,9 +1,5 @@
-
  gradio>=4.0.0
- beautifulsoup4>=4.9.3
- requests>=2.25.1
- smolagents
- transformers
- torch
- plotly
- fpdf
+ beautifulsoup4>=4.12.0
+ requests>=2.31.0
+ transformers>=4.40.0
+ torch>=2.2.0
smart_web_analyzer.py ADDED
@@ -0,0 +1,184 @@
+ # smart_web_analyzer.py
+ """
+ Smart Web Analyzer Plus - Core Functionality
+
+ Features:
+ - Web content fetching with custom User-Agent (to avoid 403 errors)
+ - Basic HTML cleaning (no removal of script/style)
+ - Summarization using "facebook/bart-large-cnn"
+ - Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
+ - Topic detection via zero-shot classification ("facebook/bart-large-mnli")
+ - Preview text for display
+ """
+
+ import requests
+ from bs4 import BeautifulSoup
+ from transformers import pipeline
+
+ # 1) Summarization Pipeline
+ try:
+     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ except Exception as e:
+     summarizer = None
+     print("Error loading summarization model:", e)
+
+ # 2) Sentiment Analysis Pipeline
+ try:
+     sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
+ except Exception as e:
+     sentiment_analyzer = None
+     print("Error loading sentiment analysis model:", e)
+
+ # 3) Zero-Shot Topic Detection Pipeline
+ try:
+     zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+ except Exception as e:
+     zero_shot_classifier = None
+     print("Error loading topic detection model:", e)
+
+
+ def fetch_web_content(url):
+     """
+     Fetches the HTML content of a given URL, using a spoofed User-Agent.
+
+     Parameters:
+         url (str): The URL to fetch.
+
+     Returns:
+         str: HTML content if successful.
+
+     Raises:
+         ValueError: if the URL is invalid.
+         Exception: if the request fails (network error, 4xx/5xx, etc.).
+     """
+     # Validate input URL
+     if not url.startswith("http://") and not url.startswith("https://"):
+         raise ValueError("Invalid URL. URL must start with http:// or https://")
+
+     # Spoof common browser User-Agent to reduce 403 errors
+     headers = {
+         "User-Agent": (
+             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+             "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
+         )
+     }
+
+     try:
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
+         return response.text
+     except requests.exceptions.RequestException as e:
+         # Catch all exceptions from the requests library
+         raise Exception(f"Error fetching the URL: {e}")
+
+
+ def clean_text(html_content):
+     """
+     Cleans HTML content to extract raw text (keeps <script> and <style>).
+
+     Parameters:
+         html_content (str): The raw HTML content.
+
+     Returns:
+         str: Cleaned text extracted from the HTML.
+     """
+     soup = BeautifulSoup(html_content, "html.parser")
+     # NOTE: We are NOT removing <script> or <style> tags here:
+     # for script_or_style in soup(["script", "style"]):
+     #     script_or_style.decompose()
+
+     text = soup.get_text(separator=" ")
+     # Collapse multiple whitespaces
+     cleaned_text = " ".join(text.split())
+     return cleaned_text
+
+
+ def summarize_text(text, max_length=130, min_length=30):
+     """
+     Summarizes text using the facebook/bart-large-cnn model.
+
+     Parameters:
+         text (str): The text to summarize.
+         max_length (int): Maximum length for the summary.
+         min_length (int): Minimum length for the summary.
+
+     Returns:
+         str: The summarized text or an error message.
+     """
+     if not summarizer:
+         return "Summarization model is not available."
+
+     try:
+         summary_list = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
+         return summary_list[0]["summary_text"]
+     except Exception as e:
+         return f"Error during summarization: {e}"
+
+
+ def analyze_sentiment(text):
+     """
+     Analyzes sentiment using nlptown/bert-base-multilingual-uncased-sentiment.
+
+     Parameters:
+         text (str): Text for sentiment analysis.
+
+     Returns:
+         str: A label describing sentiment (e.g., '4 stars') or an error message.
+     """
+     if not sentiment_analyzer:
+         return "Sentiment analysis model is not available."
+
+     try:
+         results = sentiment_analyzer(text)
+         # Typically returns a list of results; we grab the first
+         label = results[0]["label"]
+         return label
+     except Exception as e:
+         return f"Error during sentiment analysis: {e}"
+
+
+ def detect_topic(text):
+     """
+     Detects topics in text using zero-shot classification via facebook/bart-large-mnli.
+
+     Parameters:
+         text (str): The text to analyze.
+
+     Returns:
+         dict or str: Dictionary of topics & confidence scores OR an error string.
+     """
+     if not zero_shot_classifier:
+         return {"error": "Topic detection model is not available."}
+
+     # Example candidate labels
+     candidate_labels = ["Politics", "Technology", "Business", "Entertainment", "Science", "Health", "Sports", "Education"]
+
+     try:
+         result = zero_shot_classifier(text, candidate_labels)
+         # result['labels'] is sorted by confidence;
+         # map each label to its corresponding score
+         topics = {
+             label: score for label, score
+             in zip(result["labels"], result["scores"])
+         }
+         return topics
+     except Exception as e:
+         return {"error": f"Error during topic detection: {e}"}
+
+
+ def preview_clean_text(text, max_chars=500):
+     """
+     Returns a preview slice of the cleaned text for display.
+
+     Parameters:
+         text (str): The text to preview.
+         max_chars (int): Maximum number of characters in the preview.
+
+     Returns:
+         str: The truncated text plus an ellipsis if it's longer than max_chars.
+     """
+     if len(text) > max_chars:
+         return text[:max_chars] + "..."
+     return text
+
+ # End of smart_web_analyzer.py
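
The module is deliberately import-and-call: the pipelines load once at import time, `fetch_web_content` raises on failure, and the analysis helpers return error values instead of raising. A usage sketch of that flow (hypothetical driver script, not part of this commit; note that the BERT-based sentiment and zero-shot models have a fixed input window of roughly 512 tokens, so slicing long pages first is a pragmatic workaround):

```python
# Hypothetical driver showing the fetch -> clean -> analyze flow.
from smart_web_analyzer import (
    fetch_web_content, clean_text, preview_clean_text,
    summarize_text, analyze_sentiment, detect_topic,
)

url = "https://example.com"
try:
    html = fetch_web_content(url)
except Exception as e:
    raise SystemExit(f"Fetch failed: {e}")

cleaned = clean_text(html)
print(preview_clean_text(cleaned, max_chars=200))

# Slice the text so the fixed-context models do not reject a full page;
# oversized input would otherwise come back as an error string.
snippet = cleaned[:2000]
print(summarize_text(snippet))
print(analyze_sentiment(snippet))

topics = detect_topic(snippet)
if isinstance(topics, dict) and "error" not in topics:
    for label, score in topics.items():
        print(f"{label}: {score:.2f}")
```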
space.yml ADDED
@@ -0,0 +1,3 @@
+ title: Smart Web Analyzer Plus
+ sdk: gradio
+ python_version: 3.10