MHamdan committed
Commit 8cc50db · verified · 1 Parent(s): 625d455

Update files

To simplify the agent.
Files changed (4)
  1. app.py +90 -204
  2. requirements.txt +4 -8
  3. smart_web_analyzer.py +184 -0
  4. space.yml +3 -0
app.py CHANGED
@@ -1,228 +1,114 @@
- import gradio as gr
- import json
- from smolagents import load_tool
- import time
- from datetime import datetime
- import plotly.graph_objects as go
- from fpdf import FPDF
- import tempfile
- import os
-
- # Load the analyzer with caching
- analyzer = load_tool("MHamdan/smart-web-analyzer-plus", trust_remote_code=True)
- analysis_cache = {}
-
- def create_sentiment_chart(sentiment_data):
-     """Creates an interactive bar chart for sentiment analysis."""
-     sections = []
-     scores = []
-
-     for item in sentiment_data['sections']:
-         sections.append(f"Section {item['section']}")
-         scores.append(item['score'])
-
-     fig = go.Figure(data=[
-         go.Bar(
-             x=sections,
-             y=scores,
-             marker_color='rgb(55, 83, 109)',
-             text=scores,
-             textposition='auto'
-         )
-     ])
-
-     fig.update_layout(
-         title='Sentiment Analysis by Section',
-         xaxis_title='Content Sections',
-         yaxis_title='Sentiment Score (1-5)',
-         yaxis_range=[0, 5]
-     )
-
-     return fig
-
- def generate_pdf_report(analysis_result):
-     """Generates a PDF report from analysis results."""
-     pdf = FPDF()
-     pdf.add_page()
-
-     # Header
-     pdf.set_font('Arial', 'B', 16)
-     pdf.cell(0, 10, 'Content Analysis Report', 0, 1, 'C')
-     pdf.line(10, 30, 200, 30)
-
-     # Date
-     pdf.set_font('Arial', '', 10)
-     pdf.cell(0, 10, f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', 0, 1)
-
-     # Content
-     pdf.set_font('Arial', 'B', 12)
-     pdf.cell(0, 10, 'Basic Statistics:', 0, 1)
-     pdf.set_font('Arial', '', 10)
-
-     stats = analysis_result.get('stats', {})
-     for key, value in stats.items():
-         pdf.cell(0, 10, f'{key.title()}: {value}', 0, 1)
-
-     if 'summary' in analysis_result:
-         pdf.set_font('Arial', 'B', 12)
-         pdf.cell(0, 10, 'Summary:', 0, 1)
-         pdf.set_font('Arial', '', 10)
-         pdf.multi_cell(0, 10, analysis_result['summary'])
-
-     # Save to temporary file
-     temp_dir = tempfile.gettempdir()
-     pdf_path = os.path.join(temp_dir, 'analysis_report.pdf')
-     pdf.output(pdf_path)
-
-     return pdf_path
-
- def process_content(input_text, mode, theme, progress=gr.Progress()):
-     """Main processing function with progress updates."""
-     try:
-         # Check cache
-         cache_key = f"{input_text}_{mode}"
-         if cache_key in analysis_cache:
-             return (
-                 analysis_cache[cache_key],
-                 "Content preview unavailable for cached results",
-                 "Using cached results",
-                 None
-             )
-
-         # Process in steps
-         progress(0, desc="Initializing analysis...")
-         time.sleep(0.5)  # Simulate processing
-
-         progress(0.3, desc="Fetching content...")
-         result = analyzer(input_text, mode)
-         analysis_result = json.loads(result)
-
-         progress(0.6, desc="Analyzing content...")
-
-         # Create visualization if sentiment mode
-         chart = None
-         if mode == "sentiment" and analysis_result.get('status') == 'success':
-             progress(0.8, desc="Generating visualizations...")
-             chart = create_sentiment_chart(analysis_result['sentiment_analysis'])
-
-         # Cache results
-         analysis_cache[cache_key] = analysis_result
-
-         # Generate preview text
-         preview = analysis_result.get('stats', {}).get('title', '')
-         if 'summary' in analysis_result:
-             preview += f"\n\nSummary:\n{analysis_result['summary']}"
-
-         progress(1.0, desc="Complete!")
-         return analysis_result, preview, "Analysis complete!", chart
-
-     except Exception as e:
-         return (
-             {"status": "error", "message": str(e)},
-             "Error occurred",
-             f"Error: {str(e)}",
-             None
          )
-
- def create_interface():
-     with gr.Blocks(title="Smart Web Analyzer Plus", theme=gr.themes.Base()) as iface:
-         # Header
-         gr.Markdown("# 🚀 Smart Web Analyzer Plus")
-         gr.Markdown("""
-         Advanced content analysis with AI-powered insights:
-         * 📊 Comprehensive Analysis
-         * 😊 Detailed Sentiment Analysis
-         * 📝 Smart Summarization
-         * 🎯 Topic Detection
-         """)
-
-         # Theme toggle
          with gr.Row():
-             theme = gr.Radio(
-                 choices=["light", "dark"],
-                 value="light",
-                 label="Theme",
-                 interactive=True
              )
-
-         # Main content
-         with gr.Tabs():
-             # Analysis Tab
-             with gr.Tab("Analysis"):
-                 with gr.Row():
-                     with gr.Column():
-                         input_text = gr.Textbox(
-                             label="URL or Text to Analyze",
-                             placeholder="Enter URL or paste text",
-                             lines=5
-                         )
-                         mode = gr.Radio(
-                             choices=["analyze", "summarize", "sentiment", "topics"],
-                             value="analyze",
-                             label="Analysis Mode"
-                         )
-                         analyze_btn = gr.Button("🔍 Analyze", variant="primary")
-                         status = gr.Markdown("Status: Ready")
-
-                     with gr.Column():
-                         results = gr.JSON(label="Analysis Results")
-                         chart = gr.Plot(label="Visualization", visible=False)
-
-                 # Show/hide chart based on mode
-                 mode.change(
-                     lambda m: gr.update(visible=(m == "sentiment")),
-                     inputs=[mode],
-                     outputs=[chart]
-                 )
-
-             # Preview Tab
-             with gr.Tab("Preview"):
-                 preview = gr.Textbox(
-                     label="Content Preview",
-                     lines=10,
-                     interactive=False
-                 )
-
-             # Report Tab
-             with gr.Tab("Report"):
-                 download_btn = gr.Button("📥 Download PDF Report")
-                 pdf_output = gr.File(label="Generated Report")
-
-         # Examples
-         gr.Examples(
-             examples=[
-                 ["https://www.artificialintelligence-news.com/2024/02/14/openai-anthropic-google-white-house-red-teaming/", "analyze", "light"],
-                 ["https://www.artificialintelligence-news.com/2024/02/13/ai-21-labs-wordtune-chatgpt-plugin/", "sentiment", "light"]
-             ],
-             inputs=[input_text, mode, theme],
-             outputs=[results, preview, status, chart],
-             fn=process_content,
-             cache_examples=True
-         )
-
-         # Handle theme changes
-         theme.change(
-             lambda t: gr.update(theme=gr.themes.Base() if t == "light" else gr.themes.Soft()),
-             inputs=[theme],
-             outputs=[iface]
-         )
-
-         # Wire up the analysis button
-         analyze_btn.click(
-             fn=process_content,
-             inputs=[input_text, mode, theme],
-             outputs=[results, preview, status, chart]
          )
-
-         # Wire up PDF download
-         download_btn.click(
-             fn=lambda: generate_pdf_report(json.loads(results.value)),
-             inputs=[],
-             outputs=[pdf_output]
          )
-
-     return iface
-
- demo = create_interface()
- demo.launch()
 
+ # app.py
+ """
+ Gradio App for Smart Web Analyzer Plus
+
+ Key Features:
+ - Accepts a URL
+ - Lets users select analysis modes (Clean Text, Summarization, Sentiment, Topic)
+ - Fetches and processes content
+ - Displays JSON output with results
+ - Includes example URLs
+ """
+
+ import gradio as gr
+ from smart_web_analyzer import (
+     fetch_web_content,
+     clean_text,
+     summarize_text,
+     analyze_sentiment,
+     detect_topic,
+     preview_clean_text,
+ )
+
+ def analyze_url(url, modes):
+     """
+     Fetches web content and performs the selected analyses (modes).
+
+     Parameters:
+         url (str): URL to analyze
+         modes (list): list of selected modes
+
+     Returns:
+         dict: a dictionary of results or an error message
+     """
+     results = {}
+
+     # Attempt to fetch the web content
+     try:
+         html_content = fetch_web_content(url)
+     except Exception as e:
+         return {"error": str(e)}  # show error in JSON output
+
+     # Clean the content
+     cleaned = clean_text(html_content)
+
+     # Perform selected analyses
+     if "Clean Text Preview" in modes:
+         results["Clean Text Preview"] = preview_clean_text(cleaned, max_chars=500)
+
+     if "Summarization" in modes:
+         results["Summarization"] = summarize_text(cleaned)
+
+     if "Sentiment Analysis" in modes:
+         results["Sentiment Analysis"] = analyze_sentiment(cleaned)
+
+     if "Topic Detection" in modes:
+         topics = detect_topic(cleaned)
+         if isinstance(topics, dict) and "error" in topics:
+             results["Topic Detection"] = topics["error"]
+         else:
+             # Format detected topics into a readable string for the output
+             topics_formatted = "\n".join([f"{t}: {s:.2f}" for t, s in topics.items()])
+             results["Topic Detection"] = topics_formatted
+
+     return results
+
+ # Build Gradio Interface
+ def build_app():
+     with gr.Blocks(title="Smart Web Analyzer Plus") as demo:
+         gr.Markdown("# Smart Web Analyzer Plus")
+         gr.Markdown(
+             "Analyze web content for summarization, sentiment, and topics. "
+             "Choose your analysis modes and enter a URL below."
          )

          with gr.Row():
+             url_input = gr.Textbox(
+                 label="Enter URL",
+                 placeholder="https://example.com",
+                 lines=1
+             )
+             mode_selector = gr.CheckboxGroup(
+                 label="Select Analysis Modes",
+                 choices=["Clean Text Preview", "Summarization", "Sentiment Analysis", "Topic Detection"],
+                 value=["Clean Text Preview", "Summarization", "Sentiment Analysis", "Topic Detection"]
              )

+         output_box = gr.JSON(label="Analysis Results")
+
+         # Button to run analysis
+         analyze_button = gr.Button("Analyze")
+
+         # On click, run the analysis function
+         analyze_button.click(
+             fn=analyze_url,
+             inputs=[url_input, mode_selector],
+             outputs=output_box
          )

+         # Example URLs
+         gr.Markdown("### Example URLs")
+         gr.Examples(
+             examples=[
+                 ["https://www.artificialintelligence-news.com/2024/02/14/openai-anthropic-google-white-house-red-teaming/"],
+                 ["https://www.artificialintelligence-news.com/2024/02/13/ai-21-labs-wordtune-chatgpt-plugin/"]
+             ],
+             inputs=url_input,
+             label="Click an example to analyze"
          )
+     return demo
+
+ if __name__ == "__main__":
+     demo_app = build_app()
+     demo_app.launch()
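
Because the refactor exposes `analyze_url` as a plain function, it can be smoke-tested without launching the Gradio UI. A minimal sketch (hypothetical script, not part of this commit; it assumes network access, and note that importing `app` also imports `smart_web_analyzer`, which loads all three Hugging Face pipelines up front):

```python
# smoke_test.py (hypothetical, not part of this commit)
from app import analyze_url

# Run only the cheapest mode; the heavy models still load at import time.
result = analyze_url("https://example.com", ["Clean Text Preview"])

if "error" in result:
    # analyze_url reports fetch failures as {"error": ...}
    print("Fetch failed:", result["error"])
else:
    print(result["Clean Text Preview"])
```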
requirements.txt CHANGED
@@ -1,9 +1,5 @@
-
  gradio>=4.0.0
- beautifulsoup4>=4.9.3
- requests>=2.25.1
- smolagents
- transformers
- torch
- plotly
- fpdf
+ beautifulsoup4>=4.12.0
+ requests>=2.31.0
+ transformers>=4.40.0
+ torch>=2.2.0
smart_web_analyzer.py ADDED
@@ -0,0 +1,184 @@
+ # smart_web_analyzer.py
+ """
+ Smart Web Analyzer Plus - Core Functionality
+
+ Features:
+ - Web content fetching with custom User-Agent (to avoid 403 errors)
+ - Basic HTML cleaning (no removal of script/style)
+ - Summarization using "facebook/bart-large-cnn"
+ - Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment"
+ - Topic detection via zero-shot classification ("facebook/bart-large-mnli")
+ - Preview text for display
+ """
+
+ import requests
+ from bs4 import BeautifulSoup
+ from transformers import pipeline
+
+ # 1) Summarization Pipeline
+ try:
+     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ except Exception as e:
+     summarizer = None
+     print("Error loading summarization model:", e)
+
+ # 2) Sentiment Analysis Pipeline
+ try:
+     sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
+ except Exception as e:
+     sentiment_analyzer = None
+     print("Error loading sentiment analysis model:", e)
+
+ # 3) Zero-Shot Topic Detection Pipeline
+ try:
+     zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+ except Exception as e:
+     zero_shot_classifier = None
+     print("Error loading topic detection model:", e)
+
+
+ def fetch_web_content(url):
+     """
+     Fetches the HTML content of a given URL, using a spoofed User-Agent.
+
+     Parameters:
+         url (str): The URL to fetch.
+
+     Returns:
+         str: HTML content if successful.
+
+     Raises:
+         ValueError: if the URL is invalid.
+         Exception: if the request fails (network error, 4xx/5xx, etc.).
+     """
+     # Validate input URL
+     if not url.startswith("http://") and not url.startswith("https://"):
+         raise ValueError("Invalid URL. URL must start with http:// or https://")
+
+     # Spoof common browser User-Agent to reduce 403 errors
+     headers = {
+         "User-Agent": (
+             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+             "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
+         )
+     }
+
+     try:
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
+         return response.text
+     except requests.exceptions.RequestException as e:
+         # Catch all exceptions from the requests library
+         raise Exception(f"Error fetching the URL: {e}")
+
+
+ def clean_text(html_content):
+     """
+     Cleans HTML content to extract raw text (keeps <script> and <style>).
+
+     Parameters:
+         html_content (str): The raw HTML content.
+
+     Returns:
+         str: Cleaned text extracted from the HTML.
+     """
+     soup = BeautifulSoup(html_content, "html.parser")
+     # NOTE: We are NOT removing <script> or <style> tags here:
+     # for script_or_style in soup(["script", "style"]):
+     #     script_or_style.decompose()
+
+     text = soup.get_text(separator=" ")
+     # Collapse multiple whitespaces
+     cleaned_text = " ".join(text.split())
+     return cleaned_text
+
+
+ def summarize_text(text, max_length=130, min_length=30):
+     """
+     Summarizes text using the facebook/bart-large-cnn model.
+
+     Parameters:
+         text (str): The text to summarize.
+         max_length (int): Maximum length for the summary.
+         min_length (int): Minimum length for the summary.
+
+     Returns:
+         str: The summarized text or an error message.
+     """
+     if not summarizer:
+         return "Summarization model is not available."
+
+     try:
+         summary_list = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
+         return summary_list[0]["summary_text"]
+     except Exception as e:
+         return f"Error during summarization: {e}"
+
+
+ def analyze_sentiment(text):
+     """
+     Analyzes sentiment using nlptown/bert-base-multilingual-uncased-sentiment.
+
+     Parameters:
+         text (str): Text for sentiment analysis.
+
+     Returns:
+         str: A label describing sentiment (e.g., '4 stars') or an error message.
+     """
+     if not sentiment_analyzer:
+         return "Sentiment analysis model is not available."
+
+     try:
+         results = sentiment_analyzer(text)
+         # Typically returns a list of results; we grab the first
+         label = results[0]["label"]
+         return label
+     except Exception as e:
+         return f"Error during sentiment analysis: {e}"
+
+
+ def detect_topic(text):
+     """
+     Detects topics in text using zero-shot classification via facebook/bart-large-mnli.
+
+     Parameters:
+         text (str): The text to analyze.
+
+     Returns:
+         dict or str: Dictionary of topics & confidence scores OR an error string.
+     """
+     if not zero_shot_classifier:
+         return {"error": "Topic detection model is not available."}
+
+     # Example candidate labels
+     candidate_labels = ["Politics", "Technology", "Business", "Entertainment", "Science", "Health", "Sports", "Education"]
+
+     try:
+         result = zero_shot_classifier(text, candidate_labels)
+         # result['labels'] is sorted by confidence;
+         # map each label to its corresponding score
+         topics = {
+             label: score for label, score
+             in zip(result["labels"], result["scores"])
+         }
+         return topics
+     except Exception as e:
+         return {"error": f"Error during topic detection: {e}"}
+
+
+ def preview_clean_text(text, max_chars=500):
+     """
+     Returns a preview slice of the cleaned text for display.
+
+     Parameters:
+         text (str): The text to preview.
+         max_chars (int): Maximum number of characters in the preview.
+
+     Returns:
+         str: The truncated text plus an ellipsis if it's longer than max_chars.
+     """
+     if len(text) > max_chars:
+         return text[:max_chars] + "..."
+     return text
+
+ # End of smart_web_analyzer.py
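
The module is deliberately import-and-call: the pipelines load once at import time, `fetch_web_content` raises on failure, and the analysis helpers return error values instead of raising. A usage sketch of that flow (hypothetical driver script, not part of this commit; note that the BERT-based sentiment and zero-shot models have a fixed input window of roughly 512 tokens, so slicing long pages first is a pragmatic workaround):

```python
# Hypothetical driver showing the fetch -> clean -> analyze flow.
from smart_web_analyzer import (
    fetch_web_content, clean_text, preview_clean_text,
    summarize_text, analyze_sentiment, detect_topic,
)

url = "https://example.com"
try:
    html = fetch_web_content(url)
except Exception as e:
    raise SystemExit(f"Fetch failed: {e}")

cleaned = clean_text(html)
print(preview_clean_text(cleaned, max_chars=200))

# Slice the text so the fixed-context models do not reject a full page;
# oversized input would otherwise come back as an error string.
snippet = cleaned[:2000]
print(summarize_text(snippet))
print(analyze_sentiment(snippet))

topics = detect_topic(snippet)
if isinstance(topics, dict) and "error" not in topics:
    for label, score in topics.items():
        print(f"{label}: {score:.2f}")
```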
space.yml ADDED
@@ -0,0 +1,3 @@
+ title: Smart Web Analyzer Plus
+ sdk: gradio
+ python_version: 3.10