bangaboy committed on
Commit
2a70c18
·
verified ·
1 Parent(s): 1be26e8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -0
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ from sklearn.linear_model import LinearRegression
8
+ from sklearn.ensemble import RandomForestRegressor
9
+ from sklearn.preprocessing import StandardScaler
10
+ from sklearn.model_selection import train_test_split
11
+
12
# Page configuration
st.set_page_config(page_title="Data Analysis Platform", layout="wide")

# Initialize session state: the first run of a session seeds a synthetic
# 100-row dataset so every page has data to show before anything is uploaded.
if 'data' not in st.session_state:
    np.random.seed(42)  # fixed seed -> the same sample values in every session
    n_rows = 100
    dates = pd.date_range('2023-01-01', periods=n_rows, freq='D')

    # The random columns are drawn one after another from the seeded global
    # RNG; keep this order, otherwise the generated values change.
    sales = np.random.normal(1000, 200, n_rows)
    visitors = np.random.normal(500, 100, n_rows)
    conversion_rate = np.random.uniform(0.01, 0.05, n_rows)
    customer_satisfaction = np.random.normal(4.2, 0.5, n_rows)
    region = np.random.choice(['North', 'South', 'East', 'West'], n_rows)

    st.session_state.data = pd.DataFrame({
        'date': dates,
        'sales': sales,
        'visitors': visitors,
        'conversion_rate': conversion_rate,
        'customer_satisfaction': customer_satisfaction,
        'region': region,
    })

# Sidebar for navigation: `page` selects which section of the script renders.
st.sidebar.title("Data Analytics Platform")
page = st.sidebar.radio(
    "Navigation",
    ["Home", "Data Explorer", "Visualization", "Predictions"],
)
32
+
33
# Home page: welcome text, CSV upload, and a statistical overview of the
# dataset currently held in session state.
if page == "Home":
    st.title("Data Analysis Platform")
    st.markdown("""
    Welcome to the Data Analysis Platform. Explore your data with powerful
    visualizations and machine learning insights.
    """)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Upload Your Dataset")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        if uploaded_file is not None:
            try:
                new_data = pd.read_csv(uploaded_file)
            except Exception as e:
                # UI boundary: report any parsing problem to the user and keep
                # the previously loaded dataset instead of crashing the page.
                st.error(f"Error uploading file: {e}")
            else:
                st.session_state.data = new_data
                # A model trained on the previous dataset no longer matches
                # the data shown in the app, so drop it together with its
                # scaler and feature list; the user has to retrain.
                for stale_key in ('model', 'scaler', 'features'):
                    if stale_key in st.session_state:
                        del st.session_state[stale_key]
                st.success("Data uploaded successfully!")

    with col2:
        st.subheader("Dataset Overview")
        st.write(st.session_state.data.describe())
56
+
57
+ # Data Explorer page
58
+ elif page == "Data Explorer":
59
+ st.title("Data Explorer")
60
+
61
+ # Data summary
62
+ st.subheader("Dataset Summary")
63
+ st.write(f"Shape: {st.session_state.data.shape[0]} rows, {st.session_state.data.shape[1]} columns")
64
+
65
+ # Show first few rows
66
+ st.subheader("Data Preview")
67
+ st.dataframe(st.session_state.data.head())
68
+
69
+ # Column analysis
70
+ st.subheader("Column Analysis")
71
+ col1, col2 = st.columns(2)
72
+
73
+ with col1:
74
+ column = st.selectbox("Select column to analyze:", st.session_state.data.columns)
75
+
76
+ with col2:
77
+ if pd.api.types.is_numeric_dtype(st.session_state.data[column]):
78
+ analysis_type = st.selectbox(
79
+ "Analysis type:",
80
+ ["Distribution", "Time Series"] if "date" in column.lower() else ["Distribution"]
81
+ )
82
+ else:
83
+ analysis_type = st.selectbox("Analysis type:", ["Value Counts"])
84
+
85
+ # Display analysis
86
+ if pd.api.types.is_numeric_dtype(st.session_state.data[column]):
87
+ st.write(f"**Min:** {st.session_state.data[column].min():.2f}")
88
+ st.write(f"**Max:** {st.session_state.data[column].max():.2f}")
89
+ st.write(f"**Mean:** {st.session_state.data[column].mean():.2f}")
90
+ st.write(f"**Median:** {st.session_state.data[column].median():.2f}")
91
+ st.write(f"**Std Dev:** {st.session_state.data[column].std():.2f}")
92
+
93
+ fig, ax = plt.subplots(figsize=(10, 6))
94
+ sns.histplot(st.session_state.data[column], ax=ax, kde=True)
95
+ ax.set_title(f"Distribution of {column}")
96
+ st.pyplot(fig)
97
+ else:
98
+ value_counts = st.session_state.data[column].value_counts()
99
+ st.write(f"**Unique Values:** {len(value_counts)}")
100
+ st.write(f"**Most Common:** {value_counts.index[0]} ({value_counts.iloc[0]} occurrences)")
101
+
102
+ fig, ax = plt.subplots(figsize=(10, 6))
103
+ value_counts.plot(kind='bar', ax=ax)
104
+ ax.set_title(f"Value counts for {column}")
105
+ st.pyplot(fig)
106
+
107
+ # Visualization page
108
+ elif page == "Visualization":
109
+ st.title("Data Visualization")
110
+
111
+ chart_type = st.selectbox(
112
+ "Select chart type:",
113
+ ["Bar Chart", "Line Chart", "Scatter Plot", "Heatmap"]
114
+ )
115
+
116
+ if chart_type in ["Bar Chart", "Line Chart"]:
117
+ col1, col2 = st.columns(2)
118
+ with col1:
119
+ x_column = st.selectbox("X-axis:", st.session_state.data.columns)
120
+ with col2:
121
+ y_column = st.selectbox("Y-axis:",
122
+ [col for col in st.session_state.data.columns
123
+ if pd.api.types.is_numeric_dtype(st.session_state.data[col])])
124
+
125
+ # Aggregation for categorical x-axis
126
+ if not pd.api.types.is_numeric_dtype(st.session_state.data[x_column]):
127
+ agg_data = st.session_state.data.groupby(x_column)[y_column].mean().reset_index()
128
+
129
+ fig, ax = plt.subplots(figsize=(10, 6))
130
+ if chart_type == "Bar Chart":
131
+ sns.barplot(x=x_column, y=y_column, data=agg_data, ax=ax)
132
+ else: # Line chart
133
+ sns.lineplot(x=x_column, y=y_column, data=agg_data, ax=ax, marker='o')
134
+ ax.set_title(f"{y_column} by {x_column}")
135
+ st.pyplot(fig)
136
+ else:
137
+ fig, ax = plt.subplots(figsize=(10, 6))
138
+ if chart_type == "Bar Chart":
139
+ sns.barplot(x=x_column, y=y_column, data=st.session_state.data, ax=ax)
140
+ else: # Line chart
141
+ sns.lineplot(x=x_column, y=y_column, data=st.session_state.data, ax=ax)
142
+ ax.set_title(f"{y_column} vs {x_column}")
143
+ st.pyplot(fig)
144
+
145
+ elif chart_type == "Scatter Plot":
146
+ col1, col2 = st.columns(2)
147
+ with col1:
148
+ x_column = st.selectbox("X-axis:",
149
+ [col for col in st.session_state.data.columns
150
+ if pd.api.types.is_numeric_dtype(st.session_state.data[col])])
151
+ with col2:
152
+ y_column = st.selectbox("Y-axis:",
153
+ [col for col in st.session_state.data.columns
154
+ if pd.api.types.is_numeric_dtype(st.session_state.data[col]) and col != x_column])
155
+
156
+ fig, ax = plt.subplots(figsize=(10, 6))
157
+ sns.scatterplot(x=x_column, y=y_column, data=st.session_state.data, ax=ax)
158
+ ax.set_title(f"{y_column} vs {x_column}")
159
+ st.pyplot(fig)
160
+
161
+ elif chart_type == "Heatmap":
162
+ numeric_cols = st.session_state.data.select_dtypes(include=['number']).columns.tolist()
163
+ correlation = st.session_state.data[numeric_cols].corr()
164
+
165
+ fig, ax = plt.subplots(figsize=(10, 8))
166
+ sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
167
+ ax.set_title("Correlation Heatmap")
168
+ st.pyplot(fig)
169
+
170
+ # Predictions page
171
+ elif page == "Predictions":
172
+ st.title("ML Predictions")
173
+
174
+ numeric_cols = st.session_state.data.select_dtypes(include=['number']).columns.tolist()
175
+
176
+ st.subheader("Train a Model")
177
+ col1, col2 = st.columns(2)
178
+
179
+ with col1:
180
+ target_column = st.selectbox("Target variable:", numeric_cols)
181
+
182
+ with col2:
183
+ model_type = st.selectbox("Model type:", ["Linear Regression", "Random Forest"])
184
+
185
+ # Select features
186
+ feature_cols = [col for col in numeric_cols if col != target_column]
187
+ selected_features = st.multiselect("Select features:", feature_cols, default=feature_cols)
188
+
189
+ if st.button("Train Model"):
190
+ if len(selected_features) > 0:
191
+ # Prepare data
192
+ X = st.session_state.data[selected_features]
193
+ y = st.session_state.data[target_column]
194
+
195
+ # Split data
196
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
197
+
198
+ # Scale features
199
+ scaler = StandardScaler()
200
+ X_train_scaled = scaler.fit_transform(X_train)
201
+ X_test_scaled = scaler.transform(X_test)
202
+
203
+ # Train model
204
+ if model_type == "Linear Regression":
205
+ model = LinearRegression()
206
+ else:
207
+ model = RandomForestRegressor(n_estimators=100, random_state=42)
208
+
209
+ model.fit(X_train_scaled, y_train)
210
+
211
+ # Evaluate
212
+ train_score = model.score(X_train_scaled, y_train)
213
+ test_score = model.score(X_test_scaled, y_test)
214
+
215
+ st.session_state.model = model
216
+ st.session_state.scaler = scaler
217
+ st.session_state.features = selected_features
218
+
219
+ st.success("Model trained successfully!")
220
+ st.write(f"Training R² score: {train_score:.4f}")
221
+ st.write(f"Testing R² score: {test_score:.4f}")
222
+
223
+ # Feature importance for Random Forest
224
+ if model_type == "Random Forest":
225
+ importance = pd.DataFrame({
226
+ 'Feature': selected_features,
227
+ 'Importance': model.feature_importances_
228
+ }).sort_values('Importance', ascending=False)
229
+
230
+ fig, ax = plt.subplots(figsize=(10, 6))
231
+ sns.barplot(x='Importance', y='Feature', data=importance, ax=ax)
232
+ ax.set_title("Feature Importance")
233
+ st.pyplot(fig)
234
+ else:
235
+ st.error("Please select at least one feature")
236
+
237
+ # Make predictions section
238
+ st.subheader("Make Predictions")
239
+ if 'model' in st.session_state:
240
+ input_data = {}
241
+
242
+ # Create input fields for each feature
243
+ for feature in st.session_state.features:
244
+ min_val = float(st.session_state.data[feature].min())
245
+ max_val = float(st.session_state.data[feature].max())
246
+ mean_val = float(st.session_state.data[feature].mean())
247
+
248
+ input_data[feature] = st.slider(
249
+ f"Input {feature}:",
250
+ min_value=min_val,
251
+ max_value=max_val,
252
+ value=mean_val
253
+ )
254
+
255
+ if st.button("Predict"):
256
+ # Prepare input for prediction
257
+ input_df = pd.DataFrame([input_data])
258
+ input_scaled = st.session_state.scaler.transform(input_df)
259
+
260
+ # Make prediction
261
+ prediction = st.session_state.model.predict(input_scaled)[0]
262
+
263
+ st.success(f"Predicted {target_column}: {prediction:.2f}")
264
+ else:
265
+ st.info("Train a model first to make predictions")