{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "id": "1",
   "source": [
    "# Data Science Analysis Notebook\n",
    "\n",
    "This notebook contains some example Python code for data analysis.\n",
    "\n",
    "It loads a housing dataset, fills in missing values, plots the correlations between numeric columns, engineers two ratio features, and fits a linear regression model. Every cell is written so that **Restart Kernel → Run All** reproduces the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "2",
   "outputs": [],
   "source": [
    "# Import libraries (all imports live in this cell so dependencies are visible up front)\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Configuration: the values a reader is most likely to tune\n",
    "DATA_PATH = 'housing_data.csv'\n",
    "TARGET = 'median_house_value'\n",
    "TEST_SIZE = 0.2\n",
    "SEED = 42\n",
    "\n",
    "# Set visualization style\n",
    "sns.set(style='whitegrid')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "3",
   "outputs": [],
   "source": [
    "# Load the dataset (df stays untouched afterwards so later cells can be re-run safely)\n",
    "df = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# Display basic information\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "4",
   "outputs": [],
   "source": [
    "# Perform data cleaning\n",
    "# Fill missing values: median for numeric columns, most frequent value for the rest.\n",
    "# A single DataFrame.fillna call builds a new frame; calling fillna(inplace=True) on\n",
    "# df[column] is chained assignment, which is deprecated and has no effect under\n",
    "# pandas copy-on-write.\n",
    "numeric_cols = df.select_dtypes(include='number').columns\n",
    "other_cols = df.columns.difference(numeric_cols)\n",
    "\n",
    "# dropna() skips all-NaN numeric columns, whose median is undefined\n",
    "fill_values = df[numeric_cols].median().dropna().to_dict()\n",
    "for column in other_cols:\n",
    "    modes = df[column].mode()\n",
    "    if not modes.empty:  # an all-NaN column has no mode\n",
    "        fill_values[column] = modes.iloc[0]\n",
    "\n",
    "df_clean = df.fillna(value=fill_values)\n",
    "\n",
    "# Check for remaining missing values\n",
    "print(\"Missing values after cleaning:\")\n",
    "print(df_clean.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "5",
   "outputs": [],
   "source": [
    "# Exploratory data analysis\n",
    "# Create correlation matrix\n",
    "numeric_columns = df_clean.select_dtypes(include='number').columns\n",
    "correlation_matrix = df_clean[numeric_columns].corr()\n",
    "\n",
    "# Plot heatmap\n",
    "fig, ax = plt.subplots(figsize=(12, 10))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)\n",
    "ax.set_title('Correlation Matrix of Numeric Features', fontsize=18)\n",
    "plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "6",
   "outputs": [],
   "source": [
    "# Feature engineering\n",
    "def safe_ratio(numerator, denominator):\n",
    "    \"\"\"Return numerator / denominator with non-finite results replaced.\n",
    "\n",
    "    Division by zero produces +/-inf (or NaN for 0/0), which LinearRegression\n",
    "    rejects. Infinities are converted to NaN and every NaN is then filled with\n",
    "    the median of the remaining ratios.\n",
    "    \"\"\"\n",
    "    ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan)\n",
    "    return ratio.fillna(ratio.median())\n",
    "\n",
    "\n",
    "# Work on a copy so re-running this cell always starts from df_clean\n",
    "df_features = df_clean.copy()\n",
    "\n",
    "# Create new features\n",
    "if 'bedrooms' in df_features.columns and 'total_rooms' in df_features.columns:\n",
    "    df_features['bedrooms_ratio'] = safe_ratio(\n",
    "        df_features['bedrooms'], df_features['total_rooms']\n",
    "    )\n",
    "\n",
    "if 'total_rooms' in df_features.columns and 'households' in df_features.columns:\n",
    "    df_features['rooms_per_household'] = safe_ratio(\n",
    "        df_features['total_rooms'], df_features['households']\n",
    "    )\n",
    "\n",
    "# One-hot encode categorical columns so the model receives numeric input only\n",
    "df_features = pd.get_dummies(df_features, drop_first=True, dtype=float)\n",
    "\n",
    "# Feature scaling happens in the modelling cell, after the train/test split, so the\n",
    "# scaler never sees test rows, the target stays in its original units, and the\n",
    "# engineered columns above are scaled too.\n",
    "\n",
    "# Display transformed data\n",
    "df_features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "id": "7",
   "outputs": [],
   "source": [
    "# Build a simple prediction model\n",
    "# Assume we're predicting median_house_value\n",
    "if TARGET in df_features.columns:\n",
    "    # Prepare features and target; any remaining non-numeric column is left out\n",
    "    # because LinearRegression cannot fit on it\n",
    "    X = df_features.drop(columns=[TARGET]).select_dtypes(include='number')\n",
    "    y = df_features[TARGET]\n",
    "\n",
    "    # Split the data\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=TEST_SIZE, random_state=SEED\n",
    "    )\n",
    "\n",
    "    # Scale numeric features using statistics from the training split only\n",
    "    scaler = StandardScaler()\n",
    "    X_train_scaled = scaler.fit_transform(X_train)\n",
    "    X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "    # Train the model\n",
    "    model = LinearRegression()\n",
    "    model.fit(X_train_scaled, y_train)\n",
    "\n",
    "    # Make predictions\n",
    "    y_pred = model.predict(X_test_scaled)\n",
    "\n",
    "    # Evaluate the model (the target is unscaled, so MSE is in squared target units)\n",
    "    mse = mean_squared_error(y_test, y_pred)\n",
    "    r2 = r2_score(y_test, y_pred)\n",
    "\n",
    "    print(f\"Mean Squared Error: {mse:.2f}\")\n",
    "    print(f\"R² Score: {r2:.2f}\")\n",
    "\n",
    "    # Plot actual vs predicted values; the dashed line marks a perfect prediction\n",
    "    fig, ax = plt.subplots(figsize=(10, 6))\n",
    "    ax.scatter(y_test, y_pred, alpha=0.5)\n",
    "    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')\n",
    "    ax.set_xlabel('Actual Values')\n",
    "    ax.set_ylabel('Predicted Values')\n",
    "    ax.set_title('Actual vs Predicted Values')\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    # Say why nothing happened instead of skipping silently\n",
    "    print(f\"Column '{TARGET}' not found; skipping the prediction model.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}